From d6ce6251286e80254e276da02f0dee96a8ccbecf Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 4 Jun 2026 18:30:45 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.07 Source: Original Platform --- .gitattributes | 36 + README.md | 60 + all_results.json | 11 + chat_template.jinja | 121 + config.json | 36 + generation_config.json | 9 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 11 + tokenizer.json | 3 + tokenizer_config.json | 2063 + train_results.json | 11 + trainer_state.json | 283042 ++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 285717 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..de0b074 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_all_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.07 +tags: +- generated_from_trainer +- trl +- sft +- open-r1 +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.07 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_all_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_all_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.07", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/4ohmx5b7) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 1.1.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..7ed10e7 --- /dev/null +++ b/all_results.json @@ -0,0 +1,11 @@ +{ + "ewc_loss": 0.08365151286125183, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045809714356437325, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.4756091437999748, + "train_runtime": 47393.4997, + "train_samples": 125770, + "train_samples_per_second": 7.961, + "train_steps_per_second": 0.498 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..06df27b --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..50f6077 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..f1ae600 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff58530b11721f3f27ad1c3c42cc1171fab4dba53705d2140141e2c08521fd54 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..8b37f79 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ad9a52e6507f30ecf4e81efb1152e67ed39edf2a53a5dd5fd497de7c46806d +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..9c85208 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4594e411a7e5910ffb5742f35f8ab1dc36df36606e6093a5d1c3b7526f14934 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..28dec21 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c34b20ef6bd2824778c0828091d806ab2274b295221379a2d5edd6681a69ca7 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9d4773c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3beeacc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..7ed10e7 --- /dev/null +++ b/train_results.json @@ -0,0 +1,11 @@ +{ + "ewc_loss": 0.08365151286125183, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045809714356437325, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.4756091437999748, + "train_runtime": 47393.4997, + "train_samples": 125770, + "train_samples_per_second": 7.961, + "train_steps_per_second": 0.498 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..8fdf2a7 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,283042 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23583, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012721027859051011, + "ewc_loss": 0.0, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 0.0, + "grad_norm": 4.83560848236084, + "learning_rate": 0.0, + "loss": 0.7982, + "mean_token_accuracy": 0.7762961387634277, + "num_tokens": 38493.0, + "step": 1 + }, + { + "epoch": 0.00025442055718102023, + "ewc_loss": 0.0, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 0.0, + "grad_norm": 4.588322162628174, + "learning_rate": 4.2390843577787196e-10, + "loss": 0.8329, + "mean_token_accuracy": 0.765798807144165, + "num_tokens": 80419.0, + "step": 2 + }, + { + "epoch": 0.0003816308357715303, + "ewc_loss": 1.9855799066026934e-15, + "ewc_loss_diag": 9.486769009248164e-19, + "ewc_loss_parallel": 1.0349515906599134e-17, + "grad_norm": 4.725632667541504, + "learning_rate": 8.478168715557439e-10, + "loss": 0.7225, + "mean_token_accuracy": 0.7959207892417908, + "num_tokens": 118717.0, + "step": 3 + }, + { + "epoch": 0.0005088411143620405, + "ewc_loss": 1.1824178788866213e-13, + "ewc_loss_diag": 7.676151381197371e-17, + "ewc_loss_parallel": 4.141435511399689e-16, + "grad_norm": 5.367116928100586, + "learning_rate": 1.271725307333616e-09, + "loss": 0.8139, + "mean_token_accuracy": 0.7712171077728271, + "num_tokens": 150155.0, + "step": 4 + }, + { + "epoch": 0.0006360513929525506, + "ewc_loss": 2.6445152491449964e-12, + "ewc_loss_diag": 2.3314683517128287e-15, + "ewc_loss_parallel": 3.139351859661509e-15, + "grad_norm": 4.300771236419678, + "learning_rate": 1.6956337431114878e-09, + "loss": 0.7919, + "mean_token_accuracy": 0.7750316262245178, + "num_tokens": 193616.0, + "step": 5 + }, + { + "epoch": 0.0007632616715430606, + "ewc_loss": 9.669557689273578e-12, + "ewc_loss_diag": 7.93809462606987e-15, + "ewc_loss_parallel": 1.7114791978382546e-14, + "grad_norm": 5.208383083343506, + "learning_rate": 2.1195421788893596e-09, + "loss": 0.7895, + "mean_token_accuracy": 0.7781112790107727, + "num_tokens": 227640.0, + "step": 6 + }, + { + "epoch": 0.0008904719501335708, + "ewc_loss": 1.932299632800838e-11, + "ewc_loss_diag": 1.5987211554602254e-14, + "ewc_loss_parallel": 3.293152290575682e-14, + "grad_norm": 4.832429885864258, + "learning_rate": 2.543450614667232e-09, + "loss": 0.8161, + "mean_token_accuracy": 0.7750434875488281, + "num_tokens": 265114.0, + "step": 7 + }, + { + "epoch": 0.001017682228724081, + "ewc_loss": 6.757972653703703e-11, + "ewc_loss_diag": 5.284661597215745e-14, + "ewc_loss_parallel": 1.482903562314078e-13, + "grad_norm": 4.941293716430664, + "learning_rate": 2.967359050445104e-09, + "loss": 0.7585, + "mean_token_accuracy": 0.7888950705528259, + "num_tokens": 299865.0, + "step": 8 + }, + { + "epoch": 0.001144892507314591, + "ewc_loss": 1.0190647631302951e-10, + "ewc_loss_diag": 7.860379014346108e-14, + "ewc_loss_parallel": 2.323518710333622e-13, + "grad_norm": 4.572570323944092, + "learning_rate": 3.3912674862229757e-09, + "loss": 0.8143, + "mean_token_accuracy": 0.773809552192688, + "num_tokens": 342063.0, + "step": 9 + }, + { + "epoch": 0.0012721027859051012, + "ewc_loss": 2.383645225645381e-10, + "ewc_loss_diag": 1.9451107391432743e-13, + "ewc_loss_parallel": 4.373265801143472e-13, + "grad_norm": 5.459090709686279, + "learning_rate": 3.815175922000847e-09, + "loss": 0.8641, + "mean_token_accuracy": 0.7635496854782104, + "num_tokens": 374864.0, + "step": 10 + }, + { + "epoch": 0.0013993130644956112, + "ewc_loss": 4.768296868462585e-10, + "ewc_loss_diag": 3.4638958368304884e-13, + "ewc_loss_parallel": 1.31221682825311e-12, + "grad_norm": 4.299531936645508, + "learning_rate": 4.239084357778719e-09, + "loss": 0.7766, + "mean_token_accuracy": 0.7750029563903809, + "num_tokens": 416605.0, + "step": 11 + }, + { + "epoch": 0.0015265233430861213, + "ewc_loss": 6.561117205095002e-10, + "ewc_loss_diag": 4.796163466380676e-13, + "ewc_loss_parallel": 1.7589851016713842e-12, + "grad_norm": 5.462878704071045, + "learning_rate": 4.662992793556591e-09, + "loss": 0.8345, + "mean_token_accuracy": 0.7690281867980957, + "num_tokens": 448798.0, + "step": 12 + }, + { + "epoch": 0.0016537336216766315, + "ewc_loss": 8.145344398968746e-10, + "ewc_loss_diag": 5.897504706808832e-13, + "ewc_loss_parallel": 2.2518185233133536e-12, + "grad_norm": 5.437325954437256, + "learning_rate": 5.086901229334464e-09, + "loss": 0.8736, + "mean_token_accuracy": 0.7538779973983765, + "num_tokens": 480084.0, + "step": 13 + }, + { + "epoch": 0.0017809439002671415, + "ewc_loss": 2.5921220725422245e-09, + "ewc_loss_diag": 1.9042545318370685e-12, + "ewc_loss_parallel": 6.858212782401285e-12, + "grad_norm": 4.376641273498535, + "learning_rate": 5.510809665112336e-09, + "loss": 0.8057, + "mean_token_accuracy": 0.7796688079833984, + "num_tokens": 524543.0, + "step": 14 + }, + { + "epoch": 0.0019081541788576518, + "ewc_loss": 3.750125543433569e-09, + "ewc_loss_diag": 2.7000623958883807e-12, + "ewc_loss_parallel": 1.043469564226962e-11, + "grad_norm": 4.793035507202148, + "learning_rate": 5.934718100890208e-09, + "loss": 0.7517, + "mean_token_accuracy": 0.7885146737098694, + "num_tokens": 563314.0, + "step": 15 + }, + { + "epoch": 0.002035364457448162, + "ewc_loss": 4.241362816514993e-09, + "ewc_loss_diag": 2.9842794901924208e-12, + "ewc_loss_parallel": 1.2582201773525536e-11, + "grad_norm": 5.064942359924316, + "learning_rate": 6.3586265366680796e-09, + "loss": 0.8422, + "mean_token_accuracy": 0.7627540826797485, + "num_tokens": 598421.0, + "step": 16 + }, + { + "epoch": 0.002162574736038672, + "ewc_loss": 5.301057814932619e-09, + "ewc_loss_diag": 3.723243935382925e-12, + "ewc_loss_parallel": 1.575767467898448e-11, + "grad_norm": 5.027410984039307, + "learning_rate": 6.782534972445951e-09, + "loss": 0.8317, + "mean_token_accuracy": 0.7625238299369812, + "num_tokens": 634690.0, + "step": 17 + }, + { + "epoch": 0.002289785014629182, + "ewc_loss": 6.182084177908109e-09, + "ewc_loss_diag": 4.376943252282217e-12, + "ewc_loss_parallel": 1.8165094428646e-11, + "grad_norm": 5.124436378479004, + "learning_rate": 7.206443408223823e-09, + "loss": 0.8692, + "mean_token_accuracy": 0.7595652341842651, + "num_tokens": 674653.0, + "step": 18 + }, + { + "epoch": 0.0024169952932196924, + "ewc_loss": 2.102230389766646e-08, + "ewc_loss_diag": 1.6257217794191092e-11, + "ewc_loss_parallel": 4.724158564339831e-11, + "grad_norm": 5.05863618850708, + "learning_rate": 7.630351844001695e-09, + "loss": 0.8005, + "mean_token_accuracy": 0.7766331434249878, + "num_tokens": 708238.0, + "step": 19 + }, + { + "epoch": 0.0025442055718102024, + "ewc_loss": 2.7060927720867767e-08, + "ewc_loss_diag": 1.9895196601282805e-11, + "ewc_loss_parallel": 7.153908021528821e-11, + "grad_norm": 4.606060028076172, + "learning_rate": 8.054260279779567e-09, + "loss": 0.7824, + "mean_token_accuracy": 0.7783790826797485, + "num_tokens": 749312.0, + "step": 20 + }, + { + "epoch": 0.0026714158504007124, + "ewc_loss": 3.102434931179232e-08, + "ewc_loss_diag": 2.2396307031158358e-11, + "ewc_loss_parallel": 8.672607076931627e-11, + "grad_norm": 5.410043239593506, + "learning_rate": 8.478168715557438e-09, + "loss": 0.8526, + "mean_token_accuracy": 0.7666885852813721, + "num_tokens": 783532.0, + "step": 21 + }, + { + "epoch": 0.0027986261289912225, + "ewc_loss": 3.376129242838033e-08, + "ewc_loss_diag": 2.3987922759260982e-11, + "ewc_loss_parallel": 9.779736742654421e-11, + "grad_norm": 5.07509708404541, + "learning_rate": 8.902077151335311e-09, + "loss": 0.83, + "mean_token_accuracy": 0.7736845016479492, + "num_tokens": 817429.0, + "step": 22 + }, + { + "epoch": 0.0029258364075817325, + "ewc_loss": 3.999985054292665e-08, + "ewc_loss_diag": 2.8535396268125623e-11, + "ewc_loss_parallel": 1.1478096945127803e-10, + "grad_norm": 4.811967849731445, + "learning_rate": 9.325985587113182e-09, + "loss": 0.7635, + "mean_token_accuracy": 0.7882040739059448, + "num_tokens": 853964.0, + "step": 23 + }, + { + "epoch": 0.0030530466861722425, + "ewc_loss": 4.357457328296732e-08, + "ewc_loss_diag": 3.069544618483633e-11, + "ewc_loss_parallel": 1.2840929708435311e-10, + "grad_norm": 5.44645881652832, + "learning_rate": 9.749894022891054e-09, + "loss": 0.8274, + "mean_token_accuracy": 0.767427921295166, + "num_tokens": 885070.0, + "step": 24 + }, + { + "epoch": 0.003180256964762753, + "ewc_loss": 4.757853844239435e-08, + "ewc_loss_diag": 3.342393029015511e-11, + "ewc_loss_parallel": 1.405092708850475e-10, + "grad_norm": 4.827850818634033, + "learning_rate": 1.0173802458668929e-08, + "loss": 0.8799, + "mean_token_accuracy": 0.7512384057044983, + "num_tokens": 926893.0, + "step": 25 + }, + { + "epoch": 0.003307467243353263, + "ewc_loss": 1.4474659337793128e-07, + "ewc_loss_diag": 1.141415850725025e-10, + "ewc_loss_parallel": 3.065957487891069e-10, + "grad_norm": 4.872725486755371, + "learning_rate": 1.05977108944468e-08, + "loss": 0.8162, + "mean_token_accuracy": 0.7674546837806702, + "num_tokens": 964773.0, + "step": 26 + }, + { + "epoch": 0.003434677521943773, + "ewc_loss": 1.913322620339386e-07, + "ewc_loss_diag": 1.446096575818956e-10, + "ewc_loss_parallel": 4.697726097013799e-10, + "grad_norm": 4.701942443847656, + "learning_rate": 1.1021619330224672e-08, + "loss": 0.7687, + "mean_token_accuracy": 0.7851215600967407, + "num_tokens": 1002725.0, + "step": 27 + }, + { + "epoch": 0.003561887800534283, + "ewc_loss": 2.1538645000873657e-07, + "ewc_loss_diag": 1.5825207810848951e-10, + "ewc_loss_parallel": 5.70616109918376e-10, + "grad_norm": 4.9667744636535645, + "learning_rate": 1.1445527766002543e-08, + "loss": 0.8732, + "mean_token_accuracy": 0.7581998705863953, + "num_tokens": 1040296.0, + "step": 28 + }, + { + "epoch": 0.003689098079124793, + "ewc_loss": 2.3708440721748048e-07, + "ewc_loss_diag": 1.7189449863508344e-10, + "ewc_loss_parallel": 6.478973468837523e-10, + "grad_norm": 4.439574241638184, + "learning_rate": 1.1869436201780416e-08, + "loss": 0.7658, + "mean_token_accuracy": 0.7849854230880737, + "num_tokens": 1081711.0, + "step": 29 + }, + { + "epoch": 0.0038163083577153036, + "ewc_loss": 2.492761836947466e-07, + "ewc_loss_diag": 1.7917045624926686e-10, + "ewc_loss_parallel": 7.046226380147402e-10, + "grad_norm": 4.796191215515137, + "learning_rate": 1.2293344637558287e-08, + "loss": 0.8123, + "mean_token_accuracy": 0.7748741507530212, + "num_tokens": 1120556.0, + "step": 30 + }, + { + "epoch": 0.003943518636305814, + "ewc_loss": 2.758741288744204e-07, + "ewc_loss_diag": 1.9917933968827128e-10, + "ewc_loss_parallel": 7.65711105632505e-10, + "grad_norm": 4.776007652282715, + "learning_rate": 1.2717253073336159e-08, + "loss": 0.7642, + "mean_token_accuracy": 0.7897487282752991, + "num_tokens": 1157723.0, + "step": 31 + }, + { + "epoch": 0.004070728914896324, + "ewc_loss": 3.0169104547894676e-07, + "ewc_loss_diag": 2.1736923372372985e-10, + "ewc_loss_parallel": 8.469286938428411e-10, + "grad_norm": 4.686396598815918, + "learning_rate": 1.314116150911403e-08, + "loss": 0.8101, + "mean_token_accuracy": 0.7738462686538696, + "num_tokens": 1197879.0, + "step": 32 + }, + { + "epoch": 0.004197939193486834, + "ewc_loss": 3.2202802913161577e-07, + "ewc_loss_diag": 2.3101165425032377e-10, + "ewc_loss_parallel": 9.1060026186085e-10, + "grad_norm": 4.733565330505371, + "learning_rate": 1.3565069944891903e-08, + "loss": 0.8106, + "mean_token_accuracy": 0.7749589681625366, + "num_tokens": 1237342.0, + "step": 33 + }, + { + "epoch": 0.004325149472077344, + "ewc_loss": 3.372846038018906e-07, + "ewc_loss_diag": 2.4010660126805305e-10, + "ewc_loss_parallel": 9.700338310381085e-10, + "grad_norm": 4.4836249351501465, + "learning_rate": 1.3988978380669775e-08, + "loss": 0.782, + "mean_token_accuracy": 0.7824907302856445, + "num_tokens": 1280197.0, + "step": 34 + }, + { + "epoch": 0.004452359750667854, + "ewc_loss": 4.0271902435051743e-07, + "ewc_loss_diag": 2.964952727779746e-10, + "ewc_loss_parallel": 1.0655842874740529e-09, + "grad_norm": 4.635850429534912, + "learning_rate": 1.4412886816447646e-08, + "loss": 0.7642, + "mean_token_accuracy": 0.7851548194885254, + "num_tokens": 1318625.0, + "step": 35 + }, + { + "epoch": 0.004579570029258364, + "ewc_loss": 1.023570803226903e-06, + "ewc_loss_diag": 8.149072527885437e-10, + "ewc_loss_parallel": 2.0773225362091807e-09, + "grad_norm": 4.755696773529053, + "learning_rate": 1.4836795252225519e-08, + "loss": 0.8217, + "mean_token_accuracy": 0.7722973823547363, + "num_tokens": 1356868.0, + "step": 36 + }, + { + "epoch": 0.004706780307848874, + "ewc_loss": 1.3115402452967828e-06, + "ewc_loss_diag": 1.0040821507573128e-09, + "ewc_loss_parallel": 3.057119224436633e-09, + "grad_norm": 4.797682285308838, + "learning_rate": 1.526070368800339e-08, + "loss": 0.7934, + "mean_token_accuracy": 0.7809262275695801, + "num_tokens": 1394696.0, + "step": 37 + }, + { + "epoch": 0.004833990586439385, + "ewc_loss": 1.4899515008437447e-06, + "ewc_loss_diag": 1.1204974725842476e-09, + "ewc_loss_parallel": 3.7236447258948147e-09, + "grad_norm": 4.345782279968262, + "learning_rate": 1.5684612123781262e-08, + "loss": 0.7826, + "mean_token_accuracy": 0.7827894687652588, + "num_tokens": 1438738.0, + "step": 38 + }, + { + "epoch": 0.004961200865029895, + "ewc_loss": 1.6242638594121672e-06, + "ewc_loss_diag": 1.2078089639544487e-09, + "ewc_loss_parallel": 4.172697742887976e-09, + "grad_norm": 4.724465370178223, + "learning_rate": 1.6108520559559135e-08, + "loss": 0.7484, + "mean_token_accuracy": 0.7925288677215576, + "num_tokens": 1475089.0, + "step": 39 + }, + { + "epoch": 0.005088411143620405, + "ewc_loss": 1.7194373640450067e-06, + "ewc_loss_diag": 1.2660166248679161e-09, + "ewc_loss_parallel": 4.5283865546252855e-09, + "grad_norm": 4.700695991516113, + "learning_rate": 1.6532428995337004e-08, + "loss": 0.8084, + "mean_token_accuracy": 0.7745454907417297, + "num_tokens": 1514566.0, + "step": 40 + }, + { + "epoch": 0.005215621422210915, + "ewc_loss": 1.8066916709358338e-06, + "ewc_loss_diag": 1.3242242857813835e-09, + "ewc_loss_parallel": 4.804882713926872e-09, + "grad_norm": 4.9158616065979, + "learning_rate": 1.6956337431114877e-08, + "loss": 0.8231, + "mean_token_accuracy": 0.7740737199783325, + "num_tokens": 1552560.0, + "step": 41 + }, + { + "epoch": 0.005342831700801425, + "ewc_loss": 1.8735834146355046e-06, + "ewc_loss_diag": 1.367880031466484e-09, + "ewc_loss_parallel": 5.0267652262903084e-09, + "grad_norm": 5.3616156578063965, + "learning_rate": 1.738024586689275e-08, + "loss": 0.8007, + "mean_token_accuracy": 0.7770598530769348, + "num_tokens": 1584759.0, + "step": 42 + }, + { + "epoch": 0.005470041979391935, + "ewc_loss": 1.970255652850028e-06, + "ewc_loss_diag": 1.4479155652225018e-09, + "ewc_loss_parallel": 5.248430134940918e-09, + "grad_norm": 4.806038856506348, + "learning_rate": 1.7804154302670622e-08, + "loss": 0.8018, + "mean_token_accuracy": 0.775444507598877, + "num_tokens": 1621825.0, + "step": 43 + }, + { + "epoch": 0.005597252257982445, + "ewc_loss": 2.148204657714814e-06, + "ewc_loss_diag": 1.5861587598919868e-09, + "ewc_loss_parallel": 5.612309728064702e-09, + "grad_norm": 4.728168487548828, + "learning_rate": 1.8228062738448494e-08, + "loss": 0.8274, + "mean_token_accuracy": 0.7678640484809875, + "num_tokens": 1662946.0, + "step": 44 + }, + { + "epoch": 0.005724462536572955, + "ewc_loss": 2.2778474431106588e-06, + "ewc_loss_diag": 1.6807462088763714e-09, + "ewc_loss_parallel": 5.940163028128609e-09, + "grad_norm": 4.603828430175781, + "learning_rate": 1.8651971174226364e-08, + "loss": 0.8153, + "mean_token_accuracy": 0.7734089493751526, + "num_tokens": 1699433.0, + "step": 45 + }, + { + "epoch": 0.005851672815163465, + "ewc_loss": 2.365472482779296e-06, + "ewc_loss_diag": 1.7462298274040222e-09, + "ewc_loss_parallel": 6.220366444154024e-09, + "grad_norm": 4.306037425994873, + "learning_rate": 1.9075879610004236e-08, + "loss": 0.7464, + "mean_token_accuracy": 0.7883710861206055, + "num_tokens": 1742812.0, + "step": 46 + }, + { + "epoch": 0.005978883093753975, + "ewc_loss": 2.467455942678498e-06, + "ewc_loss_diag": 1.8189894035458565e-09, + "ewc_loss_parallel": 6.495142201856652e-09, + "grad_norm": 4.823169708251953, + "learning_rate": 1.949978804578211e-08, + "loss": 0.8016, + "mean_token_accuracy": 0.7783809900283813, + "num_tokens": 1778725.0, + "step": 47 + }, + { + "epoch": 0.006106093372344485, + "ewc_loss": 2.5501235541014466e-06, + "ewc_loss_diag": 1.862645149230957e-09, + "ewc_loss_parallel": 6.874782965127224e-09, + "grad_norm": 4.773406982421875, + "learning_rate": 1.9923696481559985e-08, + "loss": 0.8478, + "mean_token_accuracy": 0.7623263001441956, + "num_tokens": 1816592.0, + "step": 48 + }, + { + "epoch": 0.006233303650934996, + "ewc_loss": 3.4961453820869792e-06, + "ewc_loss_diag": 2.5902409106492996e-09, + "ewc_loss_parallel": 9.033432668559271e-09, + "grad_norm": 4.338850498199463, + "learning_rate": 2.0347604917337857e-08, + "loss": 0.7396, + "mean_token_accuracy": 0.7900300621986389, + "num_tokens": 1859907.0, + "step": 49 + }, + { + "epoch": 0.006360513929525506, + "ewc_loss": 6.542772098327987e-06, + "ewc_loss_diag": 4.976755008101463e-09, + "ewc_loss_parallel": 1.5657844798511178e-08, + "grad_norm": 4.5695343017578125, + "learning_rate": 2.0771513353115727e-08, + "loss": 0.729, + "mean_token_accuracy": 0.799554169178009, + "num_tokens": 1896627.0, + "step": 50 + }, + { + "epoch": 0.006487724208116016, + "ewc_loss": 8.659815648570657e-06, + "ewc_loss_diag": 6.548361852765083e-09, + "ewc_loss_parallel": 2.1033041974760636e-08, + "grad_norm": 4.844273567199707, + "learning_rate": 2.11954217888936e-08, + "loss": 0.8443, + "mean_token_accuracy": 0.7668993473052979, + "num_tokens": 1934041.0, + "step": 51 + }, + { + "epoch": 0.006614934486706526, + "ewc_loss": 9.651634172769263e-06, + "ewc_loss_diag": 7.2177499532699585e-09, + "ewc_loss_parallel": 2.4394722686338355e-08, + "grad_norm": 4.446019649505615, + "learning_rate": 2.1619330224671472e-08, + "loss": 0.7734, + "mean_token_accuracy": 0.7844105958938599, + "num_tokens": 1976482.0, + "step": 52 + }, + { + "epoch": 0.006742144765297036, + "ewc_loss": 1.0101851330546197e-05, + "ewc_loss_diag": 7.450580596923828e-09, + "ewc_loss_parallel": 2.651270847309206e-08, + "grad_norm": 5.182474136352539, + "learning_rate": 2.2043238660449344e-08, + "loss": 0.8028, + "mean_token_accuracy": 0.7724302411079407, + "num_tokens": 2009224.0, + "step": 53 + }, + { + "epoch": 0.006869355043887546, + "ewc_loss": 1.0627049960021395e-05, + "ewc_loss_diag": 7.799826562404633e-09, + "ewc_loss_parallel": 2.8188416933971894e-08, + "grad_norm": 4.7552876472473145, + "learning_rate": 2.2467147096227214e-08, + "loss": 0.8473, + "mean_token_accuracy": 0.7620348334312439, + "num_tokens": 2049235.0, + "step": 54 + }, + { + "epoch": 0.006996565322478056, + "ewc_loss": 1.1055499271606095e-05, + "ewc_loss_diag": 8.09086486697197e-09, + "ewc_loss_parallel": 2.949267319252158e-08, + "grad_norm": 4.470492362976074, + "learning_rate": 2.2891055532005086e-08, + "loss": 0.8603, + "mean_token_accuracy": 0.7610819935798645, + "num_tokens": 2090260.0, + "step": 55 + }, + { + "epoch": 0.007123775601068566, + "ewc_loss": 1.1479968634375837e-05, + "ewc_loss_diag": 8.381903171539307e-09, + "ewc_loss_parallel": 3.075713550515502e-08, + "grad_norm": 4.88676118850708, + "learning_rate": 2.331496396778296e-08, + "loss": 0.8658, + "mean_token_accuracy": 0.7604718208312988, + "num_tokens": 2126686.0, + "step": 56 + }, + { + "epoch": 0.007250985879659076, + "ewc_loss": 1.1896552678081207e-05, + "ewc_loss_diag": 8.672941476106644e-09, + "ewc_loss_parallel": 3.1942743561330644e-08, + "grad_norm": 4.261096477508545, + "learning_rate": 2.373887240356083e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7847990989685059, + "num_tokens": 2171355.0, + "step": 57 + }, + { + "epoch": 0.007378196158249586, + "ewc_loss": 1.2183103535789996e-05, + "ewc_loss_diag": 8.905772119760513e-09, + "ewc_loss_parallel": 3.302011464256793e-08, + "grad_norm": 4.4738640785217285, + "learning_rate": 2.4162780839338704e-08, + "loss": 0.7407, + "mean_token_accuracy": 0.7928991317749023, + "num_tokens": 2211660.0, + "step": 58 + }, + { + "epoch": 0.007505406436840096, + "ewc_loss": 1.2337186490185559e-05, + "ewc_loss_diag": 8.96397978067398e-09, + "ewc_loss_parallel": 3.3964898449312386e-08, + "grad_norm": 5.398654937744141, + "learning_rate": 2.4586689275116573e-08, + "loss": 0.8708, + "mean_token_accuracy": 0.7595768570899963, + "num_tokens": 2244411.0, + "step": 59 + }, + { + "epoch": 0.007632616715430607, + "ewc_loss": 1.2695797522610519e-05, + "ewc_loss_diag": 9.19681042432785e-09, + "ewc_loss_parallel": 3.516682056670106e-08, + "grad_norm": 4.218489170074463, + "learning_rate": 2.5010597710894446e-08, + "loss": 0.7977, + "mean_token_accuracy": 0.7711631655693054, + "num_tokens": 2285253.0, + "step": 60 + }, + { + "epoch": 0.007759826994021117, + "ewc_loss": 1.3348882930586115e-05, + "ewc_loss_diag": 9.66247171163559e-09, + "ewc_loss_parallel": 3.692930050647192e-08, + "grad_norm": 4.36163330078125, + "learning_rate": 2.5434506146672318e-08, + "loss": 0.7623, + "mean_token_accuracy": 0.785445511341095, + "num_tokens": 2328577.0, + "step": 61 + }, + { + "epoch": 0.007887037272611627, + "ewc_loss": 1.394270748278359e-05, + "ewc_loss_diag": 1.0069925338029861e-08, + "ewc_loss_parallel": 3.869522302579753e-08, + "grad_norm": 4.564056873321533, + "learning_rate": 2.585841458245019e-08, + "loss": 0.8424, + "mean_token_accuracy": 0.7701941728591919, + "num_tokens": 2366524.0, + "step": 62 + }, + { + "epoch": 0.008014247551202136, + "ewc_loss": 1.4564695447916165e-05, + "ewc_loss_diag": 1.05355866253376e-08, + "ewc_loss_parallel": 4.01467303845493e-08, + "grad_norm": 4.076579570770264, + "learning_rate": 2.628232301822806e-08, + "loss": 0.7558, + "mean_token_accuracy": 0.7896238565444946, + "num_tokens": 2408628.0, + "step": 63 + }, + { + "epoch": 0.008141457829792647, + "ewc_loss": 1.4990411727922037e-05, + "ewc_loss_diag": 1.0826624929904938e-08, + "ewc_loss_parallel": 4.142365739312481e-08, + "grad_norm": 4.155259609222412, + "learning_rate": 2.6706231454005933e-08, + "loss": 0.7951, + "mean_token_accuracy": 0.7753005027770996, + "num_tokens": 2451800.0, + "step": 64 + }, + { + "epoch": 0.008268668108383158, + "ewc_loss": 1.522304410173092e-05, + "ewc_loss_diag": 1.0943040251731873e-08, + "ewc_loss_parallel": 4.255790031493234e-08, + "grad_norm": 5.537159442901611, + "learning_rate": 2.7130139889783805e-08, + "loss": 0.8661, + "mean_token_accuracy": 0.7551565170288086, + "num_tokens": 2481448.0, + "step": 65 + }, + { + "epoch": 0.008395878386973667, + "ewc_loss": 1.5739777154522017e-05, + "ewc_loss_diag": 1.1350493878126144e-08, + "ewc_loss_parallel": 4.4148951161560035e-08, + "grad_norm": 4.138428688049316, + "learning_rate": 2.7554048325561678e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.7754371166229248, + "num_tokens": 2526339.0, + "step": 66 + }, + { + "epoch": 0.008523088665564178, + "ewc_loss": 1.6226025763899088e-05, + "ewc_loss_diag": 1.1583324521780014e-08, + "ewc_loss_parallel": 4.6627235406049294e-08, + "grad_norm": 4.2904953956604, + "learning_rate": 2.797795676133955e-08, + "loss": 0.8179, + "mean_token_accuracy": 0.7701419591903687, + "num_tokens": 2570691.0, + "step": 67 + }, + { + "epoch": 0.008650298944154687, + "ewc_loss": 1.787598193914164e-05, + "ewc_loss_diag": 1.2631062418222427e-08, + "ewc_loss_parallel": 5.2397975736084845e-08, + "grad_norm": 4.413398265838623, + "learning_rate": 2.840186519711742e-08, + "loss": 0.8044, + "mean_token_accuracy": 0.7752468585968018, + "num_tokens": 2609207.0, + "step": 68 + }, + { + "epoch": 0.008777509222745198, + "ewc_loss": 2.8006277716485783e-05, + "ewc_loss_diag": 2.0838342607021332e-08, + "ewc_loss_parallel": 7.144652869328638e-08, + "grad_norm": 4.679084300994873, + "learning_rate": 2.8825773632895292e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.777147114276886, + "num_tokens": 2645494.0, + "step": 69 + }, + { + "epoch": 0.008904719501335707, + "ewc_loss": 4.006102972198278e-05, + "ewc_loss_diag": 3.003515303134918e-08, + "ewc_loss_parallel": 1.002029037522334e-07, + "grad_norm": 4.1691155433654785, + "learning_rate": 2.9249682068673165e-08, + "loss": 0.7494, + "mean_token_accuracy": 0.79206383228302, + "num_tokens": 2686258.0, + "step": 70 + }, + { + "epoch": 0.009031929779926218, + "ewc_loss": 4.7700497816549614e-05, + "ewc_loss_diag": 3.562308847904205e-08, + "ewc_loss_parallel": 1.2176130326224666e-07, + "grad_norm": 4.7393670082092285, + "learning_rate": 2.9673590504451037e-08, + "loss": 0.8044, + "mean_token_accuracy": 0.7774579524993896, + "num_tokens": 2720544.0, + "step": 71 + }, + { + "epoch": 0.009159140058516728, + "ewc_loss": 5.188299110159278e-05, + "ewc_loss_diag": 3.818422555923462e-08, + "ewc_loss_parallel": 1.373601890009013e-07, + "grad_norm": 4.548948287963867, + "learning_rate": 3.0097498940228907e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7858436107635498, + "num_tokens": 2758068.0, + "step": 72 + }, + { + "epoch": 0.009286350337107238, + "ewc_loss": 5.565972242038697e-05, + "ewc_loss_diag": 4.0512531995773315e-08, + "ewc_loss_parallel": 1.5128561869914847e-07, + "grad_norm": 4.76308012008667, + "learning_rate": 3.052140737600678e-08, + "loss": 0.8519, + "mean_token_accuracy": 0.7625856399536133, + "num_tokens": 2793342.0, + "step": 73 + }, + { + "epoch": 0.009413560615697748, + "ewc_loss": 5.8678851928561926e-05, + "ewc_loss_diag": 4.21423465013504e-08, + "ewc_loss_parallel": 1.6478765019201091e-07, + "grad_norm": 4.92156457901001, + "learning_rate": 3.094531581178465e-08, + "loss": 0.8887, + "mean_token_accuracy": 0.7555423974990845, + "num_tokens": 2828003.0, + "step": 74 + }, + { + "epoch": 0.009540770894288259, + "ewc_loss": 6.079998274799436e-05, + "ewc_loss_diag": 4.307366907596588e-08, + "ewc_loss_parallel": 1.764621941902078e-07, + "grad_norm": 3.982774257659912, + "learning_rate": 3.1369224247562524e-08, + "loss": 0.7638, + "mean_token_accuracy": 0.7840520143508911, + "num_tokens": 2874755.0, + "step": 75 + }, + { + "epoch": 0.00966798117287877, + "ewc_loss": 6.263772957026958e-05, + "ewc_loss_diag": 4.400499165058136e-08, + "ewc_loss_parallel": 1.8530293743879156e-07, + "grad_norm": 4.346661567687988, + "learning_rate": 3.17931326833404e-08, + "loss": 0.7864, + "mean_token_accuracy": 0.776702880859375, + "num_tokens": 2909703.0, + "step": 76 + }, + { + "epoch": 0.009795191451469279, + "ewc_loss": 6.453075184253976e-05, + "ewc_loss_diag": 4.540197551250458e-08, + "ewc_loss_parallel": 1.9231222836424422e-07, + "grad_norm": 3.8788020610809326, + "learning_rate": 3.221704111911827e-08, + "loss": 0.7159, + "mean_token_accuracy": 0.7989048957824707, + "num_tokens": 2954020.0, + "step": 77 + }, + { + "epoch": 0.00992240173005979, + "ewc_loss": 6.533507257699966e-05, + "ewc_loss_diag": 4.563480615615845e-08, + "ewc_loss_parallel": 1.9797123229636782e-07, + "grad_norm": 4.9640913009643555, + "learning_rate": 3.264094955489614e-08, + "loss": 0.8117, + "mean_token_accuracy": 0.774562418460846, + "num_tokens": 2984894.0, + "step": 78 + }, + { + "epoch": 0.010049612008650299, + "ewc_loss": 6.683988613076508e-05, + "ewc_loss_diag": 4.6566128730773926e-08, + "ewc_loss_parallel": 2.034826138697099e-07, + "grad_norm": 4.631453037261963, + "learning_rate": 3.306485799067401e-08, + "loss": 0.7181, + "mean_token_accuracy": 0.7954184412956238, + "num_tokens": 3017773.0, + "step": 79 + }, + { + "epoch": 0.01017682228724081, + "ewc_loss": 6.706453859806061e-05, + "ewc_loss_diag": 4.6333298087120056e-08, + "ewc_loss_parallel": 2.081133487763509e-07, + "grad_norm": 3.9603734016418457, + "learning_rate": 3.348876642645188e-08, + "loss": 0.7174, + "mean_token_accuracy": 0.7981616854667664, + "num_tokens": 3059739.0, + "step": 80 + }, + { + "epoch": 0.010304032565831319, + "ewc_loss": 6.734005000907928e-05, + "ewc_loss_diag": 4.6333298087120056e-08, + "ewc_loss_parallel": 2.108684782342607e-07, + "grad_norm": 4.53375768661499, + "learning_rate": 3.391267486222975e-08, + "loss": 0.8096, + "mean_token_accuracy": 0.7719967365264893, + "num_tokens": 3094910.0, + "step": 81 + }, + { + "epoch": 0.01043124284442183, + "ewc_loss": 6.810598279116675e-05, + "ewc_loss_diag": 4.6798959374427795e-08, + "ewc_loss_parallel": 2.1375942083068367e-07, + "grad_norm": 4.276072025299072, + "learning_rate": 3.4336583298007626e-08, + "loss": 0.8137, + "mean_token_accuracy": 0.7739881277084351, + "num_tokens": 3133884.0, + "step": 82 + }, + { + "epoch": 0.010558453123012339, + "ewc_loss": 6.759059760952368e-05, + "ewc_loss_diag": 4.6100467443466187e-08, + "ewc_loss_parallel": 2.1575810649210325e-07, + "grad_norm": 4.839631080627441, + "learning_rate": 3.47604917337855e-08, + "loss": 0.8461, + "mean_token_accuracy": 0.760955274105072, + "num_tokens": 3166755.0, + "step": 83 + }, + { + "epoch": 0.01068566340160285, + "ewc_loss": 6.808708712924272e-05, + "ewc_loss_diag": 4.6333298087120056e-08, + "ewc_loss_parallel": 2.183388545518028e-07, + "grad_norm": 4.406005382537842, + "learning_rate": 3.518440016956337e-08, + "loss": 0.8151, + "mean_token_accuracy": 0.7706390023231506, + "num_tokens": 3204825.0, + "step": 84 + }, + { + "epoch": 0.010812873680193359, + "ewc_loss": 6.996688898652792e-05, + "ewc_loss_diag": 4.7730281949043274e-08, + "ewc_loss_parallel": 2.2283168732428749e-07, + "grad_norm": 4.3916335105896, + "learning_rate": 3.5608308605341244e-08, + "loss": 0.8504, + "mean_token_accuracy": 0.7624970078468323, + "num_tokens": 3244036.0, + "step": 85 + }, + { + "epoch": 0.01094008395878387, + "ewc_loss": 7.145719428081065e-05, + "ewc_loss_diag": 4.866160452365875e-08, + "ewc_loss_parallel": 2.281980329144062e-07, + "grad_norm": 4.195014476776123, + "learning_rate": 3.6032217041119116e-08, + "loss": 0.8377, + "mean_token_accuracy": 0.7668390274047852, + "num_tokens": 3285969.0, + "step": 86 + }, + { + "epoch": 0.01106729423737438, + "ewc_loss": 7.312887464649975e-05, + "ewc_loss_diag": 4.98257577419281e-08, + "ewc_loss_parallel": 2.3299389795283787e-07, + "grad_norm": 4.146434783935547, + "learning_rate": 3.645612547689699e-08, + "loss": 0.8091, + "mean_token_accuracy": 0.7747771739959717, + "num_tokens": 3327648.0, + "step": 87 + }, + { + "epoch": 0.01119450451596489, + "ewc_loss": 7.524125976487994e-05, + "ewc_loss_diag": 5.145557224750519e-08, + "ewc_loss_parallel": 2.3742848043184495e-07, + "grad_norm": 4.274801254272461, + "learning_rate": 3.6880033912674855e-08, + "loss": 0.8122, + "mean_token_accuracy": 0.7726479768753052, + "num_tokens": 3367399.0, + "step": 88 + }, + { + "epoch": 0.0113217147945554, + "ewc_loss": 7.68245809013024e-05, + "ewc_loss_diag": 5.2619725465774536e-08, + "ewc_loss_parallel": 2.413407571566495e-07, + "grad_norm": 4.482771873474121, + "learning_rate": 3.730394234845273e-08, + "loss": 0.7952, + "mean_token_accuracy": 0.7794363498687744, + "num_tokens": 3405402.0, + "step": 89 + }, + { + "epoch": 0.01144892507314591, + "ewc_loss": 7.914105663076043e-05, + "ewc_loss_diag": 5.448237061500549e-08, + "ewc_loss_parallel": 2.4543197696402785e-07, + "grad_norm": 4.5832977294921875, + "learning_rate": 3.77278507842306e-08, + "loss": 0.8787, + "mean_token_accuracy": 0.7541581392288208, + "num_tokens": 3441791.0, + "step": 90 + }, + { + "epoch": 0.01157613535173642, + "ewc_loss": 8.053273631958291e-05, + "ewc_loss_diag": 5.564652383327484e-08, + "ewc_loss_parallel": 2.4981207502605685e-07, + "grad_norm": 4.507504463195801, + "learning_rate": 3.815175922000847e-08, + "loss": 0.8383, + "mean_token_accuracy": 0.7685991525650024, + "num_tokens": 3480151.0, + "step": 91 + }, + { + "epoch": 0.01170334563032693, + "ewc_loss": 8.245620119851083e-05, + "ewc_loss_diag": 5.704350769519806e-08, + "ewc_loss_parallel": 2.5474156473137555e-07, + "grad_norm": 4.397891998291016, + "learning_rate": 3.8575667655786345e-08, + "loss": 0.8017, + "mean_token_accuracy": 0.7760173678398132, + "num_tokens": 3516867.0, + "step": 92 + }, + { + "epoch": 0.01183055590891744, + "ewc_loss": 8.467599400319159e-05, + "ewc_loss_diag": 5.8673322200775146e-08, + "ewc_loss_parallel": 2.60250232031467e-07, + "grad_norm": 3.980140447616577, + "learning_rate": 3.899957609156422e-08, + "loss": 0.7777, + "mean_token_accuracy": 0.7824535965919495, + "num_tokens": 3557466.0, + "step": 93 + }, + { + "epoch": 0.01195776618750795, + "ewc_loss": 8.790750143816695e-05, + "ewc_loss_diag": 6.100162863731384e-08, + "ewc_loss_parallel": 2.687234257336968e-07, + "grad_norm": 4.19019079208374, + "learning_rate": 3.94234845273421e-08, + "loss": 0.8072, + "mean_token_accuracy": 0.7729145288467407, + "num_tokens": 3596009.0, + "step": 94 + }, + { + "epoch": 0.012084976466098461, + "ewc_loss": 9.558361489325762e-05, + "ewc_loss_diag": 6.658956408500671e-08, + "ewc_loss_parallel": 2.8826408993154473e-07, + "grad_norm": 4.369713306427002, + "learning_rate": 3.984739296311997e-08, + "loss": 0.7991, + "mean_token_accuracy": 0.7743916511535645, + "num_tokens": 3631582.0, + "step": 95 + }, + { + "epoch": 0.01221218674468897, + "ewc_loss": 0.00011585335596464574, + "ewc_loss_diag": 8.149072527885437e-08, + "ewc_loss_parallel": 3.431420338984026e-07, + "grad_norm": 4.273683071136475, + "learning_rate": 4.027130139889784e-08, + "loss": 0.8251, + "mean_token_accuracy": 0.7679690718650818, + "num_tokens": 3672855.0, + "step": 96 + }, + { + "epoch": 0.012339397023279481, + "ewc_loss": 0.00014427435235120356, + "ewc_loss_diag": 1.0011717677116394e-07, + "ewc_loss_parallel": 4.4138542421023885e-07, + "grad_norm": 4.02767276763916, + "learning_rate": 4.0695209834675715e-08, + "loss": 0.816, + "mean_token_accuracy": 0.7728443145751953, + "num_tokens": 3714100.0, + "step": 97 + }, + { + "epoch": 0.012466607301869992, + "ewc_loss": 0.00017336788005195558, + "ewc_loss_diag": 1.1827796697616577e-07, + "ewc_loss_parallel": 5.511226390808588e-07, + "grad_norm": 4.152449131011963, + "learning_rate": 4.111911827045358e-08, + "loss": 0.767, + "mean_token_accuracy": 0.7853162884712219, + "num_tokens": 3752834.0, + "step": 98 + }, + { + "epoch": 0.012593817580460501, + "ewc_loss": 0.00019388642976991832, + "ewc_loss_diag": 1.2852251529693604e-07, + "ewc_loss_parallel": 6.514040364891116e-07, + "grad_norm": 4.157062530517578, + "learning_rate": 4.154302670623145e-08, + "loss": 0.7773, + "mean_token_accuracy": 0.7813180088996887, + "num_tokens": 3791513.0, + "step": 99 + }, + { + "epoch": 0.012721027859051012, + "ewc_loss": 0.00020977425447199494, + "ewc_loss_diag": 1.3597309589385986e-07, + "ewc_loss_parallel": 7.339882586165913e-07, + "grad_norm": 4.580276966094971, + "learning_rate": 4.1966935142009326e-08, + "loss": 0.8004, + "mean_token_accuracy": 0.7746337652206421, + "num_tokens": 3825085.0, + "step": 100 + }, + { + "epoch": 0.012848238137641521, + "ewc_loss": 0.00021987553918734193, + "ewc_loss_diag": 1.387670636177063e-07, + "ewc_loss_parallel": 8.063908012445609e-07, + "grad_norm": 3.996941089630127, + "learning_rate": 4.23908435777872e-08, + "loss": 0.8011, + "mean_token_accuracy": 0.7764537930488586, + "num_tokens": 3866492.0, + "step": 101 + }, + { + "epoch": 0.012975448416232032, + "ewc_loss": 0.00022560631623491645, + "ewc_loss_diag": 1.3969838619232178e-07, + "ewc_loss_parallel": 8.636985739940428e-07, + "grad_norm": 4.345249652862549, + "learning_rate": 4.281475201356507e-08, + "loss": 0.8281, + "mean_token_accuracy": 0.7636137008666992, + "num_tokens": 3904053.0, + "step": 102 + }, + { + "epoch": 0.013102658694822541, + "ewc_loss": 0.00023312450502999127, + "ewc_loss_diag": 1.424923539161682e-07, + "ewc_loss_parallel": 9.102702165364462e-07, + "grad_norm": 4.330751895904541, + "learning_rate": 4.3238660449342943e-08, + "loss": 0.7366, + "mean_token_accuracy": 0.7949707508087158, + "num_tokens": 3935118.0, + "step": 103 + }, + { + "epoch": 0.013229868973413052, + "ewc_loss": 0.00024083621974568814, + "ewc_loss_diag": 1.4621764421463013e-07, + "ewc_loss_parallel": 9.49240529735107e-07, + "grad_norm": 3.9287688732147217, + "learning_rate": 4.3662568885120816e-08, + "loss": 0.7871, + "mean_token_accuracy": 0.7764948010444641, + "num_tokens": 3977899.0, + "step": 104 + }, + { + "epoch": 0.013357079252003561, + "ewc_loss": 0.00024264049716293812, + "ewc_loss_diag": 1.4528632164001465e-07, + "ewc_loss_parallel": 9.768200470716693e-07, + "grad_norm": 3.9107720851898193, + "learning_rate": 4.408647732089869e-08, + "loss": 0.8072, + "mean_token_accuracy": 0.7738487124443054, + "num_tokens": 4019416.0, + "step": 105 + }, + { + "epoch": 0.013484289530594072, + "ewc_loss": 0.0002458117378409952, + "ewc_loss_diag": 1.4621764421463013e-07, + "ewc_loss_parallel": 9.989955742639722e-07, + "grad_norm": 3.867074489593506, + "learning_rate": 4.451038575667656e-08, + "loss": 0.7964, + "mean_token_accuracy": 0.7729198932647705, + "num_tokens": 4063887.0, + "step": 106 + }, + { + "epoch": 0.013611499809184581, + "ewc_loss": 0.00024652312276884913, + "ewc_loss_diag": 1.4528632164001465e-07, + "ewc_loss_parallel": 1.0156461485166801e-06, + "grad_norm": 3.972304582595825, + "learning_rate": 4.493429419245443e-08, + "loss": 0.8271, + "mean_token_accuracy": 0.7675241231918335, + "num_tokens": 4101309.0, + "step": 107 + }, + { + "epoch": 0.013738710087775092, + "ewc_loss": 0.0002479146933183074, + "ewc_loss_diag": 1.4528632164001465e-07, + "ewc_loss_parallel": 1.0295619858879945e-06, + "grad_norm": 4.093613147735596, + "learning_rate": 4.53582026282323e-08, + "loss": 0.8116, + "mean_token_accuracy": 0.7710212469100952, + "num_tokens": 4136669.0, + "step": 108 + }, + { + "epoch": 0.013865920366365603, + "ewc_loss": 0.00024813864729367197, + "ewc_loss_diag": 1.4435499906539917e-07, + "ewc_loss_parallel": 1.0413382369733881e-06, + "grad_norm": 3.6251535415649414, + "learning_rate": 4.578211106401017e-08, + "loss": 0.7427, + "mean_token_accuracy": 0.7901040315628052, + "num_tokens": 4178583.0, + "step": 109 + }, + { + "epoch": 0.013993130644956112, + "ewc_loss": 0.0002496081870049238, + "ewc_loss_diag": 1.4528632164001465e-07, + "ewc_loss_parallel": 1.0464967772350064e-06, + "grad_norm": 3.6829071044921875, + "learning_rate": 4.6206019499788045e-08, + "loss": 0.7735, + "mean_token_accuracy": 0.7771437168121338, + "num_tokens": 4219552.0, + "step": 110 + }, + { + "epoch": 0.014120340923546623, + "ewc_loss": 0.0002518290129955858, + "ewc_loss_diag": 1.471489667892456e-07, + "ewc_loss_parallel": 1.04963169178518e-06, + "grad_norm": 3.7328574657440186, + "learning_rate": 4.662992793556592e-08, + "loss": 0.766, + "mean_token_accuracy": 0.7853384017944336, + "num_tokens": 4264487.0, + "step": 111 + }, + { + "epoch": 0.014247551202137132, + "ewc_loss": 0.0002540494897402823, + "ewc_loss_diag": 1.4901161193847656e-07, + "ewc_loss_parallel": 1.0527630820433842e-06, + "grad_norm": 3.793107032775879, + "learning_rate": 4.705383637134379e-08, + "loss": 0.7322, + "mean_token_accuracy": 0.7902196645736694, + "num_tokens": 4303015.0, + "step": 112 + }, + { + "epoch": 0.014374761480727643, + "ewc_loss": 0.000256156490650028, + "ewc_loss_diag": 1.5087425708770752e-07, + "ewc_loss_parallel": 1.0547595366006135e-06, + "grad_norm": 3.7428476810455322, + "learning_rate": 4.747774480712166e-08, + "loss": 0.7871, + "mean_token_accuracy": 0.7777751684188843, + "num_tokens": 4345446.0, + "step": 113 + }, + { + "epoch": 0.014501971759318152, + "ewc_loss": 0.0002588724310044199, + "ewc_loss_diag": 1.5366822481155396e-07, + "ewc_loss_parallel": 1.0533086651776102e-06, + "grad_norm": 3.865741014480591, + "learning_rate": 4.7901653242899535e-08, + "loss": 0.7395, + "mean_token_accuracy": 0.7910842895507812, + "num_tokens": 4383854.0, + "step": 114 + }, + { + "epoch": 0.014629182037908663, + "ewc_loss": 0.00026040489319711924, + "ewc_loss_diag": 1.555308699607849e-07, + "ewc_loss_parallel": 1.0495599553905777e-06, + "grad_norm": 3.922743558883667, + "learning_rate": 4.832556167867741e-08, + "loss": 0.8082, + "mean_token_accuracy": 0.7732766270637512, + "num_tokens": 4420750.0, + "step": 115 + }, + { + "epoch": 0.014756392316499172, + "ewc_loss": 0.00026090798201039433, + "ewc_loss_diag": 1.564621925354004e-07, + "ewc_loss_parallel": 1.0450540912643191e-06, + "grad_norm": 4.03597354888916, + "learning_rate": 4.8749470114455274e-08, + "loss": 0.8051, + "mean_token_accuracy": 0.7733260989189148, + "num_tokens": 4458761.0, + "step": 116 + }, + { + "epoch": 0.014883602595089683, + "ewc_loss": 0.00026438161148689687, + "ewc_loss_diag": 1.601874828338623e-07, + "ewc_loss_parallel": 1.0416432587589952e-06, + "grad_norm": 3.8304591178894043, + "learning_rate": 4.9173378550233146e-08, + "loss": 0.7656, + "mean_token_accuracy": 0.7809469699859619, + "num_tokens": 4496547.0, + "step": 117 + }, + { + "epoch": 0.015010812873680193, + "ewc_loss": 0.00026703940238803625, + "ewc_loss_diag": 1.6298145055770874e-07, + "ewc_loss_parallel": 1.0396108791610459e-06, + "grad_norm": 3.8840012550354004, + "learning_rate": 4.959728698601102e-08, + "loss": 0.7633, + "mean_token_accuracy": 0.789857029914856, + "num_tokens": 4533357.0, + "step": 118 + }, + { + "epoch": 0.015138023152270703, + "ewc_loss": 0.0002700605255085975, + "ewc_loss_diag": 1.6577541828155518e-07, + "ewc_loss_parallel": 1.0412120445835171e-06, + "grad_norm": 3.7084498405456543, + "learning_rate": 5.002119542178889e-08, + "loss": 0.7378, + "mean_token_accuracy": 0.7909243106842041, + "num_tokens": 4573570.0, + "step": 119 + }, + { + "epoch": 0.015265233430861214, + "ewc_loss": 0.00027426000451669097, + "ewc_loss_diag": 1.695007085800171e-07, + "ewc_loss_parallel": 1.045059889293043e-06, + "grad_norm": 3.9661381244659424, + "learning_rate": 5.0445103857566764e-08, + "loss": 0.7929, + "mean_token_accuracy": 0.7752392292022705, + "num_tokens": 4613195.0, + "step": 120 + }, + { + "epoch": 0.015392443709451724, + "ewc_loss": 0.00027488195337355137, + "ewc_loss_diag": 1.695007085800171e-07, + "ewc_loss_parallel": 1.0512792414374417e-06, + "grad_norm": 3.573425054550171, + "learning_rate": 5.0869012293344637e-08, + "loss": 0.7133, + "mean_token_accuracy": 0.7975232601165771, + "num_tokens": 4657230.0, + "step": 121 + }, + { + "epoch": 0.015519653988042234, + "ewc_loss": 0.0002830027951858938, + "ewc_loss_diag": 1.7695128917694092e-07, + "ewc_loss_parallel": 1.0561936960584717e-06, + "grad_norm": 3.7387125492095947, + "learning_rate": 5.129292072912251e-08, + "loss": 0.7871, + "mean_token_accuracy": 0.7805429697036743, + "num_tokens": 4701436.0, + "step": 122 + }, + { + "epoch": 0.015646864266632744, + "ewc_loss": 0.00028469774406403303, + "ewc_loss_diag": 1.778826117515564e-07, + "ewc_loss_parallel": 1.0636064189384342e-06, + "grad_norm": 3.8089797496795654, + "learning_rate": 5.171682916490038e-08, + "loss": 0.7746, + "mean_token_accuracy": 0.7806854248046875, + "num_tokens": 4741806.0, + "step": 123 + }, + { + "epoch": 0.015774074545223254, + "ewc_loss": 0.0002893218188546598, + "ewc_loss_diag": 1.825392246246338e-07, + "ewc_loss_parallel": 1.071700239663187e-06, + "grad_norm": 3.994018793106079, + "learning_rate": 5.2140737600678254e-08, + "loss": 0.8116, + "mean_token_accuracy": 0.765474796295166, + "num_tokens": 4780671.0, + "step": 124 + }, + { + "epoch": 0.015901284823813765, + "ewc_loss": 0.0002989618806168437, + "ewc_loss_diag": 1.909211277961731e-07, + "ewc_loss_parallel": 1.0822702734003542e-06, + "grad_norm": 4.050777435302734, + "learning_rate": 5.256464603645612e-08, + "loss": 0.7602, + "mean_token_accuracy": 0.7865599393844604, + "num_tokens": 4816879.0, + "step": 125 + }, + { + "epoch": 0.016028495102404273, + "ewc_loss": 0.00030719354981556535, + "ewc_loss_diag": 1.9744038581848145e-07, + "ewc_loss_parallel": 1.0978295676977723e-06, + "grad_norm": 3.8453361988067627, + "learning_rate": 5.298855447223399e-08, + "loss": 0.8132, + "mean_token_accuracy": 0.766836404800415, + "num_tokens": 4858704.0, + "step": 126 + }, + { + "epoch": 0.016155705380994784, + "ewc_loss": 0.00031955240410752594, + "ewc_loss_diag": 2.076849341392517e-07, + "ewc_loss_parallel": 1.1165141131641576e-06, + "grad_norm": 3.851292610168457, + "learning_rate": 5.3412462908011865e-08, + "loss": 0.7697, + "mean_token_accuracy": 0.7827739119529724, + "num_tokens": 4896780.0, + "step": 127 + }, + { + "epoch": 0.016282915659585295, + "ewc_loss": 0.00033086928306147456, + "ewc_loss_diag": 2.1606683731079102e-07, + "ewc_loss_parallel": 1.1438522733442369e-06, + "grad_norm": 4.060369968414307, + "learning_rate": 5.383637134378974e-08, + "loss": 0.7995, + "mean_token_accuracy": 0.7723513841629028, + "num_tokens": 4932108.0, + "step": 128 + }, + { + "epoch": 0.016410125938175806, + "ewc_loss": 0.00034306960878893733, + "ewc_loss_diag": 2.2444874048233032e-07, + "ewc_loss_parallel": 1.189561544379103e-06, + "grad_norm": 3.788039207458496, + "learning_rate": 5.426027977956761e-08, + "loss": 0.7753, + "mean_token_accuracy": 0.782103419303894, + "num_tokens": 4970384.0, + "step": 129 + }, + { + "epoch": 0.016537336216766316, + "ewc_loss": 0.00036465763696469367, + "ewc_loss_diag": 2.384185791015625e-07, + "ewc_loss_parallel": 1.2623905831787852e-06, + "grad_norm": 4.1446685791015625, + "learning_rate": 5.468418821534548e-08, + "loss": 0.7834, + "mean_token_accuracy": 0.776983916759491, + "num_tokens": 5005996.0, + "step": 130 + }, + { + "epoch": 0.016664546495356824, + "ewc_loss": 0.0003822289581876248, + "ewc_loss_diag": 2.4400651454925537e-07, + "ewc_loss_parallel": 1.380883304591407e-06, + "grad_norm": 3.8995866775512695, + "learning_rate": 5.5108096651123356e-08, + "loss": 0.7628, + "mean_token_accuracy": 0.7820370197296143, + "num_tokens": 5044208.0, + "step": 131 + }, + { + "epoch": 0.016791756773947335, + "ewc_loss": 0.00041208305628970265, + "ewc_loss_diag": 2.551823854446411e-07, + "ewc_loss_parallel": 1.5649834494979586e-06, + "grad_norm": 3.814971923828125, + "learning_rate": 5.553200508690123e-08, + "loss": 0.7766, + "mean_token_accuracy": 0.776856541633606, + "num_tokens": 5085173.0, + "step": 132 + }, + { + "epoch": 0.016918967052537846, + "ewc_loss": 0.0004524635733105242, + "ewc_loss_diag": 2.682209014892578e-07, + "ewc_loss_parallel": 1.8352741335547762e-06, + "grad_norm": 4.212623119354248, + "learning_rate": 5.59559135226791e-08, + "loss": 0.7916, + "mean_token_accuracy": 0.7744676470756531, + "num_tokens": 5118440.0, + "step": 133 + }, + { + "epoch": 0.017046177331128357, + "ewc_loss": 0.000495098065584898, + "ewc_loss_diag": 2.812594175338745e-07, + "ewc_loss_parallel": 2.1471780655701878e-06, + "grad_norm": 4.234402656555176, + "learning_rate": 5.637982195845697e-08, + "loss": 0.794, + "mean_token_accuracy": 0.7752307057380676, + "num_tokens": 5155202.0, + "step": 134 + }, + { + "epoch": 0.017173387609718864, + "ewc_loss": 0.0005602125311270356, + "ewc_loss_diag": 3.166496753692627e-07, + "ewc_loss_parallel": 2.4359262624784606e-06, + "grad_norm": 4.057627201080322, + "learning_rate": 5.680373039423484e-08, + "loss": 0.7787, + "mean_token_accuracy": 0.7805251479148865, + "num_tokens": 5193240.0, + "step": 135 + }, + { + "epoch": 0.017300597888309375, + "ewc_loss": 0.0006037313723936677, + "ewc_loss_diag": 3.3527612686157227e-07, + "ewc_loss_parallel": 2.6803800210473128e-06, + "grad_norm": 3.9490439891815186, + "learning_rate": 5.722763883001271e-08, + "loss": 0.7663, + "mean_token_accuracy": 0.7811964750289917, + "num_tokens": 5230975.0, + "step": 136 + }, + { + "epoch": 0.017427808166899886, + "ewc_loss": 0.0006381140556186438, + "ewc_loss_diag": 3.4831464290618896e-07, + "ewc_loss_parallel": 2.8906929401273374e-06, + "grad_norm": 4.231963157653809, + "learning_rate": 5.7651547265790585e-08, + "loss": 0.8264, + "mean_token_accuracy": 0.7642083168029785, + "num_tokens": 5264786.0, + "step": 137 + }, + { + "epoch": 0.017555018445490397, + "ewc_loss": 0.0006901394226588309, + "ewc_loss_diag": 3.818422555923462e-07, + "ewc_loss_parallel": 3.086697006438044e-06, + "grad_norm": 3.8462777137756348, + "learning_rate": 5.807545570156846e-08, + "loss": 0.7894, + "mean_token_accuracy": 0.7750978469848633, + "num_tokens": 5301554.0, + "step": 138 + }, + { + "epoch": 0.017682228724080904, + "ewc_loss": 0.000720020558219403, + "ewc_loss_diag": 3.948807716369629e-07, + "ewc_loss_parallel": 3.2519940305064665e-06, + "grad_norm": 3.8713605403900146, + "learning_rate": 5.849936413734633e-08, + "loss": 0.7285, + "mean_token_accuracy": 0.7914576530456543, + "num_tokens": 5337356.0, + "step": 139 + }, + { + "epoch": 0.017809439002671415, + "ewc_loss": 0.0007435014704242349, + "ewc_loss_diag": 4.041939973831177e-07, + "ewc_loss_parallel": 3.3914354844455374e-06, + "grad_norm": 3.7746694087982178, + "learning_rate": 5.89232725731242e-08, + "loss": 0.8002, + "mean_token_accuracy": 0.7750080823898315, + "num_tokens": 5375767.0, + "step": 140 + }, + { + "epoch": 0.017936649281261926, + "ewc_loss": 0.0007679217378608882, + "ewc_loss_diag": 4.172325134277344e-07, + "ewc_loss_parallel": 3.5021239455090836e-06, + "grad_norm": 3.745710611343384, + "learning_rate": 5.9347181008902075e-08, + "loss": 0.7511, + "mean_token_accuracy": 0.7839619517326355, + "num_tokens": 5413683.0, + "step": 141 + }, + { + "epoch": 0.018063859559852437, + "ewc_loss": 0.0007840355974622071, + "ewc_loss_diag": 4.246830940246582e-07, + "ewc_loss_parallel": 3.586968659874401e-06, + "grad_norm": 3.44364595413208, + "learning_rate": 5.977108944467995e-08, + "loss": 0.7334, + "mean_token_accuracy": 0.7888803482055664, + "num_tokens": 5457750.0, + "step": 142 + }, + { + "epoch": 0.018191069838442948, + "ewc_loss": 0.0007931283907964826, + "ewc_loss_diag": 4.284083843231201e-07, + "ewc_loss_parallel": 3.6397498206497403e-06, + "grad_norm": 3.675448417663574, + "learning_rate": 6.019499788045781e-08, + "loss": 0.7275, + "mean_token_accuracy": 0.7907487750053406, + "num_tokens": 5495743.0, + "step": 143 + }, + { + "epoch": 0.018318280117033455, + "ewc_loss": 0.0008046021685004234, + "ewc_loss_diag": 4.377216100692749e-07, + "ewc_loss_parallel": 3.6781934795726556e-06, + "grad_norm": 3.5955188274383545, + "learning_rate": 6.061890631623569e-08, + "loss": 0.7408, + "mean_token_accuracy": 0.7827454805374146, + "num_tokens": 5532198.0, + "step": 144 + }, + { + "epoch": 0.018445490395623966, + "ewc_loss": 0.0008070479962043464, + "ewc_loss_diag": 4.377216100692749e-07, + "ewc_loss_parallel": 3.7026516110927332e-06, + "grad_norm": 3.6467251777648926, + "learning_rate": 6.104281475201356e-08, + "loss": 0.7535, + "mean_token_accuracy": 0.783065676689148, + "num_tokens": 5568977.0, + "step": 145 + }, + { + "epoch": 0.018572700674214477, + "ewc_loss": 0.0008125546155497432, + "ewc_loss_diag": 4.414469003677368e-07, + "ewc_loss_parallel": 3.719570941029815e-06, + "grad_norm": 3.6495795249938965, + "learning_rate": 6.146672318779143e-08, + "loss": 0.7663, + "mean_token_accuracy": 0.7744837403297424, + "num_tokens": 5606229.0, + "step": 146 + }, + { + "epoch": 0.018699910952804988, + "ewc_loss": 0.0008174928952939808, + "ewc_loss_diag": 4.4517219066619873e-07, + "ewc_loss_parallel": 3.7308066112018423e-06, + "grad_norm": 3.289486885070801, + "learning_rate": 6.18906316235693e-08, + "loss": 0.7679, + "mean_token_accuracy": 0.7772600650787354, + "num_tokens": 5649828.0, + "step": 147 + }, + { + "epoch": 0.018827121231395495, + "ewc_loss": 0.0008281854097731411, + "ewc_loss_diag": 4.5634806156158447e-07, + "ewc_loss_parallel": 3.7232907743600663e-06, + "grad_norm": 3.439722776412964, + "learning_rate": 6.231454005934718e-08, + "loss": 0.7138, + "mean_token_accuracy": 0.7976592779159546, + "num_tokens": 5687433.0, + "step": 148 + }, + { + "epoch": 0.018954331509986006, + "ewc_loss": 0.0008256370783783495, + "ewc_loss_diag": 4.544854164123535e-07, + "ewc_loss_parallel": 3.7168811104493216e-06, + "grad_norm": 3.5185165405273438, + "learning_rate": 6.273844849512505e-08, + "loss": 0.7448, + "mean_token_accuracy": 0.7838608622550964, + "num_tokens": 5723247.0, + "step": 149 + }, + { + "epoch": 0.019081541788576517, + "ewc_loss": 0.0008333057630807161, + "ewc_loss_diag": 4.6193599700927734e-07, + "ewc_loss_parallel": 3.7172735574131366e-06, + "grad_norm": 3.409571409225464, + "learning_rate": 6.316235693090292e-08, + "loss": 0.7599, + "mean_token_accuracy": 0.7790826559066772, + "num_tokens": 5760305.0, + "step": 150 + }, + { + "epoch": 0.019208752067167028, + "ewc_loss": 0.0008353812154382467, + "ewc_loss_diag": 4.6566128730773926e-07, + "ewc_loss_parallel": 3.6998810628574574e-06, + "grad_norm": 3.2504403591156006, + "learning_rate": 6.35862653666808e-08, + "loss": 0.7609, + "mean_token_accuracy": 0.7777302265167236, + "num_tokens": 5800586.0, + "step": 151 + }, + { + "epoch": 0.01933596234575754, + "ewc_loss": 0.0008423265535384417, + "ewc_loss_diag": 4.7497451305389404e-07, + "ewc_loss_parallel": 3.6739675124408677e-06, + "grad_norm": 3.327293634414673, + "learning_rate": 6.401017380245867e-08, + "loss": 0.7524, + "mean_token_accuracy": 0.7795425057411194, + "num_tokens": 5840351.0, + "step": 152 + }, + { + "epoch": 0.019463172624348046, + "ewc_loss": 0.0008416672935709357, + "ewc_loss_diag": 4.76837158203125e-07, + "ewc_loss_parallel": 3.6483015719568357e-06, + "grad_norm": 3.327889919281006, + "learning_rate": 6.443408223823654e-08, + "loss": 0.7931, + "mean_token_accuracy": 0.7712817192077637, + "num_tokens": 5880162.0, + "step": 153 + }, + { + "epoch": 0.019590382902938557, + "ewc_loss": 0.0008433801122009754, + "ewc_loss_diag": 4.805624485015869e-07, + "ewc_loss_parallel": 3.627282922025188e-06, + "grad_norm": 3.4786300659179688, + "learning_rate": 6.485799067401441e-08, + "loss": 0.8487, + "mean_token_accuracy": 0.7559289336204529, + "num_tokens": 5922213.0, + "step": 154 + }, + { + "epoch": 0.019717593181529068, + "ewc_loss": 0.0008412462193518877, + "ewc_loss_diag": 4.805624485015869e-07, + "ewc_loss_parallel": 3.605943675211165e-06, + "grad_norm": 3.3740274906158447, + "learning_rate": 6.528189910979228e-08, + "loss": 0.7302, + "mean_token_accuracy": 0.7869173884391785, + "num_tokens": 5957461.0, + "step": 155 + }, + { + "epoch": 0.01984480346011958, + "ewc_loss": 0.0008383687818422914, + "ewc_loss_diag": 4.805624485015869e-07, + "ewc_loss_parallel": 3.577169081836473e-06, + "grad_norm": 3.2655179500579834, + "learning_rate": 6.570580754557016e-08, + "loss": 0.7154, + "mean_token_accuracy": 0.7856046557426453, + "num_tokens": 5994675.0, + "step": 156 + }, + { + "epoch": 0.019972013738710086, + "ewc_loss": 0.0008352301665581763, + "ewc_loss_diag": 4.805624485015869e-07, + "ewc_loss_parallel": 3.5457831017993158e-06, + "grad_norm": 3.303309202194214, + "learning_rate": 6.612971598134802e-08, + "loss": 0.6786, + "mean_token_accuracy": 0.801680326461792, + "num_tokens": 6032514.0, + "step": 157 + }, + { + "epoch": 0.020099224017300597, + "ewc_loss": 0.0008323559886775911, + "ewc_loss_diag": 4.805624485015869e-07, + "ewc_loss_parallel": 3.517041477607563e-06, + "grad_norm": 3.3805794715881348, + "learning_rate": 6.655362441712589e-08, + "loss": 0.7505, + "mean_token_accuracy": 0.780363142490387, + "num_tokens": 6069160.0, + "step": 158 + }, + { + "epoch": 0.020226434295891108, + "ewc_loss": 0.0008299853070639074, + "ewc_loss_diag": 4.805624485015869e-07, + "ewc_loss_parallel": 3.49333458871115e-06, + "grad_norm": 3.3987035751342773, + "learning_rate": 6.697753285290376e-08, + "loss": 0.8047, + "mean_token_accuracy": 0.7652815580368042, + "num_tokens": 6106652.0, + "step": 159 + }, + { + "epoch": 0.02035364457448162, + "ewc_loss": 0.0008314662845805287, + "ewc_loss_diag": 4.842877388000488e-07, + "ewc_loss_parallel": 3.4699974094110075e-06, + "grad_norm": 3.424743890762329, + "learning_rate": 6.740144128868163e-08, + "loss": 0.7392, + "mean_token_accuracy": 0.7859691381454468, + "num_tokens": 6143070.0, + "step": 160 + }, + { + "epoch": 0.020480854853072127, + "ewc_loss": 0.0008292733109556139, + "ewc_loss_diag": 4.842877388000488e-07, + "ewc_loss_parallel": 3.4480674457881832e-06, + "grad_norm": 3.0947930812835693, + "learning_rate": 6.78253497244595e-08, + "loss": 0.7319, + "mean_token_accuracy": 0.7871524095535278, + "num_tokens": 6186495.0, + "step": 161 + }, + { + "epoch": 0.020608065131662637, + "ewc_loss": 0.000829671451356262, + "ewc_loss_diag": 4.880130290985107e-07, + "ewc_loss_parallel": 3.4139020499424078e-06, + "grad_norm": 3.5198397636413574, + "learning_rate": 6.824925816023738e-08, + "loss": 0.732, + "mean_token_accuracy": 0.7871538400650024, + "num_tokens": 6222454.0, + "step": 162 + }, + { + "epoch": 0.02073527541025315, + "ewc_loss": 0.0008322347421199083, + "ewc_loss_diag": 4.917383193969727e-07, + "ewc_loss_parallel": 3.4013876302196877e-06, + "grad_norm": 3.7626287937164307, + "learning_rate": 6.867316659601525e-08, + "loss": 0.7437, + "mean_token_accuracy": 0.7853783965110779, + "num_tokens": 6253260.0, + "step": 163 + }, + { + "epoch": 0.02086248568884366, + "ewc_loss": 0.0008357973420061171, + "ewc_loss_diag": 4.954636096954346e-07, + "ewc_loss_parallel": 3.398866965653724e-06, + "grad_norm": 3.006911039352417, + "learning_rate": 6.909707503179312e-08, + "loss": 0.7254, + "mean_token_accuracy": 0.7909374833106995, + "num_tokens": 6294773.0, + "step": 164 + }, + { + "epoch": 0.02098969596743417, + "ewc_loss": 0.0008363488013856113, + "ewc_loss_diag": 4.991888999938965e-07, + "ewc_loss_parallel": 3.366234523127787e-06, + "grad_norm": 3.1878058910369873, + "learning_rate": 6.9520983467571e-08, + "loss": 0.7688, + "mean_token_accuracy": 0.776786208152771, + "num_tokens": 6335334.0, + "step": 165 + }, + { + "epoch": 0.021116906246024678, + "ewc_loss": 0.0008498630486428738, + "ewc_loss_diag": 5.140900611877441e-07, + "ewc_loss_parallel": 3.3487895052530803e-06, + "grad_norm": 3.3011324405670166, + "learning_rate": 6.994489190334887e-08, + "loss": 0.704, + "mean_token_accuracy": 0.7922652959823608, + "num_tokens": 6372255.0, + "step": 166 + }, + { + "epoch": 0.02124411652461519, + "ewc_loss": 0.0008535012020729482, + "ewc_loss_diag": 5.178153514862061e-07, + "ewc_loss_parallel": 3.347023721289588e-06, + "grad_norm": 2.965954065322876, + "learning_rate": 7.036880033912674e-08, + "loss": 0.6807, + "mean_token_accuracy": 0.8010852932929993, + "num_tokens": 6411971.0, + "step": 167 + }, + { + "epoch": 0.0213713268032057, + "ewc_loss": 0.0008597144624218345, + "ewc_loss_diag": 5.252659320831299e-07, + "ewc_loss_parallel": 3.3328622066619573e-06, + "grad_norm": 3.4738664627075195, + "learning_rate": 7.079270877490461e-08, + "loss": 0.7505, + "mean_token_accuracy": 0.7804635167121887, + "num_tokens": 6450366.0, + "step": 168 + }, + { + "epoch": 0.02149853708179621, + "ewc_loss": 0.0008717486052773893, + "ewc_loss_diag": 5.364418029785156e-07, + "ewc_loss_parallel": 3.3387627809133846e-06, + "grad_norm": 3.5200085639953613, + "learning_rate": 7.121661721068249e-08, + "loss": 0.7128, + "mean_token_accuracy": 0.7899201512336731, + "num_tokens": 6482199.0, + "step": 169 + }, + { + "epoch": 0.021625747360386718, + "ewc_loss": 0.0008695422438904643, + "ewc_loss_diag": 5.327165126800537e-07, + "ewc_loss_parallel": 3.354846285219537e-06, + "grad_norm": 3.2321689128875732, + "learning_rate": 7.164052564646036e-08, + "loss": 0.6951, + "mean_token_accuracy": 0.7986329793930054, + "num_tokens": 6519697.0, + "step": 170 + }, + { + "epoch": 0.02175295763897723, + "ewc_loss": 0.0008857323555275798, + "ewc_loss_diag": 5.476176738739014e-07, + "ewc_loss_parallel": 3.3641597383393673e-06, + "grad_norm": 3.343043088912964, + "learning_rate": 7.206443408223823e-08, + "loss": 0.7547, + "mean_token_accuracy": 0.7793744802474976, + "num_tokens": 6554687.0, + "step": 171 + }, + { + "epoch": 0.02188016791756774, + "ewc_loss": 0.0008955231169238687, + "ewc_loss_diag": 5.587935447692871e-07, + "ewc_loss_parallel": 3.38577342517965e-06, + "grad_norm": 3.350515604019165, + "learning_rate": 7.24883425180161e-08, + "loss": 0.7179, + "mean_token_accuracy": 0.7835988998413086, + "num_tokens": 6593066.0, + "step": 172 + }, + { + "epoch": 0.02200737819615825, + "ewc_loss": 0.0008984776213765144, + "ewc_loss_diag": 5.587935447692871e-07, + "ewc_loss_parallel": 3.4153183605667436e-06, + "grad_norm": 3.1007802486419678, + "learning_rate": 7.291225095379398e-08, + "loss": 0.676, + "mean_token_accuracy": 0.8031926155090332, + "num_tokens": 6635484.0, + "step": 173 + }, + { + "epoch": 0.02213458847474876, + "ewc_loss": 0.0009051027009263635, + "ewc_loss_diag": 5.62518835067749e-07, + "ewc_loss_parallel": 3.4434219742252026e-06, + "grad_norm": 3.047574281692505, + "learning_rate": 7.333615938957185e-08, + "loss": 0.7682, + "mean_token_accuracy": 0.7768030166625977, + "num_tokens": 6677090.0, + "step": 174 + }, + { + "epoch": 0.02226179875333927, + "ewc_loss": 0.0009162714704871178, + "ewc_loss_diag": 5.699694156646729e-07, + "ewc_loss_parallel": 3.478815415292047e-06, + "grad_norm": 3.326245069503784, + "learning_rate": 7.376006782534971e-08, + "loss": 0.6828, + "mean_token_accuracy": 0.7970843315124512, + "num_tokens": 6710250.0, + "step": 175 + }, + { + "epoch": 0.02238900903192978, + "ewc_loss": 0.0009337185765616596, + "ewc_loss_diag": 5.811452865600586e-07, + "ewc_loss_parallel": 3.5388459309615428e-06, + "grad_norm": 2.9314780235290527, + "learning_rate": 7.418397626112758e-08, + "loss": 0.7097, + "mean_token_accuracy": 0.7895039319992065, + "num_tokens": 6752998.0, + "step": 176 + }, + { + "epoch": 0.02251621931052029, + "ewc_loss": 0.0009462409652769566, + "ewc_loss_diag": 5.885958671569824e-07, + "ewc_loss_parallel": 3.5877762911695754e-06, + "grad_norm": 3.304720163345337, + "learning_rate": 7.460788469690545e-08, + "loss": 0.7104, + "mean_token_accuracy": 0.7948204874992371, + "num_tokens": 6789568.0, + "step": 177 + }, + { + "epoch": 0.0226434295891108, + "ewc_loss": 0.0010375198908150196, + "ewc_loss_diag": 6.705522537231445e-07, + "ewc_loss_parallel": 3.66133122042811e-06, + "grad_norm": 11.869390487670898, + "learning_rate": 7.503179313268333e-08, + "loss": 0.7507, + "mean_token_accuracy": 0.7790154218673706, + "num_tokens": 6822810.0, + "step": 178 + }, + { + "epoch": 0.02277063986770131, + "ewc_loss": 0.0009907415369525552, + "ewc_loss_diag": 6.07222318649292e-07, + "ewc_loss_parallel": 3.842047135549365e-06, + "grad_norm": 3.099409341812134, + "learning_rate": 7.54557015684612e-08, + "loss": 0.6896, + "mean_token_accuracy": 0.7948775887489319, + "num_tokens": 6861598.0, + "step": 179 + }, + { + "epoch": 0.02289785014629182, + "ewc_loss": 0.0010125139961019158, + "ewc_loss_diag": 6.221234798431396e-07, + "ewc_loss_parallel": 3.907183781848289e-06, + "grad_norm": 3.025289297103882, + "learning_rate": 7.587961000423907e-08, + "loss": 0.7117, + "mean_token_accuracy": 0.7923677563667297, + "num_tokens": 6900220.0, + "step": 180 + }, + { + "epoch": 0.02302506042488233, + "ewc_loss": 0.0010307537158951163, + "ewc_loss_diag": 6.332993507385254e-07, + "ewc_loss_parallel": 3.975139406975359e-06, + "grad_norm": 3.0348501205444336, + "learning_rate": 7.630351844001694e-08, + "loss": 0.8197, + "mean_token_accuracy": 0.7609233260154724, + "num_tokens": 6946983.0, + "step": 181 + }, + { + "epoch": 0.02315227070347284, + "ewc_loss": 0.0010609261225908995, + "ewc_loss_diag": 6.51925802230835e-07, + "ewc_loss_parallel": 4.086128228664165e-06, + "grad_norm": 3.19156813621521, + "learning_rate": 7.672742687579482e-08, + "loss": 0.7238, + "mean_token_accuracy": 0.7888374924659729, + "num_tokens": 6986019.0, + "step": 182 + }, + { + "epoch": 0.02327948098206335, + "ewc_loss": 0.0010807814542204142, + "ewc_loss_diag": 6.593763828277588e-07, + "ewc_loss_parallel": 4.208388418192044e-06, + "grad_norm": 3.7657487392425537, + "learning_rate": 7.715133531157269e-08, + "loss": 0.6973, + "mean_token_accuracy": 0.790746808052063, + "num_tokens": 7020722.0, + "step": 183 + }, + { + "epoch": 0.02340669126065386, + "ewc_loss": 0.0010948353447020054, + "ewc_loss_diag": 6.593763828277588e-07, + "ewc_loss_parallel": 4.348927177488804e-06, + "grad_norm": 3.50290584564209, + "learning_rate": 7.757524374735056e-08, + "loss": 0.7684, + "mean_token_accuracy": 0.7760382890701294, + "num_tokens": 7062691.0, + "step": 184 + }, + { + "epoch": 0.02353390153924437, + "ewc_loss": 0.0011210584780201316, + "ewc_loss_diag": 6.705522537231445e-07, + "ewc_loss_parallel": 4.49671779279015e-06, + "grad_norm": 3.1802239418029785, + "learning_rate": 7.799915218312844e-08, + "loss": 0.7158, + "mean_token_accuracy": 0.7909860014915466, + "num_tokens": 7101090.0, + "step": 185 + }, + { + "epoch": 0.02366111181783488, + "ewc_loss": 0.0011441222159191966, + "ewc_loss_diag": 6.780028343200684e-07, + "ewc_loss_parallel": 4.651061317417771e-06, + "grad_norm": 3.026872396469116, + "learning_rate": 7.842306061890631e-08, + "loss": 0.6888, + "mean_token_accuracy": 0.792034387588501, + "num_tokens": 7139891.0, + "step": 186 + }, + { + "epoch": 0.023788322096425393, + "ewc_loss": 0.0011642740573734045, + "ewc_loss_diag": 6.817281246185303e-07, + "ewc_loss_parallel": 4.814432486455189e-06, + "grad_norm": 3.197511911392212, + "learning_rate": 7.88469690546842e-08, + "loss": 0.7363, + "mean_token_accuracy": 0.7830324769020081, + "num_tokens": 7179501.0, + "step": 187 + }, + { + "epoch": 0.0239155323750159, + "ewc_loss": 0.0011922282865270972, + "ewc_loss_diag": 6.92903995513916e-07, + "ewc_loss_parallel": 4.9795339691627305e-06, + "grad_norm": 3.454960823059082, + "learning_rate": 7.927087749046207e-08, + "loss": 0.7052, + "mean_token_accuracy": 0.7906781435012817, + "num_tokens": 7213809.0, + "step": 188 + }, + { + "epoch": 0.02404274265360641, + "ewc_loss": 0.0012212868314236403, + "ewc_loss_diag": 7.040798664093018e-07, + "ewc_loss_parallel": 5.15567899128655e-06, + "grad_norm": 3.9969325065612793, + "learning_rate": 7.969478592623994e-08, + "loss": 0.7797, + "mean_token_accuracy": 0.7686359882354736, + "num_tokens": 7254493.0, + "step": 189 + }, + { + "epoch": 0.024169952932196922, + "ewc_loss": 0.0012511431705206633, + "ewc_loss_diag": 7.189810276031494e-07, + "ewc_loss_parallel": 5.33980164618697e-06, + "grad_norm": 3.3441171646118164, + "learning_rate": 8.011869436201781e-08, + "loss": 0.6371, + "mean_token_accuracy": 0.8112084865570068, + "num_tokens": 7292643.0, + "step": 190 + }, + { + "epoch": 0.024297163210787433, + "ewc_loss": 0.0012858799891546369, + "ewc_loss_diag": 7.37607479095459e-07, + "ewc_loss_parallel": 5.496433914231602e-06, + "grad_norm": 3.295706033706665, + "learning_rate": 8.054260279779568e-08, + "loss": 0.7589, + "mean_token_accuracy": 0.7791359424591064, + "num_tokens": 7329810.0, + "step": 191 + }, + { + "epoch": 0.02442437348937794, + "ewc_loss": 0.0013236472150310874, + "ewc_loss_diag": 7.599592208862305e-07, + "ewc_loss_parallel": 5.645224518957548e-06, + "grad_norm": 3.4413232803344727, + "learning_rate": 8.096651123357356e-08, + "loss": 0.7449, + "mean_token_accuracy": 0.7799975872039795, + "num_tokens": 7367630.0, + "step": 192 + }, + { + "epoch": 0.02455158376796845, + "ewc_loss": 0.0014383799862116575, + "ewc_loss_diag": 8.568167686462402e-07, + "ewc_loss_parallel": 5.800731742056087e-06, + "grad_norm": 11.861684799194336, + "learning_rate": 8.139041966935143e-08, + "loss": 0.727, + "mean_token_accuracy": 0.7852466702461243, + "num_tokens": 7411580.0, + "step": 193 + }, + { + "epoch": 0.024678794046558962, + "ewc_loss": 0.001400127774104476, + "ewc_loss_diag": 7.897615432739258e-07, + "ewc_loss_parallel": 6.104854037403129e-06, + "grad_norm": 3.487530469894409, + "learning_rate": 8.181432810512929e-08, + "loss": 0.6805, + "mean_token_accuracy": 0.7984491586685181, + "num_tokens": 7451631.0, + "step": 194 + }, + { + "epoch": 0.024806004325149473, + "ewc_loss": 0.0014105986338108778, + "ewc_loss_diag": 7.860362529754639e-07, + "ewc_loss_parallel": 6.2477101892000064e-06, + "grad_norm": 3.9730801582336426, + "learning_rate": 8.223823654090716e-08, + "loss": 0.6896, + "mean_token_accuracy": 0.7950122356414795, + "num_tokens": 7493645.0, + "step": 195 + }, + { + "epoch": 0.024933214603739984, + "ewc_loss": 0.0014417143538594246, + "ewc_loss_diag": 8.046627044677734e-07, + "ewc_loss_parallel": 6.368131835188251e-06, + "grad_norm": 4.11984395980835, + "learning_rate": 8.266214497668503e-08, + "loss": 0.65, + "mean_token_accuracy": 0.8082858324050903, + "num_tokens": 7538042.0, + "step": 196 + }, + { + "epoch": 0.02506042488233049, + "ewc_loss": 0.0014603901654481888, + "ewc_loss_diag": 8.121132850646973e-07, + "ewc_loss_parallel": 6.4785963331814855e-06, + "grad_norm": 3.4562504291534424, + "learning_rate": 8.30860534124629e-08, + "loss": 0.7121, + "mean_token_accuracy": 0.7888089418411255, + "num_tokens": 7575374.0, + "step": 197 + }, + { + "epoch": 0.025187635160921002, + "ewc_loss": 0.001502740429714322, + "ewc_loss_diag": 8.493661880493164e-07, + "ewc_loss_parallel": 6.520629085571272e-06, + "grad_norm": 4.2992401123046875, + "learning_rate": 8.350996184824078e-08, + "loss": 0.7038, + "mean_token_accuracy": 0.7904607057571411, + "num_tokens": 7606880.0, + "step": 198 + }, + { + "epoch": 0.025314845439511513, + "ewc_loss": 0.0015324435662478209, + "ewc_loss_diag": 8.717179298400879e-07, + "ewc_loss_parallel": 6.5887788878171705e-06, + "grad_norm": 3.8012120723724365, + "learning_rate": 8.393387028401865e-08, + "loss": 0.682, + "mean_token_accuracy": 0.7995498776435852, + "num_tokens": 7644840.0, + "step": 199 + }, + { + "epoch": 0.025442055718102024, + "ewc_loss": 0.0015372014604508877, + "ewc_loss_diag": 8.717179298400879e-07, + "ewc_loss_parallel": 6.636357284151018e-06, + "grad_norm": 4.201877117156982, + "learning_rate": 8.435777871979652e-08, + "loss": 0.7232, + "mean_token_accuracy": 0.7853130102157593, + "num_tokens": 7683856.0, + "step": 200 + }, + { + "epoch": 0.02556926599669253, + "ewc_loss": 0.0015514923725277185, + "ewc_loss_diag": 8.828938007354736e-07, + "ewc_loss_parallel": 6.702972314087674e-06, + "grad_norm": 3.7953131198883057, + "learning_rate": 8.47816871555744e-08, + "loss": 0.6798, + "mean_token_accuracy": 0.7978370189666748, + "num_tokens": 7715306.0, + "step": 201 + }, + { + "epoch": 0.025696476275283042, + "ewc_loss": 0.001571831526234746, + "ewc_loss_diag": 8.977949619293213e-07, + "ewc_loss_parallel": 6.75377668812871e-06, + "grad_norm": 5.94434118270874, + "learning_rate": 8.520559559135227e-08, + "loss": 0.6419, + "mean_token_accuracy": 0.8078411817550659, + "num_tokens": 7752442.0, + "step": 202 + }, + { + "epoch": 0.025823686553873553, + "ewc_loss": 0.0015892509836703539, + "ewc_loss_diag": 9.015202522277832e-07, + "ewc_loss_parallel": 6.889823453093413e-06, + "grad_norm": 6.585144996643066, + "learning_rate": 8.562950402713014e-08, + "loss": 0.7272, + "mean_token_accuracy": 0.7868369817733765, + "num_tokens": 7799891.0, + "step": 203 + }, + { + "epoch": 0.025950896832464064, + "ewc_loss": 0.0016085810493677855, + "ewc_loss_diag": 9.052455425262451e-07, + "ewc_loss_parallel": 7.044978246995015e-06, + "grad_norm": 3.804781675338745, + "learning_rate": 8.605341246290801e-08, + "loss": 0.7133, + "mean_token_accuracy": 0.7856937646865845, + "num_tokens": 7830696.0, + "step": 204 + }, + { + "epoch": 0.026078107111054575, + "ewc_loss": 0.0016276923706755042, + "ewc_loss_diag": 9.238719940185547e-07, + "ewc_loss_parallel": 7.045356142043602e-06, + "grad_norm": 3.3679940700531006, + "learning_rate": 8.647732089868589e-08, + "loss": 0.6854, + "mean_token_accuracy": 0.7976288795471191, + "num_tokens": 7870968.0, + "step": 205 + }, + { + "epoch": 0.026205317389645082, + "ewc_loss": 0.0016281197313219309, + "ewc_loss_diag": 9.313225746154785e-07, + "ewc_loss_parallel": 6.973335985094309e-06, + "grad_norm": 3.8926022052764893, + "learning_rate": 8.690122933446376e-08, + "loss": 0.7102, + "mean_token_accuracy": 0.7901864051818848, + "num_tokens": 7914418.0, + "step": 206 + }, + { + "epoch": 0.026332527668235593, + "ewc_loss": 0.0016256889794021845, + "ewc_loss_diag": 9.350478649139404e-07, + "ewc_loss_parallel": 6.910881893418264e-06, + "grad_norm": 3.895524024963379, + "learning_rate": 8.732513777024163e-08, + "loss": 0.6475, + "mean_token_accuracy": 0.8114708662033081, + "num_tokens": 7954544.0, + "step": 207 + }, + { + "epoch": 0.026459737946826104, + "ewc_loss": 0.0016358126886188984, + "ewc_loss_diag": 9.499490261077881e-07, + "ewc_loss_parallel": 6.8595304583141115e-06, + "grad_norm": 3.002937078475952, + "learning_rate": 8.77490462060195e-08, + "loss": 0.6957, + "mean_token_accuracy": 0.7964330911636353, + "num_tokens": 7997373.0, + "step": 208 + }, + { + "epoch": 0.026586948225416615, + "ewc_loss": 0.0016296871472150087, + "ewc_loss_diag": 9.5367431640625e-07, + "ewc_loss_parallel": 6.760128599125892e-06, + "grad_norm": 4.736880302429199, + "learning_rate": 8.817295464179738e-08, + "loss": 0.659, + "mean_token_accuracy": 0.7994205951690674, + "num_tokens": 8033314.0, + "step": 209 + }, + { + "epoch": 0.026714158504007122, + "ewc_loss": 0.0016425957437604666, + "ewc_loss_diag": 9.611248970031738e-07, + "ewc_loss_parallel": 6.8129206738376524e-06, + "grad_norm": 3.2985594272613525, + "learning_rate": 8.859686307757525e-08, + "loss": 0.6785, + "mean_token_accuracy": 0.7983112931251526, + "num_tokens": 8069362.0, + "step": 210 + }, + { + "epoch": 0.026841368782597633, + "ewc_loss": 0.0016392564866691828, + "ewc_loss_diag": 9.611248970031738e-07, + "ewc_loss_parallel": 6.7795281211147085e-06, + "grad_norm": 5.299745082855225, + "learning_rate": 8.902077151335312e-08, + "loss": 0.7116, + "mean_token_accuracy": 0.7870131134986877, + "num_tokens": 8109126.0, + "step": 211 + }, + { + "epoch": 0.026968579061188144, + "ewc_loss": 0.0016455805161967874, + "ewc_loss_diag": 9.611248970031738e-07, + "ewc_loss_parallel": 6.8427680162130855e-06, + "grad_norm": 4.3921427726745605, + "learning_rate": 8.944467994913098e-08, + "loss": 0.7644, + "mean_token_accuracy": 0.7784276008605957, + "num_tokens": 8143083.0, + "step": 212 + }, + { + "epoch": 0.027095789339778655, + "ewc_loss": 0.0016498814802616835, + "ewc_loss_diag": 9.611248970031738e-07, + "ewc_loss_parallel": 6.885777565912576e-06, + "grad_norm": 4.184386253356934, + "learning_rate": 8.986858838490885e-08, + "loss": 0.6151, + "mean_token_accuracy": 0.8112788200378418, + "num_tokens": 8176636.0, + "step": 213 + }, + { + "epoch": 0.027222999618369163, + "ewc_loss": 0.0016500360798090696, + "ewc_loss_diag": 9.611248970031738e-07, + "ewc_loss_parallel": 6.88732370690559e-06, + "grad_norm": 2.992264747619629, + "learning_rate": 9.029249682068673e-08, + "loss": 0.6754, + "mean_token_accuracy": 0.7976685762405396, + "num_tokens": 8220262.0, + "step": 214 + }, + { + "epoch": 0.027350209896959674, + "ewc_loss": 0.0016392009565606713, + "ewc_loss_diag": 9.611248970031738e-07, + "ewc_loss_parallel": 6.778972419851925e-06, + "grad_norm": 3.8633012771606445, + "learning_rate": 9.07164052564646e-08, + "loss": 0.6927, + "mean_token_accuracy": 0.7950614094734192, + "num_tokens": 8256303.0, + "step": 215 + }, + { + "epoch": 0.027477420175550184, + "ewc_loss": 0.0016421910841017962, + "ewc_loss_diag": 9.685754776000977e-07, + "ewc_loss_parallel": 6.732579095114488e-06, + "grad_norm": 4.620268821716309, + "learning_rate": 9.114031369224247e-08, + "loss": 0.6596, + "mean_token_accuracy": 0.8058813214302063, + "num_tokens": 8293888.0, + "step": 216 + }, + { + "epoch": 0.027604630454140695, + "ewc_loss": 0.0016586213605478406, + "ewc_loss_diag": 9.834766387939453e-07, + "ewc_loss_parallel": 6.744294751115376e-06, + "grad_norm": 3.7134807109832764, + "learning_rate": 9.156422212802034e-08, + "loss": 0.6209, + "mean_token_accuracy": 0.8132199645042419, + "num_tokens": 8335368.0, + "step": 217 + }, + { + "epoch": 0.027731840732731206, + "ewc_loss": 0.0016923720249906182, + "ewc_loss_diag": 1.0207295417785645e-06, + "ewc_loss_parallel": 6.700331596221076e-06, + "grad_norm": 6.114353656768799, + "learning_rate": 9.198813056379822e-08, + "loss": 0.6748, + "mean_token_accuracy": 0.8005077838897705, + "num_tokens": 8370338.0, + "step": 218 + }, + { + "epoch": 0.027859051011321714, + "ewc_loss": 0.0016765948385000229, + "ewc_loss_diag": 9.98377799987793e-07, + "ewc_loss_parallel": 6.7714413489738945e-06, + "grad_norm": 3.714303970336914, + "learning_rate": 9.241203899957609e-08, + "loss": 0.6606, + "mean_token_accuracy": 0.8018555641174316, + "num_tokens": 8409904.0, + "step": 219 + }, + { + "epoch": 0.027986261289912225, + "ewc_loss": 0.0016799414297565818, + "ewc_loss_diag": 1.0058283805847168e-06, + "ewc_loss_parallel": 6.728613698214758e-06, + "grad_norm": 3.734790802001953, + "learning_rate": 9.283594743535396e-08, + "loss": 0.7877, + "mean_token_accuracy": 0.7690469622612, + "num_tokens": 8446579.0, + "step": 220 + }, + { + "epoch": 0.028113471568502735, + "ewc_loss": 0.0016808134969323874, + "ewc_loss_diag": 1.0132789611816406e-06, + "ewc_loss_parallel": 6.661040970357135e-06, + "grad_norm": 4.067516326904297, + "learning_rate": 9.325985587113183e-08, + "loss": 0.6361, + "mean_token_accuracy": 0.8108148574829102, + "num_tokens": 8483663.0, + "step": 221 + }, + { + "epoch": 0.028240681847093246, + "ewc_loss": 0.0016762627055868506, + "ewc_loss_diag": 1.0132789611816406e-06, + "ewc_loss_parallel": 6.615532129217172e-06, + "grad_norm": 3.7995588779449463, + "learning_rate": 9.368376430690971e-08, + "loss": 0.6646, + "mean_token_accuracy": 0.8009459972381592, + "num_tokens": 8516346.0, + "step": 222 + }, + { + "epoch": 0.028367892125683754, + "ewc_loss": 0.0016929125413298607, + "ewc_loss_diag": 1.0356307029724121e-06, + "ewc_loss_parallel": 6.553149432875216e-06, + "grad_norm": 3.63881778717041, + "learning_rate": 9.410767274268758e-08, + "loss": 0.6622, + "mean_token_accuracy": 0.8027766942977905, + "num_tokens": 8556914.0, + "step": 223 + }, + { + "epoch": 0.028495102404274265, + "ewc_loss": 0.0016706236638128757, + "ewc_loss_diag": 1.0207295417785645e-06, + "ewc_loss_parallel": 6.482847766164923e-06, + "grad_norm": 3.897094488143921, + "learning_rate": 9.453158117846545e-08, + "loss": 0.6511, + "mean_token_accuracy": 0.8022081851959229, + "num_tokens": 8596548.0, + "step": 224 + }, + { + "epoch": 0.028622312682864776, + "ewc_loss": 0.001667336910031736, + "ewc_loss_diag": 1.0207295417785645e-06, + "ewc_loss_parallel": 6.449980446632253e-06, + "grad_norm": 5.006571292877197, + "learning_rate": 9.495548961424333e-08, + "loss": 0.6811, + "mean_token_accuracy": 0.8007537722587585, + "num_tokens": 8634366.0, + "step": 225 + }, + { + "epoch": 0.028749522961455286, + "ewc_loss": 0.0016711992211639881, + "ewc_loss_diag": 1.0207295417785645e-06, + "ewc_loss_parallel": 6.488603503385093e-06, + "grad_norm": 4.884306907653809, + "learning_rate": 9.53793980500212e-08, + "loss": 0.6211, + "mean_token_accuracy": 0.8109502792358398, + "num_tokens": 8666480.0, + "step": 226 + }, + { + "epoch": 0.028876733240045797, + "ewc_loss": 0.0016810910310596228, + "ewc_loss_diag": 1.0281801223754883e-06, + "ewc_loss_parallel": 6.5112276388390455e-06, + "grad_norm": 5.720471382141113, + "learning_rate": 9.580330648579907e-08, + "loss": 0.7707, + "mean_token_accuracy": 0.7763099670410156, + "num_tokens": 8705880.0, + "step": 227 + }, + { + "epoch": 0.029003943518636305, + "ewc_loss": 0.0017159656854346395, + "ewc_loss_diag": 1.0579824447631836e-06, + "ewc_loss_parallel": 6.55479834676953e-06, + "grad_norm": 4.170193195343018, + "learning_rate": 9.622721492157694e-08, + "loss": 0.632, + "mean_token_accuracy": 0.8096908330917358, + "num_tokens": 8743772.0, + "step": 228 + }, + { + "epoch": 0.029131153797226816, + "ewc_loss": 0.0017023759428411722, + "ewc_loss_diag": 1.0505318641662598e-06, + "ewc_loss_parallel": 6.4951946114888415e-06, + "grad_norm": 4.502719402313232, + "learning_rate": 9.665112335735482e-08, + "loss": 0.6573, + "mean_token_accuracy": 0.7995584011077881, + "num_tokens": 8775919.0, + "step": 229 + }, + { + "epoch": 0.029258364075817327, + "ewc_loss": 0.0016960952198132873, + "ewc_loss_diag": 1.0505318641662598e-06, + "ewc_loss_parallel": 6.432387635868508e-06, + "grad_norm": 4.758431911468506, + "learning_rate": 9.707503179313267e-08, + "loss": 0.6063, + "mean_token_accuracy": 0.8146357536315918, + "num_tokens": 8809336.0, + "step": 230 + }, + { + "epoch": 0.029385574354407838, + "ewc_loss": 0.0017061741091310978, + "ewc_loss_diag": 1.0654330253601074e-06, + "ewc_loss_parallel": 6.380589184118435e-06, + "grad_norm": 5.289504051208496, + "learning_rate": 9.749894022891055e-08, + "loss": 0.66, + "mean_token_accuracy": 0.8011305332183838, + "num_tokens": 8845952.0, + "step": 231 + }, + { + "epoch": 0.029512784632998345, + "ewc_loss": 0.0017039681551977992, + "ewc_loss_diag": 1.0654330253601074e-06, + "ewc_loss_parallel": 6.3585289353795815e-06, + "grad_norm": 3.720961570739746, + "learning_rate": 9.792284866468842e-08, + "loss": 0.6796, + "mean_token_accuracy": 0.7934096455574036, + "num_tokens": 8889801.0, + "step": 232 + }, + { + "epoch": 0.029639994911588856, + "ewc_loss": 0.001701907254755497, + "ewc_loss_diag": 1.0728836059570312e-06, + "ewc_loss_parallel": 6.261626367631834e-06, + "grad_norm": 4.414895534515381, + "learning_rate": 9.834675710046629e-08, + "loss": 0.6393, + "mean_token_accuracy": 0.8048785924911499, + "num_tokens": 8925429.0, + "step": 233 + }, + { + "epoch": 0.029767205190179367, + "ewc_loss": 0.0017724696081131697, + "ewc_loss_diag": 1.1548399925231934e-06, + "ewc_loss_parallel": 6.204310466273455e-06, + "grad_norm": 12.99094295501709, + "learning_rate": 9.877066553624416e-08, + "loss": 0.6723, + "mean_token_accuracy": 0.7969233393669128, + "num_tokens": 8963360.0, + "step": 234 + }, + { + "epoch": 0.029894415468769878, + "ewc_loss": 0.001708976342342794, + "ewc_loss_diag": 1.0728836059570312e-06, + "ewc_loss_parallel": 6.332316843327135e-06, + "grad_norm": 6.76467752456665, + "learning_rate": 9.919457397202204e-08, + "loss": 0.6834, + "mean_token_accuracy": 0.7975665926933289, + "num_tokens": 8998314.0, + "step": 235 + }, + { + "epoch": 0.030021625747360385, + "ewc_loss": 0.0017218261491507292, + "ewc_loss_diag": 1.080334186553955e-06, + "ewc_loss_parallel": 6.384520929714199e-06, + "grad_norm": 5.558391571044922, + "learning_rate": 9.961848240779991e-08, + "loss": 0.7526, + "mean_token_accuracy": 0.7782219648361206, + "num_tokens": 9034804.0, + "step": 236 + }, + { + "epoch": 0.030148836025950896, + "ewc_loss": 0.0017311747651547194, + "ewc_loss_diag": 1.0952353477478027e-06, + "ewc_loss_parallel": 6.325418780761538e-06, + "grad_norm": 4.3529767990112305, + "learning_rate": 1.0004239084357778e-07, + "loss": 0.675, + "mean_token_accuracy": 0.7988383769989014, + "num_tokens": 9070840.0, + "step": 237 + }, + { + "epoch": 0.030276046304541407, + "ewc_loss": 0.0017255591228604317, + "ewc_loss_diag": 1.1026859283447266e-06, + "ewc_loss_parallel": 6.1929690673423465e-06, + "grad_norm": 4.130670547485352, + "learning_rate": 1.0046629927935566e-07, + "loss": 0.6573, + "mean_token_accuracy": 0.803827166557312, + "num_tokens": 9109370.0, + "step": 238 + }, + { + "epoch": 0.030403256583131918, + "ewc_loss": 0.0017206985503435135, + "ewc_loss_diag": 1.1101365089416504e-06, + "ewc_loss_parallel": 6.068069978937274e-06, + "grad_norm": 4.086219310760498, + "learning_rate": 1.0089020771513353e-07, + "loss": 0.6014, + "mean_token_accuracy": 0.819934606552124, + "num_tokens": 9148191.0, + "step": 239 + }, + { + "epoch": 0.03053046686172243, + "ewc_loss": 0.0017244168557226658, + "ewc_loss_diag": 1.1324882507324219e-06, + "ewc_loss_parallel": 5.952664650976658e-06, + "grad_norm": 5.047841548919678, + "learning_rate": 1.013141161509114e-07, + "loss": 0.668, + "mean_token_accuracy": 0.8003172874450684, + "num_tokens": 9182923.0, + "step": 240 + }, + { + "epoch": 0.030657677140312936, + "ewc_loss": 0.0017119727563112974, + "ewc_loss_diag": 1.125037670135498e-06, + "ewc_loss_parallel": 5.904516910959501e-06, + "grad_norm": 4.921689987182617, + "learning_rate": 1.0173802458668927e-07, + "loss": 0.6668, + "mean_token_accuracy": 0.7974379062652588, + "num_tokens": 9215882.0, + "step": 241 + }, + { + "epoch": 0.030784887418903447, + "ewc_loss": 0.0017241997411474586, + "ewc_loss_diag": 1.1399388313293457e-06, + "ewc_loss_parallel": 5.874199814570602e-06, + "grad_norm": 5.4004902839660645, + "learning_rate": 1.0216193302246715e-07, + "loss": 0.699, + "mean_token_accuracy": 0.7862868309020996, + "num_tokens": 9249718.0, + "step": 242 + }, + { + "epoch": 0.030912097697493958, + "ewc_loss": 0.0017174315871670842, + "ewc_loss_diag": 1.1324882507324219e-06, + "ewc_loss_parallel": 5.88281181990169e-06, + "grad_norm": 3.911611557006836, + "learning_rate": 1.0258584145824502e-07, + "loss": 0.6219, + "mean_token_accuracy": 0.8130860328674316, + "num_tokens": 9291146.0, + "step": 243 + }, + { + "epoch": 0.03103930797608447, + "ewc_loss": 0.0017166156321763992, + "ewc_loss_diag": 1.1399388313293457e-06, + "ewc_loss_parallel": 5.79835841563181e-06, + "grad_norm": 5.630151748657227, + "learning_rate": 1.0300974989402289e-07, + "loss": 0.6136, + "mean_token_accuracy": 0.813210129737854, + "num_tokens": 9326403.0, + "step": 244 + }, + { + "epoch": 0.031166518254674976, + "ewc_loss": 0.001724401256069541, + "ewc_loss_diag": 1.1473894119262695e-06, + "ewc_loss_parallel": 5.799920018034754e-06, + "grad_norm": 4.891995429992676, + "learning_rate": 1.0343365832980076e-07, + "loss": 0.6669, + "mean_token_accuracy": 0.8006339073181152, + "num_tokens": 9368491.0, + "step": 245 + }, + { + "epoch": 0.03129372853326549, + "ewc_loss": 0.0017284685745835304, + "ewc_loss_diag": 1.1548399925231934e-06, + "ewc_loss_parallel": 5.764300567534519e-06, + "grad_norm": 3.6438913345336914, + "learning_rate": 1.0385756676557864e-07, + "loss": 0.6625, + "mean_token_accuracy": 0.8020209074020386, + "num_tokens": 9409041.0, + "step": 246 + }, + { + "epoch": 0.031420938811855995, + "ewc_loss": 0.0017180487047880888, + "ewc_loss_diag": 1.1548399925231934e-06, + "ewc_loss_parallel": 5.660100669047097e-06, + "grad_norm": 3.9821560382843018, + "learning_rate": 1.0428147520135651e-07, + "loss": 0.6351, + "mean_token_accuracy": 0.8092702031135559, + "num_tokens": 9448591.0, + "step": 247 + }, + { + "epoch": 0.03154814909044651, + "ewc_loss": 0.0017047564033418894, + "ewc_loss_diag": 1.1473894119262695e-06, + "ewc_loss_parallel": 5.603471436188556e-06, + "grad_norm": 5.068302154541016, + "learning_rate": 1.0470538363713437e-07, + "loss": 0.6661, + "mean_token_accuracy": 0.7983150482177734, + "num_tokens": 9486604.0, + "step": 248 + }, + { + "epoch": 0.031675359369037016, + "ewc_loss": 0.0017209285870194435, + "ewc_loss_diag": 1.1622905731201172e-06, + "ewc_loss_parallel": 5.6126064009731635e-06, + "grad_norm": 4.223698139190674, + "learning_rate": 1.0512929207291224e-07, + "loss": 0.6716, + "mean_token_accuracy": 0.7932463884353638, + "num_tokens": 9525285.0, + "step": 249 + }, + { + "epoch": 0.03180256964762753, + "ewc_loss": 0.0017243309412151575, + "ewc_loss_diag": 1.169741153717041e-06, + "ewc_loss_parallel": 5.570335360971512e-06, + "grad_norm": 6.063340187072754, + "learning_rate": 1.0555320050869011e-07, + "loss": 0.6519, + "mean_token_accuracy": 0.799207329750061, + "num_tokens": 9557898.0, + "step": 250 + }, + { + "epoch": 0.03192977992621804, + "ewc_loss": 0.0017298806924372911, + "ewc_loss_diag": 1.169741153717041e-06, + "ewc_loss_parallel": 5.625832727673696e-06, + "grad_norm": 5.5990309715271, + "learning_rate": 1.0597710894446799e-07, + "loss": 0.7114, + "mean_token_accuracy": 0.782896101474762, + "num_tokens": 9590438.0, + "step": 251 + }, + { + "epoch": 0.032056990204808546, + "ewc_loss": 0.001733538694679737, + "ewc_loss_diag": 1.169741153717041e-06, + "ewc_loss_parallel": 5.6624126045790035e-06, + "grad_norm": 4.438498497009277, + "learning_rate": 1.0640101738024586e-07, + "loss": 0.6227, + "mean_token_accuracy": 0.8119215965270996, + "num_tokens": 9631402.0, + "step": 252 + }, + { + "epoch": 0.03218420048339906, + "ewc_loss": 0.0017277486622333527, + "ewc_loss_diag": 1.169741153717041e-06, + "ewc_loss_parallel": 5.604513262369437e-06, + "grad_norm": 11.56846809387207, + "learning_rate": 1.0682492581602373e-07, + "loss": 0.6042, + "mean_token_accuracy": 0.8182066679000854, + "num_tokens": 9663702.0, + "step": 253 + }, + { + "epoch": 0.03231141076198957, + "ewc_loss": 0.0017393991583958268, + "ewc_loss_diag": 1.169741153717041e-06, + "ewc_loss_parallel": 5.721017714677146e-06, + "grad_norm": 4.575555324554443, + "learning_rate": 1.072488342518016e-07, + "loss": 0.5913, + "mean_token_accuracy": 0.8174645900726318, + "num_tokens": 9698969.0, + "step": 254 + }, + { + "epoch": 0.03243862104058008, + "ewc_loss": 0.0017395148752257228, + "ewc_loss_diag": 1.1771917343139648e-06, + "ewc_loss_parallel": 5.645881174132228e-06, + "grad_norm": 4.346861839294434, + "learning_rate": 1.0767274268757948e-07, + "loss": 0.6612, + "mean_token_accuracy": 0.7988119125366211, + "num_tokens": 9738075.0, + "step": 255 + }, + { + "epoch": 0.03256583131917059, + "ewc_loss": 0.0017369135748595, + "ewc_loss_diag": 1.1846423149108887e-06, + "ewc_loss_parallel": 5.543573479371844e-06, + "grad_norm": 5.522840976715088, + "learning_rate": 1.0809665112335735e-07, + "loss": 0.6662, + "mean_token_accuracy": 0.8010895252227783, + "num_tokens": 9782675.0, + "step": 256 + }, + { + "epoch": 0.0326930415977611, + "ewc_loss": 0.0017346427775919437, + "ewc_loss_diag": 1.1846423149108887e-06, + "ewc_loss_parallel": 5.520866125152679e-06, + "grad_norm": 4.508364677429199, + "learning_rate": 1.0852055955913522e-07, + "loss": 0.5662, + "mean_token_accuracy": 0.825950562953949, + "num_tokens": 9820585.0, + "step": 257 + }, + { + "epoch": 0.03282025187635161, + "ewc_loss": 0.0017291519325226545, + "ewc_loss_diag": 1.1846423149108887e-06, + "ewc_loss_parallel": 5.465958565764595e-06, + "grad_norm": 4.26370096206665, + "learning_rate": 1.089444679949131e-07, + "loss": 0.6907, + "mean_token_accuracy": 0.7930439114570618, + "num_tokens": 9860693.0, + "step": 258 + }, + { + "epoch": 0.03294746215494212, + "ewc_loss": 0.0017381994985044003, + "ewc_loss_diag": 1.1995434761047363e-06, + "ewc_loss_parallel": 5.4038455346017145e-06, + "grad_norm": 5.67625617980957, + "learning_rate": 1.0936837643069097e-07, + "loss": 0.6245, + "mean_token_accuracy": 0.8112151622772217, + "num_tokens": 9902629.0, + "step": 259 + }, + { + "epoch": 0.03307467243353263, + "ewc_loss": 0.001732612494379282, + "ewc_loss_diag": 1.1920928955078125e-06, + "ewc_loss_parallel": 5.424269602372078e-06, + "grad_norm": 4.528299331665039, + "learning_rate": 1.0979228486646884e-07, + "loss": 0.5962, + "mean_token_accuracy": 0.8152624368667603, + "num_tokens": 9937304.0, + "step": 260 + }, + { + "epoch": 0.03320188271212314, + "ewc_loss": 0.0017209395300596952, + "ewc_loss_diag": 1.1846423149108887e-06, + "ewc_loss_parallel": 5.383833922678605e-06, + "grad_norm": 4.420058727264404, + "learning_rate": 1.1021619330224671e-07, + "loss": 0.7289, + "mean_token_accuracy": 0.7807906270027161, + "num_tokens": 9976911.0, + "step": 261 + }, + { + "epoch": 0.03332909299071365, + "ewc_loss": 0.0017176913097500801, + "ewc_loss_diag": 1.1846423149108887e-06, + "ewc_loss_parallel": 5.351352228899486e-06, + "grad_norm": 4.355537414550781, + "learning_rate": 1.1064010173802458e-07, + "loss": 0.664, + "mean_token_accuracy": 0.8006622791290283, + "num_tokens": 10015740.0, + "step": 262 + }, + { + "epoch": 0.03345630326930416, + "ewc_loss": 0.0017387145198881626, + "ewc_loss_diag": 1.2069940567016602e-06, + "ewc_loss_parallel": 5.332702130544931e-06, + "grad_norm": 3.6942760944366455, + "learning_rate": 1.1106401017380246e-07, + "loss": 0.601, + "mean_token_accuracy": 0.8180762529373169, + "num_tokens": 10053054.0, + "step": 263 + }, + { + "epoch": 0.03358351354789467, + "ewc_loss": 0.0017417068593204021, + "ewc_loss_diag": 1.214444637298584e-06, + "ewc_loss_parallel": 5.286331543175038e-06, + "grad_norm": 6.050146579742432, + "learning_rate": 1.1148791860958033e-07, + "loss": 0.6962, + "mean_token_accuracy": 0.7864009141921997, + "num_tokens": 10082088.0, + "step": 264 + }, + { + "epoch": 0.03371072382648518, + "ewc_loss": 0.0017594490200281143, + "ewc_loss_diag": 1.2218952178955078e-06, + "ewc_loss_parallel": 5.387459168559872e-06, + "grad_norm": 4.789798736572266, + "learning_rate": 1.119118270453582e-07, + "loss": 0.6703, + "mean_token_accuracy": 0.7961806058883667, + "num_tokens": 10121122.0, + "step": 265 + }, + { + "epoch": 0.03383793410507569, + "ewc_loss": 0.0017773972358554602, + "ewc_loss_diag": 1.2367963790893555e-06, + "ewc_loss_parallel": 5.414352926891297e-06, + "grad_norm": 6.335567951202393, + "learning_rate": 1.1233573548113607e-07, + "loss": 0.6652, + "mean_token_accuracy": 0.7998442649841309, + "num_tokens": 10161017.0, + "step": 266 + }, + { + "epoch": 0.0339651443836662, + "ewc_loss": 0.001791647868230939, + "ewc_loss_diag": 1.2442469596862793e-06, + "ewc_loss_parallel": 5.480565505422419e-06, + "grad_norm": 4.455104351043701, + "learning_rate": 1.1275964391691393e-07, + "loss": 0.6319, + "mean_token_accuracy": 0.8100301623344421, + "num_tokens": 10204913.0, + "step": 267 + }, + { + "epoch": 0.03409235466225671, + "ewc_loss": 0.0018117072759196162, + "ewc_loss_diag": 1.2665987014770508e-06, + "ewc_loss_parallel": 5.4522779464605264e-06, + "grad_norm": 4.132923603057861, + "learning_rate": 1.131835523526918e-07, + "loss": 0.6795, + "mean_token_accuracy": 0.7931824922561646, + "num_tokens": 10245154.0, + "step": 268 + }, + { + "epoch": 0.03421956494084722, + "ewc_loss": 0.001805958803743124, + "ewc_loss_diag": 1.2665987014770508e-06, + "ewc_loss_parallel": 5.394792424340267e-06, + "grad_norm": 4.01412296295166, + "learning_rate": 1.1360746078846968e-07, + "loss": 0.6861, + "mean_token_accuracy": 0.792785108089447, + "num_tokens": 10283006.0, + "step": 269 + }, + { + "epoch": 0.03434677521943773, + "ewc_loss": 0.001803850056603551, + "ewc_loss_diag": 1.2665987014770508e-06, + "ewc_loss_parallel": 5.373706244427012e-06, + "grad_norm": 6.190240859985352, + "learning_rate": 1.1403136922424755e-07, + "loss": 0.6461, + "mean_token_accuracy": 0.8025097846984863, + "num_tokens": 10316689.0, + "step": 270 + }, + { + "epoch": 0.03447398549802824, + "ewc_loss": 0.0018204392399638891, + "ewc_loss_diag": 1.2740492820739746e-06, + "ewc_loss_parallel": 5.463302841235418e-06, + "grad_norm": 4.586262226104736, + "learning_rate": 1.1445527766002542e-07, + "loss": 0.6403, + "mean_token_accuracy": 0.8031662702560425, + "num_tokens": 10358145.0, + "step": 271 + }, + { + "epoch": 0.03460119577661875, + "ewc_loss": 0.0018189004622399807, + "ewc_loss_diag": 1.2740492820739746e-06, + "ewc_loss_parallel": 5.447916464618174e-06, + "grad_norm": 5.532628059387207, + "learning_rate": 1.148791860958033e-07, + "loss": 0.7412, + "mean_token_accuracy": 0.7771586775779724, + "num_tokens": 10393424.0, + "step": 272 + }, + { + "epoch": 0.034728406055209264, + "ewc_loss": 0.0018458670238032937, + "ewc_loss_diag": 1.296401023864746e-06, + "ewc_loss_parallel": 5.488699571287725e-06, + "grad_norm": 4.6242499351501465, + "learning_rate": 1.1530309453158117e-07, + "loss": 0.6557, + "mean_token_accuracy": 0.80179762840271, + "num_tokens": 10433560.0, + "step": 273 + }, + { + "epoch": 0.03485561633379977, + "ewc_loss": 0.001868237042799592, + "ewc_loss_diag": 1.3187527656555176e-06, + "ewc_loss_parallel": 5.483518179971725e-06, + "grad_norm": 3.9469239711761475, + "learning_rate": 1.1572700296735904e-07, + "loss": 0.6518, + "mean_token_accuracy": 0.8016654849052429, + "num_tokens": 10472024.0, + "step": 274 + }, + { + "epoch": 0.03498282661239028, + "ewc_loss": 0.0018717984203249216, + "ewc_loss_diag": 1.3262033462524414e-06, + "ewc_loss_parallel": 5.4428369367087726e-06, + "grad_norm": 4.47611141204834, + "learning_rate": 1.1615091140313691e-07, + "loss": 0.6904, + "mean_token_accuracy": 0.7915611267089844, + "num_tokens": 10510874.0, + "step": 275 + }, + { + "epoch": 0.03511003689098079, + "ewc_loss": 0.001888688188046217, + "ewc_loss_diag": 1.341104507446289e-06, + "ewc_loss_parallel": 5.459147814690368e-06, + "grad_norm": 4.289575576782227, + "learning_rate": 1.1657481983891479e-07, + "loss": 0.6281, + "mean_token_accuracy": 0.809603214263916, + "num_tokens": 10552411.0, + "step": 276 + }, + { + "epoch": 0.0352372471695713, + "ewc_loss": 0.001890915329568088, + "ewc_loss_diag": 1.341104507446289e-06, + "ewc_loss_parallel": 5.481419066200033e-06, + "grad_norm": 6.827956676483154, + "learning_rate": 1.1699872827469266e-07, + "loss": 0.6448, + "mean_token_accuracy": 0.8059836030006409, + "num_tokens": 10591537.0, + "step": 277 + }, + { + "epoch": 0.03536445744816181, + "ewc_loss": 0.0019141687080264091, + "ewc_loss_diag": 1.3485550880432129e-06, + "ewc_loss_parallel": 5.63765888728085e-06, + "grad_norm": 4.814728736877441, + "learning_rate": 1.1742263671047053e-07, + "loss": 0.6528, + "mean_token_accuracy": 0.8029961585998535, + "num_tokens": 10632228.0, + "step": 278 + }, + { + "epoch": 0.03549166772675232, + "ewc_loss": 0.0019170362502336502, + "ewc_loss_diag": 1.3485550880432129e-06, + "ewc_loss_parallel": 5.666334345733048e-06, + "grad_norm": 6.207443714141846, + "learning_rate": 1.178465451462484e-07, + "loss": 0.7081, + "mean_token_accuracy": 0.7851735353469849, + "num_tokens": 10672829.0, + "step": 279 + }, + { + "epoch": 0.03561887800534283, + "ewc_loss": 0.001938376808539033, + "ewc_loss_diag": 1.3634562492370605e-06, + "ewc_loss_parallel": 5.727151801693253e-06, + "grad_norm": 6.97683572769165, + "learning_rate": 1.1827045358202628e-07, + "loss": 0.682, + "mean_token_accuracy": 0.7995153665542603, + "num_tokens": 10706146.0, + "step": 280 + }, + { + "epoch": 0.035746088283933344, + "ewc_loss": 0.0019468360114842653, + "ewc_loss_diag": 1.3634562492370605e-06, + "ewc_loss_parallel": 5.811743903905153e-06, + "grad_norm": 4.986219882965088, + "learning_rate": 1.1869436201780415e-07, + "loss": 0.5994, + "mean_token_accuracy": 0.8194122910499573, + "num_tokens": 10744112.0, + "step": 281 + }, + { + "epoch": 0.03587329856252385, + "ewc_loss": 0.0019426336511969566, + "ewc_loss_diag": 1.3634562492370605e-06, + "ewc_loss_parallel": 5.769720701209735e-06, + "grad_norm": 5.499690532684326, + "learning_rate": 1.1911827045358202e-07, + "loss": 0.6298, + "mean_token_accuracy": 0.813512921333313, + "num_tokens": 10783726.0, + "step": 282 + }, + { + "epoch": 0.03600050884111436, + "ewc_loss": 0.0019500656053423882, + "ewc_loss_diag": 1.3709068298339844e-06, + "ewc_loss_parallel": 5.767745278717484e-06, + "grad_norm": 4.759444236755371, + "learning_rate": 1.195421788893599e-07, + "loss": 0.6939, + "mean_token_accuracy": 0.7930393815040588, + "num_tokens": 10824476.0, + "step": 283 + }, + { + "epoch": 0.036127719119704874, + "ewc_loss": 0.0019561592489480972, + "ewc_loss_diag": 1.3783574104309082e-06, + "ewc_loss_parallel": 5.752389824920101e-06, + "grad_norm": 5.136285305023193, + "learning_rate": 1.1996608732513778e-07, + "loss": 0.6534, + "mean_token_accuracy": 0.8025033473968506, + "num_tokens": 10859997.0, + "step": 284 + }, + { + "epoch": 0.03625492939829538, + "ewc_loss": 0.0019634482450783253, + "ewc_loss_diag": 1.385807991027832e-06, + "ewc_loss_parallel": 5.748984222009312e-06, + "grad_norm": 4.933758735656738, + "learning_rate": 1.2038999576091563e-07, + "loss": 0.6241, + "mean_token_accuracy": 0.8112109899520874, + "num_tokens": 10898800.0, + "step": 285 + }, + { + "epoch": 0.036382139676885895, + "ewc_loss": 0.0019703577272593975, + "ewc_loss_diag": 1.3932585716247559e-06, + "ewc_loss_parallel": 5.7417860261921305e-06, + "grad_norm": 5.112797737121582, + "learning_rate": 1.208139041966935e-07, + "loss": 0.5902, + "mean_token_accuracy": 0.8215407133102417, + "num_tokens": 10937462.0, + "step": 286 + }, + { + "epoch": 0.0365093499554764, + "ewc_loss": 0.001988081494346261, + "ewc_loss_diag": 1.4081597328186035e-06, + "ewc_loss_parallel": 5.766435606346931e-06, + "grad_norm": 4.77252197265625, + "learning_rate": 1.2123781263247137e-07, + "loss": 0.651, + "mean_token_accuracy": 0.7992751002311707, + "num_tokens": 10970739.0, + "step": 287 + }, + { + "epoch": 0.03663656023406691, + "ewc_loss": 0.0020053619518876076, + "ewc_loss_diag": 1.4230608940124512e-06, + "ewc_loss_parallel": 5.786650945083238e-06, + "grad_norm": 4.5213446617126465, + "learning_rate": 1.2166172106824924e-07, + "loss": 0.7033, + "mean_token_accuracy": 0.7941361665725708, + "num_tokens": 11003000.0, + "step": 288 + }, + { + "epoch": 0.036763770512657425, + "ewc_loss": 0.0020138092804700136, + "ewc_loss_diag": 1.4379620552062988e-06, + "ewc_loss_parallel": 5.794830940430984e-06, + "grad_norm": 5.1712141036987305, + "learning_rate": 1.2208562950402712e-07, + "loss": 0.6083, + "mean_token_accuracy": 0.811730146408081, + "num_tokens": 11039665.0, + "step": 289 + }, + { + "epoch": 0.03689098079124793, + "ewc_loss": 0.0020194859243929386, + "ewc_loss_diag": 1.430511474609375e-06, + "ewc_loss_parallel": 5.851597506989492e-06, + "grad_norm": 4.578949928283691, + "learning_rate": 1.22509537939805e-07, + "loss": 0.636, + "mean_token_accuracy": 0.801588237285614, + "num_tokens": 11078368.0, + "step": 290 + }, + { + "epoch": 0.03701819106983844, + "ewc_loss": 0.0020373763982206583, + "ewc_loss_diag": 1.4528632164001465e-06, + "ewc_loss_parallel": 5.8779151004273444e-06, + "grad_norm": 3.642425537109375, + "learning_rate": 1.2293344637558286e-07, + "loss": 0.6617, + "mean_token_accuracy": 0.8009046912193298, + "num_tokens": 11122654.0, + "step": 291 + }, + { + "epoch": 0.037145401348428954, + "ewc_loss": 0.002033495344221592, + "ewc_loss_diag": 1.4528632164001465e-06, + "ewc_loss_parallel": 5.839102868776536e-06, + "grad_norm": 4.344725131988525, + "learning_rate": 1.2335735481136073e-07, + "loss": 0.6159, + "mean_token_accuracy": 0.8068858981132507, + "num_tokens": 11162622.0, + "step": 292 + }, + { + "epoch": 0.03727261162701946, + "ewc_loss": 0.002054189331829548, + "ewc_loss_diag": 1.4677643775939941e-06, + "ewc_loss_parallel": 5.893457000638591e-06, + "grad_norm": 4.489528179168701, + "learning_rate": 1.237812632471386e-07, + "loss": 0.6274, + "mean_token_accuracy": 0.8100607395172119, + "num_tokens": 11202564.0, + "step": 293 + }, + { + "epoch": 0.037399821905609976, + "ewc_loss": 0.0020606014877557755, + "ewc_loss_diag": 1.4677643775939941e-06, + "ewc_loss_parallel": 5.9575768318609335e-06, + "grad_norm": 5.7087531089782715, + "learning_rate": 1.2420517168291648e-07, + "loss": 0.6459, + "mean_token_accuracy": 0.8009412288665771, + "num_tokens": 11243422.0, + "step": 294 + }, + { + "epoch": 0.03752703218420048, + "ewc_loss": 0.002083437517285347, + "ewc_loss_diag": 1.475214958190918e-06, + "ewc_loss_parallel": 6.109644345997367e-06, + "grad_norm": 4.2888264656066895, + "learning_rate": 1.2462908011869435e-07, + "loss": 0.6275, + "mean_token_accuracy": 0.8058656454086304, + "num_tokens": 11280867.0, + "step": 295 + }, + { + "epoch": 0.03765424246279099, + "ewc_loss": 0.0020916564390063286, + "ewc_loss_diag": 1.4826655387878418e-06, + "ewc_loss_parallel": 6.115539690654259e-06, + "grad_norm": 5.495177268981934, + "learning_rate": 1.2505298855447223e-07, + "loss": 0.5768, + "mean_token_accuracy": 0.8224545121192932, + "num_tokens": 11318454.0, + "step": 296 + }, + { + "epoch": 0.037781452741381505, + "ewc_loss": 0.0020996672101318836, + "ewc_loss_diag": 1.4826655387878418e-06, + "ewc_loss_parallel": 6.195647529239068e-06, + "grad_norm": 5.883276462554932, + "learning_rate": 1.254768969902501e-07, + "loss": 0.717, + "mean_token_accuracy": 0.7847751379013062, + "num_tokens": 11354721.0, + "step": 297 + }, + { + "epoch": 0.03790866301997201, + "ewc_loss": 0.002116700168699026, + "ewc_loss_diag": 1.4901161193847656e-06, + "ewc_loss_parallel": 6.2896815506974235e-06, + "grad_norm": 5.654463768005371, + "learning_rate": 1.2590080542602797e-07, + "loss": 0.6743, + "mean_token_accuracy": 0.7963007688522339, + "num_tokens": 11385938.0, + "step": 298 + }, + { + "epoch": 0.03803587329856253, + "ewc_loss": 0.0021310565061867237, + "ewc_loss_diag": 1.4975666999816895e-06, + "ewc_loss_parallel": 6.3569509620720055e-06, + "grad_norm": 3.435554027557373, + "learning_rate": 1.2632471386180584e-07, + "loss": 0.6534, + "mean_token_accuracy": 0.8006912469863892, + "num_tokens": 11424911.0, + "step": 299 + }, + { + "epoch": 0.038163083577153034, + "ewc_loss": 0.0021260608918964863, + "ewc_loss_diag": 1.5050172805786133e-06, + "ewc_loss_parallel": 6.230702638276853e-06, + "grad_norm": 4.006176471710205, + "learning_rate": 1.2674862229758372e-07, + "loss": 0.6222, + "mean_token_accuracy": 0.810975193977356, + "num_tokens": 11460580.0, + "step": 300 + }, + { + "epoch": 0.03829029385574354, + "ewc_loss": 0.002125623170286417, + "ewc_loss_diag": 1.5050172805786133e-06, + "ewc_loss_parallel": 6.226324785529869e-06, + "grad_norm": 4.7443461418151855, + "learning_rate": 1.271725307333616e-07, + "loss": 0.6128, + "mean_token_accuracy": 0.8144488334655762, + "num_tokens": 11501031.0, + "step": 301 + }, + { + "epoch": 0.038417504134334056, + "ewc_loss": 0.0021327713038772345, + "ewc_loss_diag": 1.5050172805786133e-06, + "ewc_loss_parallel": 6.297805157373659e-06, + "grad_norm": 4.651463508605957, + "learning_rate": 1.2759643916913946e-07, + "loss": 0.6224, + "mean_token_accuracy": 0.8103927969932556, + "num_tokens": 11544053.0, + "step": 302 + }, + { + "epoch": 0.03854471441292456, + "ewc_loss": 0.0021475336980074644, + "ewc_loss_diag": 1.5124678611755371e-06, + "ewc_loss_parallel": 6.369135917339008e-06, + "grad_norm": 6.5673112869262695, + "learning_rate": 1.2802034760491733e-07, + "loss": 0.6386, + "mean_token_accuracy": 0.8047475218772888, + "num_tokens": 11576944.0, + "step": 303 + }, + { + "epoch": 0.03867192469151508, + "ewc_loss": 0.0021806834265589714, + "ewc_loss_diag": 1.5273690223693848e-06, + "ewc_loss_parallel": 6.548045348608866e-06, + "grad_norm": 3.6832644939422607, + "learning_rate": 1.284442560406952e-07, + "loss": 0.6883, + "mean_token_accuracy": 0.7902534008026123, + "num_tokens": 11619735.0, + "step": 304 + }, + { + "epoch": 0.038799134970105585, + "ewc_loss": 0.0021714535541832447, + "ewc_loss_diag": 1.5273690223693848e-06, + "ewc_loss_parallel": 6.455746643041493e-06, + "grad_norm": 5.421593189239502, + "learning_rate": 1.2886816447647308e-07, + "loss": 0.725, + "mean_token_accuracy": 0.7779992818832397, + "num_tokens": 11655025.0, + "step": 305 + }, + { + "epoch": 0.03892634524869609, + "ewc_loss": 0.0021964621264487505, + "ewc_loss_diag": 1.5422701835632324e-06, + "ewc_loss_parallel": 6.5532440203242e-06, + "grad_norm": 5.428092956542969, + "learning_rate": 1.2929207291225095e-07, + "loss": 0.6228, + "mean_token_accuracy": 0.8111444711685181, + "num_tokens": 11691970.0, + "step": 306 + }, + { + "epoch": 0.03905355552728661, + "ewc_loss": 0.0022214893251657486, + "ewc_loss_diag": 1.55717134475708e-06, + "ewc_loss_parallel": 6.650929663010174e-06, + "grad_norm": 4.760166168212891, + "learning_rate": 1.2971598134802882e-07, + "loss": 0.6385, + "mean_token_accuracy": 0.8051825761795044, + "num_tokens": 11729561.0, + "step": 307 + }, + { + "epoch": 0.039180765805877114, + "ewc_loss": 0.002238180721178651, + "ewc_loss_diag": 1.5720725059509277e-06, + "ewc_loss_parallel": 6.665253749815747e-06, + "grad_norm": 4.825104713439941, + "learning_rate": 1.301398897838067e-07, + "loss": 0.6567, + "mean_token_accuracy": 0.7978584170341492, + "num_tokens": 11765707.0, + "step": 308 + }, + { + "epoch": 0.03930797608446762, + "ewc_loss": 0.002248915610834956, + "ewc_loss_diag": 1.5795230865478516e-06, + "ewc_loss_parallel": 6.6963098106498364e-06, + "grad_norm": 4.3940887451171875, + "learning_rate": 1.3056379821958457e-07, + "loss": 0.6657, + "mean_token_accuracy": 0.8025627732276917, + "num_tokens": 11806223.0, + "step": 309 + }, + { + "epoch": 0.039435186363058136, + "ewc_loss": 0.002261769026517868, + "ewc_loss_diag": 1.5944242477416992e-06, + "ewc_loss_parallel": 6.672255949524697e-06, + "grad_norm": 5.212183952331543, + "learning_rate": 1.3098770665536244e-07, + "loss": 0.6213, + "mean_token_accuracy": 0.8118728399276733, + "num_tokens": 11845477.0, + "step": 310 + }, + { + "epoch": 0.039562396641648644, + "ewc_loss": 0.0022683804854750633, + "ewc_loss_diag": 1.5944242477416992e-06, + "ewc_loss_parallel": 6.73837166687008e-06, + "grad_norm": 5.897846221923828, + "learning_rate": 1.3141161509114031e-07, + "loss": 0.6057, + "mean_token_accuracy": 0.8095940351486206, + "num_tokens": 11877973.0, + "step": 311 + }, + { + "epoch": 0.03968960692023916, + "ewc_loss": 0.0022947622928768396, + "ewc_loss_diag": 1.6093254089355469e-06, + "ewc_loss_parallel": 6.849600595160155e-06, + "grad_norm": 4.748443126678467, + "learning_rate": 1.3183552352691819e-07, + "loss": 0.6116, + "mean_token_accuracy": 0.8131722807884216, + "num_tokens": 11912093.0, + "step": 312 + }, + { + "epoch": 0.039816817198829665, + "ewc_loss": 0.002296194899827242, + "ewc_loss_diag": 1.6093254089355469e-06, + "ewc_loss_parallel": 6.863926500955131e-06, + "grad_norm": 4.1172776222229, + "learning_rate": 1.3225943196269603e-07, + "loss": 0.6374, + "mean_token_accuracy": 0.8090376853942871, + "num_tokens": 11952541.0, + "step": 313 + }, + { + "epoch": 0.03994402747742017, + "ewc_loss": 0.0023109230678528547, + "ewc_loss_diag": 1.6316771507263184e-06, + "ewc_loss_parallel": 6.782327091059415e-06, + "grad_norm": 5.029558181762695, + "learning_rate": 1.3268334039847393e-07, + "loss": 0.6146, + "mean_token_accuracy": 0.810383141040802, + "num_tokens": 11990414.0, + "step": 314 + }, + { + "epoch": 0.04007123775601069, + "ewc_loss": 0.0023313923738896847, + "ewc_loss_diag": 1.646578311920166e-06, + "ewc_loss_parallel": 6.834432497271337e-06, + "grad_norm": 5.202574253082275, + "learning_rate": 1.3310724883425178e-07, + "loss": 0.6821, + "mean_token_accuracy": 0.7900362014770508, + "num_tokens": 12025925.0, + "step": 315 + }, + { + "epoch": 0.040198448034601195, + "ewc_loss": 0.0024378462694585323, + "ewc_loss_diag": 1.7508864402770996e-06, + "ewc_loss_parallel": 6.9071497819095384e-06, + "grad_norm": 12.336126327514648, + "learning_rate": 1.3353115727002968e-07, + "loss": 0.6496, + "mean_token_accuracy": 0.8017483949661255, + "num_tokens": 12068879.0, + "step": 316 + }, + { + "epoch": 0.04032565831319171, + "ewc_loss": 0.0024045712780207396, + "ewc_loss_diag": 1.6763806343078613e-06, + "ewc_loss_parallel": 7.261045539053157e-06, + "grad_norm": 4.389480113983154, + "learning_rate": 1.3395506570580752e-07, + "loss": 0.6162, + "mean_token_accuracy": 0.8106414079666138, + "num_tokens": 12111931.0, + "step": 317 + }, + { + "epoch": 0.040452868591782216, + "ewc_loss": 0.0024028061889111996, + "ewc_loss_diag": 1.6838312149047852e-06, + "ewc_loss_parallel": 7.167100193328224e-06, + "grad_norm": 5.45817232131958, + "learning_rate": 1.3437897414158542e-07, + "loss": 0.6836, + "mean_token_accuracy": 0.7895601987838745, + "num_tokens": 12140431.0, + "step": 318 + }, + { + "epoch": 0.040580078870372724, + "ewc_loss": 0.0024085131008177996, + "ewc_loss_diag": 1.691281795501709e-06, + "ewc_loss_parallel": 7.147874839574797e-06, + "grad_norm": 4.471919059753418, + "learning_rate": 1.3480288257736327e-07, + "loss": 0.6136, + "mean_token_accuracy": 0.8104305267333984, + "num_tokens": 12173932.0, + "step": 319 + }, + { + "epoch": 0.04070728914896324, + "ewc_loss": 0.0024215467274188995, + "ewc_loss_diag": 1.7136335372924805e-06, + "ewc_loss_parallel": 7.049329724395648e-06, + "grad_norm": 4.352822780609131, + "learning_rate": 1.3522679101314117e-07, + "loss": 0.5908, + "mean_token_accuracy": 0.8165879249572754, + "num_tokens": 12210040.0, + "step": 320 + }, + { + "epoch": 0.040834499427553746, + "ewc_loss": 0.0024140982422977686, + "ewc_loss_diag": 1.7136335372924805e-06, + "ewc_loss_parallel": 6.974844382057199e-06, + "grad_norm": 4.733625888824463, + "learning_rate": 1.35650699448919e-07, + "loss": 0.6245, + "mean_token_accuracy": 0.8094973564147949, + "num_tokens": 12248556.0, + "step": 321 + }, + { + "epoch": 0.04096170970614425, + "ewc_loss": 0.002415992086753249, + "ewc_loss_diag": 1.7136335372924805e-06, + "ewc_loss_parallel": 6.993782790232217e-06, + "grad_norm": 4.475594997406006, + "learning_rate": 1.360746078846969e-07, + "loss": 0.6131, + "mean_token_accuracy": 0.8099750876426697, + "num_tokens": 12285664.0, + "step": 322 + }, + { + "epoch": 0.04108891998473477, + "ewc_loss": 0.0024241514038294554, + "ewc_loss_diag": 1.7210841178894043e-06, + "ewc_loss_parallel": 6.999082870606799e-06, + "grad_norm": 4.261312484741211, + "learning_rate": 1.3649851632047476e-07, + "loss": 0.6666, + "mean_token_accuracy": 0.7962648868560791, + "num_tokens": 12326964.0, + "step": 323 + }, + { + "epoch": 0.041216130263325275, + "ewc_loss": 0.002432289533317089, + "ewc_loss_diag": 1.7285346984863281e-06, + "ewc_loss_parallel": 7.004168764979113e-06, + "grad_norm": 4.23406982421875, + "learning_rate": 1.3692242475625266e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8249101042747498, + "num_tokens": 12366541.0, + "step": 324 + }, + { + "epoch": 0.04134334054191579, + "ewc_loss": 0.0024492735974490643, + "ewc_loss_diag": 1.7508864402770996e-06, + "ewc_loss_parallel": 7.0214214247243945e-06, + "grad_norm": 5.190798282623291, + "learning_rate": 1.373463331920305e-07, + "loss": 0.6377, + "mean_token_accuracy": 0.8101770877838135, + "num_tokens": 12405664.0, + "step": 325 + }, + { + "epoch": 0.0414705508205063, + "ewc_loss": 0.00246292632073164, + "ewc_loss_diag": 1.7508864402770996e-06, + "ewc_loss_parallel": 7.157950221881038e-06, + "grad_norm": 5.55540657043457, + "learning_rate": 1.377702416278084e-07, + "loss": 0.7012, + "mean_token_accuracy": 0.7874342203140259, + "num_tokens": 12445039.0, + "step": 326 + }, + { + "epoch": 0.041597761099096804, + "ewc_loss": 0.0024752123281359673, + "ewc_loss_diag": 1.7508864402770996e-06, + "ewc_loss_parallel": 7.280810677912086e-06, + "grad_norm": 5.424428939819336, + "learning_rate": 1.3819415006358625e-07, + "loss": 0.5716, + "mean_token_accuracy": 0.8243305683135986, + "num_tokens": 12485481.0, + "step": 327 + }, + { + "epoch": 0.04172497137768732, + "ewc_loss": 0.0024821904953569174, + "ewc_loss_diag": 1.7508864402770996e-06, + "ewc_loss_parallel": 7.350592113652965e-06, + "grad_norm": 4.695948600769043, + "learning_rate": 1.3861805849936415e-07, + "loss": 0.5961, + "mean_token_accuracy": 0.8198018074035645, + "num_tokens": 12530272.0, + "step": 328 + }, + { + "epoch": 0.041852181656277826, + "ewc_loss": 0.002477354370057583, + "ewc_loss_diag": 1.7508864402770996e-06, + "ewc_loss_parallel": 7.3022301876335405e-06, + "grad_norm": 5.030630588531494, + "learning_rate": 1.39041966935142e-07, + "loss": 0.6414, + "mean_token_accuracy": 0.8033836483955383, + "num_tokens": 12565854.0, + "step": 329 + }, + { + "epoch": 0.04197939193486834, + "ewc_loss": 0.002501303795725107, + "ewc_loss_diag": 1.773238182067871e-06, + "ewc_loss_parallel": 7.312841717066476e-06, + "grad_norm": 3.8117899894714355, + "learning_rate": 1.394658753709199e-07, + "loss": 0.6279, + "mean_token_accuracy": 0.8090774416923523, + "num_tokens": 12607313.0, + "step": 330 + }, + { + "epoch": 0.04210660221345885, + "ewc_loss": 0.0025095436722040176, + "ewc_loss_diag": 1.7955899238586426e-06, + "ewc_loss_parallel": 7.166358955146279e-06, + "grad_norm": 5.297486782073975, + "learning_rate": 1.3988978380669774e-07, + "loss": 0.666, + "mean_token_accuracy": 0.7971050143241882, + "num_tokens": 12642057.0, + "step": 331 + }, + { + "epoch": 0.042233812492049355, + "ewc_loss": 0.002518797293305397, + "ewc_loss_diag": 1.7955899238586426e-06, + "ewc_loss_parallel": 7.2588968578202184e-06, + "grad_norm": 4.342551231384277, + "learning_rate": 1.403136922424756e-07, + "loss": 0.6566, + "mean_token_accuracy": 0.7991811633110046, + "num_tokens": 12684118.0, + "step": 332 + }, + { + "epoch": 0.04236102277063987, + "ewc_loss": 0.0025183800607919693, + "ewc_loss_diag": 1.7955899238586426e-06, + "ewc_loss_parallel": 7.254723641381133e-06, + "grad_norm": 4.164731979370117, + "learning_rate": 1.4073760067825348e-07, + "loss": 0.6213, + "mean_token_accuracy": 0.8092626333236694, + "num_tokens": 12721945.0, + "step": 333 + }, + { + "epoch": 0.04248823304923038, + "ewc_loss": 0.0025413753464818, + "ewc_loss_diag": 1.817941665649414e-06, + "ewc_loss_parallel": 7.255795935634524e-06, + "grad_norm": 4.009845733642578, + "learning_rate": 1.4116150911403136e-07, + "loss": 0.6355, + "mean_token_accuracy": 0.8049330711364746, + "num_tokens": 12764175.0, + "step": 334 + }, + { + "epoch": 0.04261544332782089, + "ewc_loss": 0.002547045238316059, + "ewc_loss_diag": 1.8253922462463379e-06, + "ewc_loss_parallel": 7.2361985985480715e-06, + "grad_norm": 4.42566442489624, + "learning_rate": 1.4158541754980923e-07, + "loss": 0.5861, + "mean_token_accuracy": 0.8219339847564697, + "num_tokens": 12801600.0, + "step": 335 + }, + { + "epoch": 0.0427426536064114, + "ewc_loss": 0.0025523025542497635, + "ewc_loss_diag": 1.8253922462463379e-06, + "ewc_loss_parallel": 7.288771485036705e-06, + "grad_norm": 5.062089443206787, + "learning_rate": 1.420093259855871e-07, + "loss": 0.6183, + "mean_token_accuracy": 0.8170256614685059, + "num_tokens": 12843584.0, + "step": 336 + }, + { + "epoch": 0.042869863885001906, + "ewc_loss": 0.0025655576027929783, + "ewc_loss_diag": 1.8253922462463379e-06, + "ewc_loss_parallel": 7.421322152367793e-06, + "grad_norm": 4.501428604125977, + "learning_rate": 1.4243323442136497e-07, + "loss": 0.6313, + "mean_token_accuracy": 0.8063843250274658, + "num_tokens": 12878105.0, + "step": 337 + }, + { + "epoch": 0.04299707416359242, + "ewc_loss": 0.002591365249827504, + "ewc_loss_diag": 1.8477439880371094e-06, + "ewc_loss_parallel": 7.450518296536757e-06, + "grad_norm": 4.167062759399414, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.6389, + "mean_token_accuracy": 0.8058080673217773, + "num_tokens": 12911712.0, + "step": 338 + }, + { + "epoch": 0.04312428444218293, + "ewc_loss": 0.0025886204093694687, + "ewc_loss_diag": 1.8477439880371094e-06, + "ewc_loss_parallel": 7.423068836942548e-06, + "grad_norm": 4.35781717300415, + "learning_rate": 1.4328105129292072e-07, + "loss": 0.6209, + "mean_token_accuracy": 0.8120148181915283, + "num_tokens": 12951303.0, + "step": 339 + }, + { + "epoch": 0.043251494720773435, + "ewc_loss": 0.002597397193312645, + "ewc_loss_diag": 1.8551945686340332e-06, + "ewc_loss_parallel": 7.434542112605413e-06, + "grad_norm": 6.189615249633789, + "learning_rate": 1.437049597286986e-07, + "loss": 0.608, + "mean_token_accuracy": 0.8140289783477783, + "num_tokens": 12990544.0, + "step": 340 + }, + { + "epoch": 0.04337870499936395, + "ewc_loss": 0.0026182341389358044, + "ewc_loss_diag": 1.8551945686340332e-06, + "ewc_loss_parallel": 7.642912351002451e-06, + "grad_norm": 7.24943208694458, + "learning_rate": 1.4412886816447646e-07, + "loss": 0.6664, + "mean_token_accuracy": 0.7972854375839233, + "num_tokens": 13023766.0, + "step": 341 + }, + { + "epoch": 0.04350591527795446, + "ewc_loss": 0.002658673794940114, + "ewc_loss_diag": 1.8700957298278809e-06, + "ewc_loss_parallel": 7.894720511103515e-06, + "grad_norm": 4.78011417388916, + "learning_rate": 1.4455277660025434e-07, + "loss": 0.5798, + "mean_token_accuracy": 0.8220124244689941, + "num_tokens": 13056337.0, + "step": 342 + }, + { + "epoch": 0.04363312555654497, + "ewc_loss": 0.0026503014378249645, + "ewc_loss_diag": 1.8700957298278809e-06, + "ewc_loss_parallel": 7.810998795321211e-06, + "grad_norm": 3.766474723815918, + "learning_rate": 1.449766850360322e-07, + "loss": 0.6391, + "mean_token_accuracy": 0.8058393597602844, + "num_tokens": 13097345.0, + "step": 343 + }, + { + "epoch": 0.04376033583513548, + "ewc_loss": 0.0026484387926757336, + "ewc_loss_diag": 1.8924474716186523e-06, + "ewc_loss_parallel": 7.56349072617013e-06, + "grad_norm": 5.9264116287231445, + "learning_rate": 1.4540059347181008e-07, + "loss": 0.6236, + "mean_token_accuracy": 0.8090550899505615, + "num_tokens": 13127332.0, + "step": 344 + }, + { + "epoch": 0.043887546113725986, + "ewc_loss": 0.002665309701114893, + "ewc_loss_diag": 1.8998980522155762e-06, + "ewc_loss_parallel": 7.655904482817277e-06, + "grad_norm": 4.419288158416748, + "learning_rate": 1.4582450190758795e-07, + "loss": 0.6535, + "mean_token_accuracy": 0.7991259098052979, + "num_tokens": 13170760.0, + "step": 345 + }, + { + "epoch": 0.0440147563923165, + "ewc_loss": 0.002667666645720601, + "ewc_loss_diag": 1.9073486328125e-06, + "ewc_loss_parallel": 7.603180165460799e-06, + "grad_norm": 4.621676445007324, + "learning_rate": 1.4624841034336583e-07, + "loss": 0.6207, + "mean_token_accuracy": 0.8125730752944946, + "num_tokens": 13207061.0, + "step": 346 + }, + { + "epoch": 0.04414196667090701, + "ewc_loss": 0.002683336613699794, + "ewc_loss_diag": 1.9222497940063477e-06, + "ewc_loss_parallel": 7.607292445754865e-06, + "grad_norm": 4.094210147857666, + "learning_rate": 1.466723187791437e-07, + "loss": 0.5904, + "mean_token_accuracy": 0.8193410038948059, + "num_tokens": 13247903.0, + "step": 347 + }, + { + "epoch": 0.04426917694949752, + "ewc_loss": 0.002707886975258589, + "ewc_loss_diag": 1.952052116394043e-06, + "ewc_loss_parallel": 7.547619134129491e-06, + "grad_norm": 4.696077346801758, + "learning_rate": 1.4709622721492157e-07, + "loss": 0.6009, + "mean_token_accuracy": 0.8120642304420471, + "num_tokens": 13287193.0, + "step": 348 + }, + { + "epoch": 0.04439638722808803, + "ewc_loss": 0.0027284049428999424, + "ewc_loss_diag": 1.9669532775878906e-06, + "ewc_loss_parallel": 7.600212029501563e-06, + "grad_norm": 5.488556861877441, + "learning_rate": 1.4752013565069942e-07, + "loss": 0.605, + "mean_token_accuracy": 0.8150199055671692, + "num_tokens": 13323112.0, + "step": 349 + }, + { + "epoch": 0.04452359750667854, + "ewc_loss": 0.002743610180914402, + "ewc_loss_diag": 1.9669532775878906e-06, + "ewc_loss_parallel": 7.752262717986014e-06, + "grad_norm": 4.9037957191467285, + "learning_rate": 1.4794404408647732e-07, + "loss": 0.6005, + "mean_token_accuracy": 0.816804826259613, + "num_tokens": 13359653.0, + "step": 350 + }, + { + "epoch": 0.04465080778526905, + "ewc_loss": 0.002762171206995845, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.785286470607389e-06, + "grad_norm": 5.277317523956299, + "learning_rate": 1.4836795252225516e-07, + "loss": 0.5712, + "mean_token_accuracy": 0.8218432664871216, + "num_tokens": 13400571.0, + "step": 351 + }, + { + "epoch": 0.04477801806385956, + "ewc_loss": 0.0027661838103085756, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.825411557860207e-06, + "grad_norm": 4.88239860534668, + "learning_rate": 1.4879186095803306e-07, + "loss": 0.5487, + "mean_token_accuracy": 0.8270168900489807, + "num_tokens": 13437656.0, + "step": 352 + }, + { + "epoch": 0.04490522834245007, + "ewc_loss": 0.002760929986834526, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.772873686917592e-06, + "grad_norm": 4.171927452087402, + "learning_rate": 1.492157693938109e-07, + "loss": 0.6075, + "mean_token_accuracy": 0.8134242296218872, + "num_tokens": 13474688.0, + "step": 353 + }, + { + "epoch": 0.04503243862104058, + "ewc_loss": 0.0027446309104561806, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.6098826866655145e-06, + "grad_norm": 3.8604652881622314, + "learning_rate": 1.496396778295888e-07, + "loss": 0.5458, + "mean_token_accuracy": 0.8298903703689575, + "num_tokens": 13518256.0, + "step": 354 + }, + { + "epoch": 0.04515964889963109, + "ewc_loss": 0.002732273191213608, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.486306913051521e-06, + "grad_norm": 4.253170967102051, + "learning_rate": 1.5006358626536665e-07, + "loss": 0.6322, + "mean_token_accuracy": 0.8085780739784241, + "num_tokens": 13554315.0, + "step": 355 + }, + { + "epoch": 0.0452868591782216, + "ewc_loss": 0.002735667861998081, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.520254257542547e-06, + "grad_norm": 4.314901828765869, + "learning_rate": 1.5048749470114455e-07, + "loss": 0.6531, + "mean_token_accuracy": 0.8009228110313416, + "num_tokens": 13594388.0, + "step": 356 + }, + { + "epoch": 0.04541406945681211, + "ewc_loss": 0.002742811106145382, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.591685061925091e-06, + "grad_norm": 3.9285130500793457, + "learning_rate": 1.509114031369224e-07, + "loss": 0.5793, + "mean_token_accuracy": 0.8188096284866333, + "num_tokens": 13633704.0, + "step": 357 + }, + { + "epoch": 0.04554127973540262, + "ewc_loss": 0.002741673495620489, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.580308192700613e-06, + "grad_norm": 4.215423583984375, + "learning_rate": 1.513353115727003e-07, + "loss": 0.6285, + "mean_token_accuracy": 0.8037310242652893, + "num_tokens": 13671183.0, + "step": 358 + }, + { + "epoch": 0.04566849001399313, + "ewc_loss": 0.002745826030150056, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.621835266036214e-06, + "grad_norm": 3.9509549140930176, + "learning_rate": 1.5175922000847814e-07, + "loss": 0.584, + "mean_token_accuracy": 0.8209350109100342, + "num_tokens": 13709640.0, + "step": 359 + }, + { + "epoch": 0.04579570029258364, + "ewc_loss": 0.0027470530476421118, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.63410389481578e-06, + "grad_norm": 4.195169448852539, + "learning_rate": 1.5218312844425604e-07, + "loss": 0.6073, + "mean_token_accuracy": 0.8066051006317139, + "num_tokens": 13751114.0, + "step": 360 + }, + { + "epoch": 0.045922910571174154, + "ewc_loss": 0.0027524628676474094, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.688204277656041e-06, + "grad_norm": 3.9546525478363037, + "learning_rate": 1.526070368800339e-07, + "loss": 0.5368, + "mean_token_accuracy": 0.8289923667907715, + "num_tokens": 13786448.0, + "step": 361 + }, + { + "epoch": 0.04605012084976466, + "ewc_loss": 0.0027529150247573853, + "ewc_loss_diag": 1.9818544387817383e-06, + "ewc_loss_parallel": 7.692725375818554e-06, + "grad_norm": 3.9449775218963623, + "learning_rate": 1.530309453158118e-07, + "loss": 0.5603, + "mean_token_accuracy": 0.8306472897529602, + "num_tokens": 13827663.0, + "step": 362 + }, + { + "epoch": 0.04617733112835517, + "ewc_loss": 0.0027694287709891796, + "ewc_loss_diag": 1.996755599975586e-06, + "ewc_loss_parallel": 7.705274583713617e-06, + "grad_norm": 4.485123157501221, + "learning_rate": 1.5345485375158963e-07, + "loss": 0.6397, + "mean_token_accuracy": 0.8047031164169312, + "num_tokens": 13866078.0, + "step": 363 + }, + { + "epoch": 0.04630454140694568, + "ewc_loss": 0.002780705224722624, + "ewc_loss_diag": 1.996755599975586e-06, + "ewc_loss_parallel": 7.818037374818232e-06, + "grad_norm": 4.427474021911621, + "learning_rate": 1.5387876218736753e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8376719355583191, + "num_tokens": 13909767.0, + "step": 364 + }, + { + "epoch": 0.04643175168553619, + "ewc_loss": 0.0027862985152751207, + "ewc_loss_diag": 1.996755599975586e-06, + "ewc_loss_parallel": 7.873972208471969e-06, + "grad_norm": 4.728917121887207, + "learning_rate": 1.5430267062314538e-07, + "loss": 0.6249, + "mean_token_accuracy": 0.810672402381897, + "num_tokens": 13948937.0, + "step": 365 + }, + { + "epoch": 0.0465589619641267, + "ewc_loss": 0.0028242473490536213, + "ewc_loss_diag": 2.0265579223632812e-06, + "ewc_loss_parallel": 7.948285201564431e-06, + "grad_norm": 5.699088096618652, + "learning_rate": 1.5472657905892328e-07, + "loss": 0.636, + "mean_token_accuracy": 0.8038994073867798, + "num_tokens": 13984039.0, + "step": 366 + }, + { + "epoch": 0.04668617224271721, + "ewc_loss": 0.0028582983650267124, + "ewc_loss_diag": 2.041459083557129e-06, + "ewc_loss_parallel": 8.136204996844754e-06, + "grad_norm": 5.103545665740967, + "learning_rate": 1.5515048749470113e-07, + "loss": 0.6182, + "mean_token_accuracy": 0.8085204362869263, + "num_tokens": 14018162.0, + "step": 367 + }, + { + "epoch": 0.04681338252130772, + "ewc_loss": 0.0028607051353901625, + "ewc_loss_diag": 2.041459083557129e-06, + "ewc_loss_parallel": 8.160273864632472e-06, + "grad_norm": 6.120087623596191, + "learning_rate": 1.55574395930479e-07, + "loss": 0.6182, + "mean_token_accuracy": 0.8097715973854065, + "num_tokens": 14056493.0, + "step": 368 + }, + { + "epoch": 0.046940592799898234, + "ewc_loss": 0.00285622151568532, + "ewc_loss_diag": 2.0265579223632812e-06, + "ewc_loss_parallel": 8.268025339930318e-06, + "grad_norm": 6.59307861328125, + "learning_rate": 1.5599830436625687e-07, + "loss": 0.6112, + "mean_token_accuracy": 0.8129112124443054, + "num_tokens": 14097530.0, + "step": 369 + }, + { + "epoch": 0.04706780307848874, + "ewc_loss": 0.002884144429117441, + "ewc_loss_diag": 2.041459083557129e-06, + "ewc_loss_parallel": 8.394666110689286e-06, + "grad_norm": 4.197859764099121, + "learning_rate": 1.5642221280203474e-07, + "loss": 0.6293, + "mean_token_accuracy": 0.8097529411315918, + "num_tokens": 14136240.0, + "step": 370 + }, + { + "epoch": 0.04719501335707925, + "ewc_loss": 0.00285475910641253, + "ewc_loss_diag": 2.041459083557129e-06, + "ewc_loss_parallel": 8.100813829514664e-06, + "grad_norm": 6.009146213531494, + "learning_rate": 1.5684612123781262e-07, + "loss": 0.5413, + "mean_token_accuracy": 0.8314064741134644, + "num_tokens": 14171010.0, + "step": 371 + }, + { + "epoch": 0.04732222363566976, + "ewc_loss": 0.0028718505054712296, + "ewc_loss_diag": 2.0563602447509766e-06, + "ewc_loss_parallel": 8.119140147755388e-06, + "grad_norm": 5.354580879211426, + "learning_rate": 1.572700296735905e-07, + "loss": 0.6125, + "mean_token_accuracy": 0.81785649061203, + "num_tokens": 14206985.0, + "step": 372 + }, + { + "epoch": 0.04744943391426027, + "ewc_loss": 0.002852344885468483, + "ewc_loss_diag": 2.041459083557129e-06, + "ewc_loss_parallel": 8.076671292656101e-06, + "grad_norm": 5.61984395980835, + "learning_rate": 1.576939381093684e-07, + "loss": 0.6106, + "mean_token_accuracy": 0.8092395067214966, + "num_tokens": 14236728.0, + "step": 373 + }, + { + "epoch": 0.047576644192850785, + "ewc_loss": 0.002853229409083724, + "ewc_loss_diag": 2.041459083557129e-06, + "ewc_loss_parallel": 8.085516128630843e-06, + "grad_norm": 3.4871699810028076, + "learning_rate": 1.5811784654514623e-07, + "loss": 0.5371, + "mean_token_accuracy": 0.8343605995178223, + "num_tokens": 14270758.0, + "step": 374 + }, + { + "epoch": 0.04770385447144129, + "ewc_loss": 0.002836304483935237, + "ewc_loss_diag": 2.0563602447509766e-06, + "ewc_loss_parallel": 7.76367960497737e-06, + "grad_norm": 3.9906768798828125, + "learning_rate": 1.5854175498092413e-07, + "loss": 0.6546, + "mean_token_accuracy": 0.7965760231018066, + "num_tokens": 14315002.0, + "step": 375 + }, + { + "epoch": 0.0478310647500318, + "ewc_loss": 0.0028448840603232384, + "ewc_loss_diag": 2.0712614059448242e-06, + "ewc_loss_parallel": 7.696887223573867e-06, + "grad_norm": 5.1191606521606445, + "learning_rate": 1.5896566341670198e-07, + "loss": 0.625, + "mean_token_accuracy": 0.8047226667404175, + "num_tokens": 14353385.0, + "step": 376 + }, + { + "epoch": 0.047958275028622314, + "ewc_loss": 0.0028792687226086855, + "ewc_loss_diag": 2.086162567138672e-06, + "ewc_loss_parallel": 7.8881466833991e-06, + "grad_norm": 4.123859405517578, + "learning_rate": 1.5938957185247988e-07, + "loss": 0.6576, + "mean_token_accuracy": 0.7937231063842773, + "num_tokens": 14391702.0, + "step": 377 + }, + { + "epoch": 0.04808548530721282, + "ewc_loss": 0.0028926334343850613, + "ewc_loss_diag": 2.1010637283325195e-06, + "ewc_loss_parallel": 7.869204637245275e-06, + "grad_norm": 4.624702453613281, + "learning_rate": 1.5981348028825772e-07, + "loss": 0.625, + "mean_token_accuracy": 0.8070091009140015, + "num_tokens": 14427876.0, + "step": 378 + }, + { + "epoch": 0.048212695585803336, + "ewc_loss": 0.002900798339396715, + "ewc_loss_diag": 2.1010637283325195e-06, + "ewc_loss_parallel": 7.95085452409694e-06, + "grad_norm": 5.2039361000061035, + "learning_rate": 1.6023738872403562e-07, + "loss": 0.7112, + "mean_token_accuracy": 0.7808626294136047, + "num_tokens": 14460349.0, + "step": 379 + }, + { + "epoch": 0.048339905864393844, + "ewc_loss": 0.002944548614323139, + "ewc_loss_diag": 2.130866050720215e-06, + "ewc_loss_parallel": 8.083182365226094e-06, + "grad_norm": 4.307863235473633, + "learning_rate": 1.6066129715981347e-07, + "loss": 0.5617, + "mean_token_accuracy": 0.8273558616638184, + "num_tokens": 14497763.0, + "step": 380 + }, + { + "epoch": 0.04846711614298435, + "ewc_loss": 0.0029212860390543938, + "ewc_loss_diag": 2.115964889526367e-06, + "ewc_loss_parallel": 8.00314410298597e-06, + "grad_norm": 3.815162420272827, + "learning_rate": 1.6108520559559137e-07, + "loss": 0.5558, + "mean_token_accuracy": 0.8249905109405518, + "num_tokens": 14537172.0, + "step": 381 + }, + { + "epoch": 0.048594326421574865, + "ewc_loss": 0.002918520476669073, + "ewc_loss_diag": 2.130866050720215e-06, + "ewc_loss_parallel": 7.822898623999208e-06, + "grad_norm": 4.078006267547607, + "learning_rate": 1.6150911403136921e-07, + "loss": 0.6418, + "mean_token_accuracy": 0.8078720569610596, + "num_tokens": 14565515.0, + "step": 382 + }, + { + "epoch": 0.04872153670016537, + "ewc_loss": 0.0029221922159194946, + "ewc_loss_diag": 2.130866050720215e-06, + "ewc_loss_parallel": 7.859617653593887e-06, + "grad_norm": 3.359126567840576, + "learning_rate": 1.619330224671471e-07, + "loss": 0.6333, + "mean_token_accuracy": 0.8068957328796387, + "num_tokens": 14608104.0, + "step": 383 + }, + { + "epoch": 0.04884874697875588, + "ewc_loss": 0.0029427087865769863, + "ewc_loss_diag": 2.16066837310791e-06, + "ewc_loss_parallel": 7.759608706692234e-06, + "grad_norm": 4.751634120941162, + "learning_rate": 1.6235693090292496e-07, + "loss": 0.6419, + "mean_token_accuracy": 0.8066756725311279, + "num_tokens": 14645328.0, + "step": 384 + }, + { + "epoch": 0.048975957257346395, + "ewc_loss": 0.0029688768554478884, + "ewc_loss_diag": 2.16066837310791e-06, + "ewc_loss_parallel": 8.02128852228634e-06, + "grad_norm": 4.446407794952393, + "learning_rate": 1.6278083933870286e-07, + "loss": 0.674, + "mean_token_accuracy": 0.7901840806007385, + "num_tokens": 14678792.0, + "step": 385 + }, + { + "epoch": 0.0491031675359369, + "ewc_loss": 0.002981564961373806, + "ewc_loss_diag": 2.16066837310791e-06, + "ewc_loss_parallel": 8.148170309141278e-06, + "grad_norm": 4.027183532714844, + "learning_rate": 1.632047477744807e-07, + "loss": 0.5979, + "mean_token_accuracy": 0.8167493343353271, + "num_tokens": 14715095.0, + "step": 386 + }, + { + "epoch": 0.049230377814527417, + "ewc_loss": 0.0029786499217152596, + "ewc_loss_diag": 2.16066837310791e-06, + "ewc_loss_parallel": 8.11901827546535e-06, + "grad_norm": 3.813391923904419, + "learning_rate": 1.6362865621025858e-07, + "loss": 0.6173, + "mean_token_accuracy": 0.8103901147842407, + "num_tokens": 14753641.0, + "step": 387 + }, + { + "epoch": 0.049357588093117924, + "ewc_loss": 0.0029899058863520622, + "ewc_loss_diag": 2.175569534301758e-06, + "ewc_loss_parallel": 8.078991413640324e-06, + "grad_norm": 4.366762161254883, + "learning_rate": 1.6405256464603645e-07, + "loss": 0.5887, + "mean_token_accuracy": 0.8174576759338379, + "num_tokens": 14788816.0, + "step": 388 + }, + { + "epoch": 0.04948479837170843, + "ewc_loss": 0.0030163731426000595, + "ewc_loss_diag": 2.1904706954956055e-06, + "ewc_loss_parallel": 8.191076631192118e-06, + "grad_norm": 4.785707473754883, + "learning_rate": 1.6447647308181432e-07, + "loss": 0.6323, + "mean_token_accuracy": 0.8043857216835022, + "num_tokens": 14821607.0, + "step": 389 + }, + { + "epoch": 0.049612008650298946, + "ewc_loss": 0.0030325932893902063, + "ewc_loss_diag": 2.1904706954956055e-06, + "ewc_loss_parallel": 8.353276825801004e-06, + "grad_norm": 3.638540744781494, + "learning_rate": 1.649003815175922e-07, + "loss": 0.6504, + "mean_token_accuracy": 0.8022476434707642, + "num_tokens": 14861298.0, + "step": 390 + }, + { + "epoch": 0.04973921892888945, + "ewc_loss": 0.003015553578734398, + "ewc_loss_diag": 2.1904706954956055e-06, + "ewc_loss_parallel": 8.18288026493974e-06, + "grad_norm": 3.533489465713501, + "learning_rate": 1.6532428995337007e-07, + "loss": 0.567, + "mean_token_accuracy": 0.8236750960350037, + "num_tokens": 14899951.0, + "step": 391 + }, + { + "epoch": 0.04986642920747997, + "ewc_loss": 0.003019745461642742, + "ewc_loss_diag": 2.205371856689453e-06, + "ewc_loss_parallel": 8.072209311649203e-06, + "grad_norm": 3.3951733112335205, + "learning_rate": 1.6574819838914794e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.8305099010467529, + "num_tokens": 14937169.0, + "step": 392 + }, + { + "epoch": 0.049993639486070475, + "ewc_loss": 0.0030164397321641445, + "ewc_loss_diag": 2.205371856689453e-06, + "ewc_loss_parallel": 8.039151907723863e-06, + "grad_norm": 3.9059841632843018, + "learning_rate": 1.661721068249258e-07, + "loss": 0.6003, + "mean_token_accuracy": 0.8144786953926086, + "num_tokens": 14975549.0, + "step": 393 + }, + { + "epoch": 0.05012084976466098, + "ewc_loss": 0.0030348780564963818, + "ewc_loss_diag": 2.205371856689453e-06, + "ewc_loss_parallel": 8.223535587603692e-06, + "grad_norm": 4.0061421394348145, + "learning_rate": 1.6659601526070368e-07, + "loss": 0.6276, + "mean_token_accuracy": 0.8076900243759155, + "num_tokens": 15016630.0, + "step": 394 + }, + { + "epoch": 0.0502480600432515, + "ewc_loss": 0.0030518565326929092, + "ewc_loss_diag": 2.205371856689453e-06, + "ewc_loss_parallel": 8.393320058530662e-06, + "grad_norm": 3.647592306137085, + "learning_rate": 1.6701992369648156e-07, + "loss": 0.6094, + "mean_token_accuracy": 0.810671329498291, + "num_tokens": 15057822.0, + "step": 395 + }, + { + "epoch": 0.050375270321842004, + "ewc_loss": 0.0030620479956269264, + "ewc_loss_diag": 2.2202730178833008e-06, + "ewc_loss_parallel": 8.342646651726682e-06, + "grad_norm": 5.608916759490967, + "learning_rate": 1.6744383213225943e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.8157120943069458, + "num_tokens": 15094385.0, + "step": 396 + }, + { + "epoch": 0.05050248060043251, + "ewc_loss": 0.003102485788986087, + "ewc_loss_diag": 2.2351741790771484e-06, + "ewc_loss_parallel": 8.747026186028961e-06, + "grad_norm": 4.343994140625, + "learning_rate": 1.678677405680373e-07, + "loss": 0.5421, + "mean_token_accuracy": 0.8277063369750977, + "num_tokens": 15130969.0, + "step": 397 + }, + { + "epoch": 0.050629690879023026, + "ewc_loss": 0.003099411725997925, + "ewc_loss_diag": 2.2351741790771484e-06, + "ewc_loss_parallel": 8.716286174603738e-06, + "grad_norm": 4.524201393127441, + "learning_rate": 1.6829164900381518e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8205248117446899, + "num_tokens": 15166586.0, + "step": 398 + }, + { + "epoch": 0.05075690115761353, + "ewc_loss": 0.0031102702487260103, + "ewc_loss_diag": 2.250075340270996e-06, + "ewc_loss_parallel": 8.67228300194256e-06, + "grad_norm": 3.7512993812561035, + "learning_rate": 1.6871555743959305e-07, + "loss": 0.5534, + "mean_token_accuracy": 0.8246380090713501, + "num_tokens": 15209603.0, + "step": 399 + }, + { + "epoch": 0.05088411143620405, + "ewc_loss": 0.003085196716710925, + "ewc_loss_diag": 2.250075340270996e-06, + "ewc_loss_parallel": 8.421548045589589e-06, + "grad_norm": 3.863922357559204, + "learning_rate": 1.6913946587537092e-07, + "loss": 0.5708, + "mean_token_accuracy": 0.8202356696128845, + "num_tokens": 15251542.0, + "step": 400 + }, + { + "epoch": 0.051011321714794555, + "ewc_loss": 0.0030946428887546062, + "ewc_loss_diag": 2.2649765014648438e-06, + "ewc_loss_parallel": 8.363421329704579e-06, + "grad_norm": 5.519947528839111, + "learning_rate": 1.695633743111488e-07, + "loss": 0.5855, + "mean_token_accuracy": 0.8201785087585449, + "num_tokens": 15288940.0, + "step": 401 + }, + { + "epoch": 0.05113853199338506, + "ewc_loss": 0.003129372140392661, + "ewc_loss_diag": 2.2649765014648438e-06, + "ewc_loss_parallel": 8.710712791071273e-06, + "grad_norm": 3.866870880126953, + "learning_rate": 1.6998728274692667e-07, + "loss": 0.5523, + "mean_token_accuracy": 0.8308719396591187, + "num_tokens": 15321965.0, + "step": 402 + }, + { + "epoch": 0.05126574227197558, + "ewc_loss": 0.003127338830381632, + "ewc_loss_diag": 2.2798776626586914e-06, + "ewc_loss_parallel": 8.53779238241259e-06, + "grad_norm": 3.351877450942993, + "learning_rate": 1.7041119118270454e-07, + "loss": 0.5608, + "mean_token_accuracy": 0.8270860910415649, + "num_tokens": 15362895.0, + "step": 403 + }, + { + "epoch": 0.051392952550566084, + "ewc_loss": 0.003101700684055686, + "ewc_loss_diag": 2.2798776626586914e-06, + "ewc_loss_parallel": 8.281411282951012e-06, + "grad_norm": 5.377229690551758, + "learning_rate": 1.7083509961848238e-07, + "loss": 0.6184, + "mean_token_accuracy": 0.807658314704895, + "num_tokens": 15396675.0, + "step": 404 + }, + { + "epoch": 0.0515201628291566, + "ewc_loss": 0.0031995598692446947, + "ewc_loss_diag": 2.339482307434082e-06, + "ewc_loss_parallel": 8.649651135783643e-06, + "grad_norm": 5.860054016113281, + "learning_rate": 1.7125900805426028e-07, + "loss": 0.6398, + "mean_token_accuracy": 0.8032254576683044, + "num_tokens": 15443070.0, + "step": 405 + }, + { + "epoch": 0.051647373107747106, + "ewc_loss": 0.0032045599073171616, + "ewc_loss_diag": 2.3096799850463867e-06, + "ewc_loss_parallel": 9.00482882570941e-06, + "grad_norm": 3.6961936950683594, + "learning_rate": 1.7168291649003813e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.8228973150253296, + "num_tokens": 15476747.0, + "step": 406 + }, + { + "epoch": 0.051774583386337614, + "ewc_loss": 0.0031637796200811863, + "ewc_loss_diag": 2.3096799850463867e-06, + "ewc_loss_parallel": 8.597025043854956e-06, + "grad_norm": 3.431830644607544, + "learning_rate": 1.7210682492581603e-07, + "loss": 0.5926, + "mean_token_accuracy": 0.8177402019500732, + "num_tokens": 15514866.0, + "step": 407 + }, + { + "epoch": 0.05190179366492813, + "ewc_loss": 0.0031494556460529566, + "ewc_loss_diag": 2.3245811462402344e-06, + "ewc_loss_parallel": 8.301197340188082e-06, + "grad_norm": 3.755432367324829, + "learning_rate": 1.7253073336159387e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.8207617998123169, + "num_tokens": 15554245.0, + "step": 408 + }, + { + "epoch": 0.052029003943518635, + "ewc_loss": 0.0031387319322675467, + "ewc_loss_diag": 2.3096799850463867e-06, + "ewc_loss_parallel": 8.346547474502586e-06, + "grad_norm": 3.525961399078369, + "learning_rate": 1.7295464179737177e-07, + "loss": 0.5987, + "mean_token_accuracy": 0.8166260719299316, + "num_tokens": 15593958.0, + "step": 409 + }, + { + "epoch": 0.05215621422210915, + "ewc_loss": 0.0031433545518666506, + "ewc_loss_diag": 2.3096799850463867e-06, + "ewc_loss_parallel": 8.392774361709598e-06, + "grad_norm": 4.143446445465088, + "learning_rate": 1.7337855023314962e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.829754114151001, + "num_tokens": 15632567.0, + "step": 410 + }, + { + "epoch": 0.05228342450069966, + "ewc_loss": 0.0031900410540401936, + "ewc_loss_diag": 2.339482307434082e-06, + "ewc_loss_parallel": 8.554462510801386e-06, + "grad_norm": 3.821388006210327, + "learning_rate": 1.7380245866892752e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8263077735900879, + "num_tokens": 15668255.0, + "step": 411 + }, + { + "epoch": 0.052410634779290165, + "ewc_loss": 0.003205662127584219, + "ewc_loss_diag": 2.3543834686279297e-06, + "ewc_loss_parallel": 8.558085028198548e-06, + "grad_norm": 3.9122207164764404, + "learning_rate": 1.7422636710470536e-07, + "loss": 0.6052, + "mean_token_accuracy": 0.8116806745529175, + "num_tokens": 15705435.0, + "step": 412 + }, + { + "epoch": 0.05253784505788068, + "ewc_loss": 0.00322313467040658, + "ewc_loss_diag": 2.3692846298217773e-06, + "ewc_loss_parallel": 8.580223038734403e-06, + "grad_norm": 3.926379919052124, + "learning_rate": 1.7465027554048326e-07, + "loss": 0.6256, + "mean_token_accuracy": 0.8055883646011353, + "num_tokens": 15744657.0, + "step": 413 + }, + { + "epoch": 0.05266505533647119, + "ewc_loss": 0.00322186597622931, + "ewc_loss_diag": 2.3692846298217773e-06, + "ewc_loss_parallel": 8.567536497139372e-06, + "grad_norm": 4.279619216918945, + "learning_rate": 1.750741839762611e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.8248063325881958, + "num_tokens": 15777374.0, + "step": 414 + }, + { + "epoch": 0.052792265615061694, + "ewc_loss": 0.0032330648973584175, + "ewc_loss_diag": 2.3692846298217773e-06, + "ewc_loss_parallel": 8.679527127242181e-06, + "grad_norm": 3.522690534591675, + "learning_rate": 1.75498092412039e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8317276835441589, + "num_tokens": 15814047.0, + "step": 415 + }, + { + "epoch": 0.05291947589365221, + "ewc_loss": 0.0032169052865356207, + "ewc_loss_diag": 2.3692846298217773e-06, + "ewc_loss_parallel": 8.51792901812587e-06, + "grad_norm": 3.415806770324707, + "learning_rate": 1.7592200084781686e-07, + "loss": 0.6225, + "mean_token_accuracy": 0.8063265085220337, + "num_tokens": 15849110.0, + "step": 416 + }, + { + "epoch": 0.053046686172242716, + "ewc_loss": 0.0032263321336358786, + "ewc_loss_diag": 2.384185791015625e-06, + "ewc_loss_parallel": 8.459610398858786e-06, + "grad_norm": 3.896848440170288, + "learning_rate": 1.7634590928359475e-07, + "loss": 0.5928, + "mean_token_accuracy": 0.8187577724456787, + "num_tokens": 15888910.0, + "step": 417 + }, + { + "epoch": 0.05317389645083323, + "ewc_loss": 0.0032718866132199764, + "ewc_loss_diag": 2.4139881134033203e-06, + "ewc_loss_parallel": 8.609978976892307e-06, + "grad_norm": 3.729001522064209, + "learning_rate": 1.767698177193726e-07, + "loss": 0.579, + "mean_token_accuracy": 0.8197044730186462, + "num_tokens": 15925604.0, + "step": 418 + }, + { + "epoch": 0.05330110672942374, + "ewc_loss": 0.003308933926746249, + "ewc_loss_diag": 2.4437904357910156e-06, + "ewc_loss_parallel": 8.675277058500797e-06, + "grad_norm": 3.514507532119751, + "learning_rate": 1.771937261551505e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8335329294204712, + "num_tokens": 15961401.0, + "step": 419 + }, + { + "epoch": 0.053428317008014245, + "ewc_loss": 0.0033203347120434046, + "ewc_loss_diag": 2.4586915969848633e-06, + "ewc_loss_parallel": 8.636696293251589e-06, + "grad_norm": 3.685572624206543, + "learning_rate": 1.7761763459092835e-07, + "loss": 0.5841, + "mean_token_accuracy": 0.8192050457000732, + "num_tokens": 16001029.0, + "step": 420 + }, + { + "epoch": 0.05355552728660476, + "ewc_loss": 0.0033245026133954525, + "ewc_loss_diag": 2.4586915969848633e-06, + "ewc_loss_parallel": 8.678375706949737e-06, + "grad_norm": 4.2381415367126465, + "learning_rate": 1.7804154302670624e-07, + "loss": 0.5653, + "mean_token_accuracy": 0.8249595165252686, + "num_tokens": 16036014.0, + "step": 421 + }, + { + "epoch": 0.05368273756519527, + "ewc_loss": 0.0033766604028642178, + "ewc_loss_diag": 2.4884939193725586e-06, + "ewc_loss_parallel": 8.89477723831078e-06, + "grad_norm": 3.497786521911621, + "learning_rate": 1.784654514624841e-07, + "loss": 0.5472, + "mean_token_accuracy": 0.8264187574386597, + "num_tokens": 16075373.0, + "step": 422 + }, + { + "epoch": 0.05380994784378578, + "ewc_loss": 0.003360538277775049, + "ewc_loss_diag": 2.4884939193725586e-06, + "ewc_loss_parallel": 8.733556569495704e-06, + "grad_norm": 3.314987897872925, + "learning_rate": 1.7888935989826196e-07, + "loss": 0.5891, + "mean_token_accuracy": 0.8139840364456177, + "num_tokens": 16111352.0, + "step": 423 + }, + { + "epoch": 0.05393715812237629, + "ewc_loss": 0.0033664072398096323, + "ewc_loss_diag": 2.5033950805664062e-06, + "ewc_loss_parallel": 8.639657608000562e-06, + "grad_norm": 3.555546283721924, + "learning_rate": 1.7931326833403984e-07, + "loss": 0.605, + "mean_token_accuracy": 0.8101905584335327, + "num_tokens": 16148826.0, + "step": 424 + }, + { + "epoch": 0.054064368400966796, + "ewc_loss": 0.0033755959011614323, + "ewc_loss_diag": 2.5033950805664062e-06, + "ewc_loss_parallel": 8.731544767215382e-06, + "grad_norm": 3.502319097518921, + "learning_rate": 1.797371767698177e-07, + "loss": 0.5141, + "mean_token_accuracy": 0.8401153087615967, + "num_tokens": 16193491.0, + "step": 425 + }, + { + "epoch": 0.05419157867955731, + "ewc_loss": 0.0033968668431043625, + "ewc_loss_diag": 2.518296241760254e-06, + "ewc_loss_parallel": 8.791666914476082e-06, + "grad_norm": 3.7356245517730713, + "learning_rate": 1.8016108520559558e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8255051970481873, + "num_tokens": 16235595.0, + "step": 426 + }, + { + "epoch": 0.05431878895814782, + "ewc_loss": 0.00344217661768198, + "ewc_loss_diag": 2.5480985641479492e-06, + "ewc_loss_parallel": 8.93958713277243e-06, + "grad_norm": 4.400539875030518, + "learning_rate": 1.8058499364137345e-07, + "loss": 0.6159, + "mean_token_accuracy": 0.8137260675430298, + "num_tokens": 16274105.0, + "step": 427 + }, + { + "epoch": 0.054445999236738325, + "ewc_loss": 0.003469133283942938, + "ewc_loss_diag": 2.5480985641479492e-06, + "ewc_loss_parallel": 9.209154995915014e-06, + "grad_norm": 3.5213499069213867, + "learning_rate": 1.8100890207715133e-07, + "loss": 0.6592, + "mean_token_accuracy": 0.7930068969726562, + "num_tokens": 16315886.0, + "step": 428 + }, + { + "epoch": 0.05457320951532884, + "ewc_loss": 0.0034652308095246553, + "ewc_loss_diag": 2.562999725341797e-06, + "ewc_loss_parallel": 9.017542652145494e-06, + "grad_norm": 3.605443000793457, + "learning_rate": 1.814328105129292e-07, + "loss": 0.6005, + "mean_token_accuracy": 0.8127360343933105, + "num_tokens": 16353021.0, + "step": 429 + }, + { + "epoch": 0.05470041979391935, + "ewc_loss": 0.003461036831140518, + "ewc_loss_diag": 2.562999725341797e-06, + "ewc_loss_parallel": 8.975602213467937e-06, + "grad_norm": 4.27105188369751, + "learning_rate": 1.8185671894870707e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8175243735313416, + "num_tokens": 16380887.0, + "step": 430 + }, + { + "epoch": 0.05482763007250986, + "ewc_loss": 0.003514662617817521, + "ewc_loss_diag": 2.592802047729492e-06, + "ewc_loss_parallel": 9.206684808304999e-06, + "grad_norm": 3.4714205265045166, + "learning_rate": 1.8228062738448494e-07, + "loss": 0.6314, + "mean_token_accuracy": 0.8059914708137512, + "num_tokens": 16421066.0, + "step": 431 + }, + { + "epoch": 0.05495484035110037, + "ewc_loss": 0.003500138409435749, + "ewc_loss_diag": 2.592802047729492e-06, + "ewc_loss_parallel": 9.061443961400073e-06, + "grad_norm": 3.8940579891204834, + "learning_rate": 1.8270453582026282e-07, + "loss": 0.6001, + "mean_token_accuracy": 0.8131669759750366, + "num_tokens": 16457197.0, + "step": 432 + }, + { + "epoch": 0.055082050629690876, + "ewc_loss": 0.003541070967912674, + "ewc_loss_diag": 2.6226043701171875e-06, + "ewc_loss_parallel": 9.165591109194793e-06, + "grad_norm": 3.2603490352630615, + "learning_rate": 1.831284442560407e-07, + "loss": 0.5555, + "mean_token_accuracy": 0.826229453086853, + "num_tokens": 16497385.0, + "step": 433 + }, + { + "epoch": 0.05520926090828139, + "ewc_loss": 0.0035232966765761375, + "ewc_loss_diag": 2.6226043701171875e-06, + "ewc_loss_parallel": 8.98784855962731e-06, + "grad_norm": 3.4733877182006836, + "learning_rate": 1.8355235269181856e-07, + "loss": 0.5704, + "mean_token_accuracy": 0.8230955600738525, + "num_tokens": 16540099.0, + "step": 434 + }, + { + "epoch": 0.0553364711868719, + "ewc_loss": 0.0035322681069374084, + "ewc_loss_diag": 2.6226043701171875e-06, + "ewc_loss_parallel": 9.077563845494296e-06, + "grad_norm": 3.837073802947998, + "learning_rate": 1.8397626112759643e-07, + "loss": 0.5669, + "mean_token_accuracy": 0.8236547708511353, + "num_tokens": 16572760.0, + "step": 435 + }, + { + "epoch": 0.05546368146546241, + "ewc_loss": 0.0035504898987710476, + "ewc_loss_diag": 2.6226043701171875e-06, + "ewc_loss_parallel": 9.259782018489204e-06, + "grad_norm": 3.529405117034912, + "learning_rate": 1.844001695633743e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.8338930606842041, + "num_tokens": 16608549.0, + "step": 436 + }, + { + "epoch": 0.05559089174405292, + "ewc_loss": 0.003578366246074438, + "ewc_loss_diag": 2.652406692504883e-06, + "ewc_loss_parallel": 9.233368473360315e-06, + "grad_norm": 3.856733560562134, + "learning_rate": 1.8482407799915218e-07, + "loss": 0.5642, + "mean_token_accuracy": 0.8246386051177979, + "num_tokens": 16643800.0, + "step": 437 + }, + { + "epoch": 0.05571810202264343, + "ewc_loss": 0.0035897830966860056, + "ewc_loss_diag": 2.652406692504883e-06, + "ewc_loss_parallel": 9.34753734327387e-06, + "grad_norm": 4.29860782623291, + "learning_rate": 1.8524798643493005e-07, + "loss": 0.6369, + "mean_token_accuracy": 0.8004405498504639, + "num_tokens": 16677948.0, + "step": 438 + }, + { + "epoch": 0.05584531230123394, + "ewc_loss": 0.003624517237767577, + "ewc_loss_diag": 2.6673078536987305e-06, + "ewc_loss_parallel": 9.542291081743315e-06, + "grad_norm": 3.5878684520721436, + "learning_rate": 1.8567189487070792e-07, + "loss": 0.6105, + "mean_token_accuracy": 0.811646044254303, + "num_tokens": 16716905.0, + "step": 439 + }, + { + "epoch": 0.05597252257982445, + "ewc_loss": 0.0036240764893591404, + "ewc_loss_diag": 2.682209014892578e-06, + "ewc_loss_parallel": 9.385296834807377e-06, + "grad_norm": 3.578251838684082, + "learning_rate": 1.8609580330648577e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8353571891784668, + "num_tokens": 16756158.0, + "step": 440 + }, + { + "epoch": 0.05609973285841496, + "ewc_loss": 0.003614986315369606, + "ewc_loss_diag": 2.682209014892578e-06, + "ewc_loss_parallel": 9.294394658354577e-06, + "grad_norm": 3.992626428604126, + "learning_rate": 1.8651971174226367e-07, + "loss": 0.6133, + "mean_token_accuracy": 0.8131600022315979, + "num_tokens": 16794261.0, + "step": 441 + }, + { + "epoch": 0.05622694313700547, + "ewc_loss": 0.003642804455012083, + "ewc_loss_diag": 2.6971101760864258e-06, + "ewc_loss_parallel": 9.4199867817224e-06, + "grad_norm": 3.0814929008483887, + "learning_rate": 1.8694362017804152e-07, + "loss": 0.5699, + "mean_token_accuracy": 0.8210786581039429, + "num_tokens": 16834953.0, + "step": 442 + }, + { + "epoch": 0.05635415341559598, + "ewc_loss": 0.003615068271756172, + "ewc_loss_diag": 2.6971101760864258e-06, + "ewc_loss_parallel": 9.142625458480325e-06, + "grad_norm": 3.450849771499634, + "learning_rate": 1.8736752861381941e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.8355884552001953, + "num_tokens": 16873954.0, + "step": 443 + }, + { + "epoch": 0.05648136369418649, + "ewc_loss": 0.0036693979054689407, + "ewc_loss_diag": 2.7418136596679688e-06, + "ewc_loss_parallel": 9.228157978213858e-06, + "grad_norm": 3.679018259048462, + "learning_rate": 1.8779143704959726e-07, + "loss": 0.551, + "mean_token_accuracy": 0.8247748017311096, + "num_tokens": 16906018.0, + "step": 444 + }, + { + "epoch": 0.056608573972777, + "ewc_loss": 0.003672245657071471, + "ewc_loss_diag": 2.726912498474121e-06, + "ewc_loss_parallel": 9.40922382142162e-06, + "grad_norm": 3.7004683017730713, + "learning_rate": 1.8821534548537516e-07, + "loss": 0.6037, + "mean_token_accuracy": 0.8133450746536255, + "num_tokens": 16938945.0, + "step": 445 + }, + { + "epoch": 0.05673578425136751, + "ewc_loss": 0.003682073438540101, + "ewc_loss_diag": 2.726912498474121e-06, + "ewc_loss_parallel": 9.507501999905799e-06, + "grad_norm": 3.190620183944702, + "learning_rate": 1.88639253921153e-07, + "loss": 0.5625, + "mean_token_accuracy": 0.8267725706100464, + "num_tokens": 16982214.0, + "step": 446 + }, + { + "epoch": 0.05686299452995802, + "ewc_loss": 0.003695039777085185, + "ewc_loss_diag": 2.7567148208618164e-06, + "ewc_loss_parallel": 9.331989531347062e-06, + "grad_norm": 3.8115129470825195, + "learning_rate": 1.890631623569309e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8216222524642944, + "num_tokens": 17024367.0, + "step": 447 + }, + { + "epoch": 0.05699020480854853, + "ewc_loss": 0.0037336815148591995, + "ewc_loss_diag": 2.771615982055664e-06, + "ewc_loss_parallel": 9.56581970967818e-06, + "grad_norm": 3.781249761581421, + "learning_rate": 1.8948707079270875e-07, + "loss": 0.593, + "mean_token_accuracy": 0.8115103244781494, + "num_tokens": 17057451.0, + "step": 448 + }, + { + "epoch": 0.057117415087139044, + "ewc_loss": 0.003743296954780817, + "ewc_loss_diag": 2.771615982055664e-06, + "ewc_loss_parallel": 9.66197421803372e-06, + "grad_norm": 4.0288238525390625, + "learning_rate": 1.8991097922848665e-07, + "loss": 0.5446, + "mean_token_accuracy": 0.8286401033401489, + "num_tokens": 17094633.0, + "step": 449 + }, + { + "epoch": 0.05724462536572955, + "ewc_loss": 0.003771568415686488, + "ewc_loss_diag": 2.7865171432495117e-06, + "ewc_loss_parallel": 9.792100172489882e-06, + "grad_norm": 3.847322940826416, + "learning_rate": 1.903348876642645e-07, + "loss": 0.599, + "mean_token_accuracy": 0.8110698461532593, + "num_tokens": 17135892.0, + "step": 450 + }, + { + "epoch": 0.05737183564432006, + "ewc_loss": 0.0037647192366421223, + "ewc_loss_diag": 2.7865171432495117e-06, + "ewc_loss_parallel": 9.723608854983468e-06, + "grad_norm": 3.6336541175842285, + "learning_rate": 1.907587961000424e-07, + "loss": 0.5421, + "mean_token_accuracy": 0.8315454125404358, + "num_tokens": 17173881.0, + "step": 451 + }, + { + "epoch": 0.05749904592291057, + "ewc_loss": 0.0037523286882787943, + "ewc_loss_diag": 2.7865171432495117e-06, + "ewc_loss_parallel": 9.599702934792731e-06, + "grad_norm": 4.2113728523254395, + "learning_rate": 1.9118270453582024e-07, + "loss": 0.5912, + "mean_token_accuracy": 0.8168498873710632, + "num_tokens": 17215668.0, + "step": 452 + }, + { + "epoch": 0.05762625620150108, + "ewc_loss": 0.003785103792324662, + "ewc_loss_diag": 2.8014183044433594e-06, + "ewc_loss_parallel": 9.774866157385986e-06, + "grad_norm": 3.622321128845215, + "learning_rate": 1.9160661297159814e-07, + "loss": 0.6141, + "mean_token_accuracy": 0.8112578392028809, + "num_tokens": 17254339.0, + "step": 453 + }, + { + "epoch": 0.057753466480091595, + "ewc_loss": 0.00377149716950953, + "ewc_loss_diag": 2.8014183044433594e-06, + "ewc_loss_parallel": 9.638800293032546e-06, + "grad_norm": 3.2508766651153564, + "learning_rate": 1.9203052140737599e-07, + "loss": 0.5996, + "mean_token_accuracy": 0.811809778213501, + "num_tokens": 17288858.0, + "step": 454 + }, + { + "epoch": 0.0578806767586821, + "ewc_loss": 0.0037831312511116266, + "ewc_loss_diag": 2.8312206268310547e-06, + "ewc_loss_parallel": 9.44996554608224e-06, + "grad_norm": 3.4461824893951416, + "learning_rate": 1.9245442984315389e-07, + "loss": 0.5761, + "mean_token_accuracy": 0.8231080770492554, + "num_tokens": 17330601.0, + "step": 455 + }, + { + "epoch": 0.05800788703727261, + "ewc_loss": 0.0038081062957644463, + "ewc_loss_diag": 2.8461217880249023e-06, + "ewc_loss_parallel": 9.54712595557794e-06, + "grad_norm": 3.3370327949523926, + "learning_rate": 1.9287833827893173e-07, + "loss": 0.5958, + "mean_token_accuracy": 0.8140740394592285, + "num_tokens": 17367797.0, + "step": 456 + }, + { + "epoch": 0.058135097315863124, + "ewc_loss": 0.003824983723461628, + "ewc_loss_diag": 2.86102294921875e-06, + "ewc_loss_parallel": 9.563314051774796e-06, + "grad_norm": 3.578232765197754, + "learning_rate": 1.9330224671470963e-07, + "loss": 0.5813, + "mean_token_accuracy": 0.8185141682624817, + "num_tokens": 17403206.0, + "step": 457 + }, + { + "epoch": 0.05826230759445363, + "ewc_loss": 0.0038580219261348248, + "ewc_loss_diag": 2.8908252716064453e-06, + "ewc_loss_parallel": 9.74110844254028e-06, + "grad_norm": 3.5433411598205566, + "learning_rate": 1.9372615515048748e-07, + "loss": 0.6269, + "mean_token_accuracy": 0.8080592155456543, + "num_tokens": 17437207.0, + "step": 458 + }, + { + "epoch": 0.05838951787304414, + "ewc_loss": 0.0038630403578281403, + "ewc_loss_diag": 2.8908252716064453e-06, + "ewc_loss_parallel": 9.791293450689409e-06, + "grad_norm": 3.3888590335845947, + "learning_rate": 1.9415006358626535e-07, + "loss": 0.5632, + "mean_token_accuracy": 0.826406717300415, + "num_tokens": 17473335.0, + "step": 459 + }, + { + "epoch": 0.05851672815163465, + "ewc_loss": 0.003891396801918745, + "ewc_loss_diag": 2.9206275939941406e-06, + "ewc_loss_parallel": 9.769680218596477e-06, + "grad_norm": 3.767077922821045, + "learning_rate": 1.9457397202204322e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8234442472457886, + "num_tokens": 17505104.0, + "step": 460 + }, + { + "epoch": 0.05864393843022516, + "ewc_loss": 0.003909816034138203, + "ewc_loss_diag": 2.9206275939941406e-06, + "ewc_loss_parallel": 9.953872904588934e-06, + "grad_norm": 3.2619426250457764, + "learning_rate": 1.949978804578211e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8333722352981567, + "num_tokens": 17539052.0, + "step": 461 + }, + { + "epoch": 0.058771148708815675, + "ewc_loss": 0.003941853065043688, + "ewc_loss_diag": 2.9653310775756836e-06, + "ewc_loss_parallel": 9.816480087465607e-06, + "grad_norm": 3.2837512493133545, + "learning_rate": 1.9542178889359897e-07, + "loss": 0.6109, + "mean_token_accuracy": 0.8093475699424744, + "num_tokens": 17578942.0, + "step": 462 + }, + { + "epoch": 0.05889835898740618, + "ewc_loss": 0.003927131649106741, + "ewc_loss_diag": 2.950429916381836e-06, + "ewc_loss_parallel": 9.82185247266898e-06, + "grad_norm": 3.6794798374176025, + "learning_rate": 1.9584569732937684e-07, + "loss": 0.5884, + "mean_token_accuracy": 0.8206812143325806, + "num_tokens": 17611931.0, + "step": 463 + }, + { + "epoch": 0.05902556926599669, + "ewc_loss": 0.003978678956627846, + "ewc_loss_diag": 2.9802322387695312e-06, + "ewc_loss_parallel": 1.003215311357053e-05, + "grad_norm": 3.2910921573638916, + "learning_rate": 1.962696057651547e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8446999788284302, + "num_tokens": 17648764.0, + "step": 464 + }, + { + "epoch": 0.059152779544587204, + "ewc_loss": 0.003987327218055725, + "ewc_loss_diag": 2.995133399963379e-06, + "ewc_loss_parallel": 9.966047400666866e-06, + "grad_norm": 3.843306064605713, + "learning_rate": 1.9669351420093258e-07, + "loss": 0.6052, + "mean_token_accuracy": 0.8114592432975769, + "num_tokens": 17683099.0, + "step": 465 + }, + { + "epoch": 0.05927998982317771, + "ewc_loss": 0.003995247185230255, + "ewc_loss_diag": 2.9802322387695312e-06, + "ewc_loss_parallel": 1.0197833034908399e-05, + "grad_norm": 3.303701162338257, + "learning_rate": 1.9711742263671046e-07, + "loss": 0.5909, + "mean_token_accuracy": 0.8202639222145081, + "num_tokens": 17727120.0, + "step": 466 + }, + { + "epoch": 0.059407200101768226, + "ewc_loss": 0.004010588396340609, + "ewc_loss_diag": 3.0100345611572266e-06, + "ewc_loss_parallel": 1.0046068382507656e-05, + "grad_norm": 3.5794692039489746, + "learning_rate": 1.9754133107248833e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8239496946334839, + "num_tokens": 17761676.0, + "step": 467 + }, + { + "epoch": 0.059534410380358734, + "ewc_loss": 0.004035744816064835, + "ewc_loss_diag": 3.0249357223510742e-06, + "ewc_loss_parallel": 1.01450468719122e-05, + "grad_norm": 3.1495859622955322, + "learning_rate": 1.979652395082662e-07, + "loss": 0.6124, + "mean_token_accuracy": 0.8117572069168091, + "num_tokens": 17802311.0, + "step": 468 + }, + { + "epoch": 0.05966162065894924, + "ewc_loss": 0.004025219473987818, + "ewc_loss_diag": 3.0249357223510742e-06, + "ewc_loss_parallel": 1.0039791050076019e-05, + "grad_norm": 3.2222094535827637, + "learning_rate": 1.9838914794404408e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8385236263275146, + "num_tokens": 17839048.0, + "step": 469 + }, + { + "epoch": 0.059788830937539755, + "ewc_loss": 0.004041492938995361, + "ewc_loss_diag": 3.039836883544922e-06, + "ewc_loss_parallel": 1.0049936463474296e-05, + "grad_norm": 3.548983097076416, + "learning_rate": 1.9881305637982195e-07, + "loss": 0.5758, + "mean_token_accuracy": 0.8207939863204956, + "num_tokens": 17881510.0, + "step": 470 + }, + { + "epoch": 0.05991604121613026, + "ewc_loss": 0.004062256310135126, + "ewc_loss_diag": 3.039836883544922e-06, + "ewc_loss_parallel": 1.0257572284899652e-05, + "grad_norm": 3.9105687141418457, + "learning_rate": 1.9923696481559982e-07, + "loss": 0.5997, + "mean_token_accuracy": 0.8154521584510803, + "num_tokens": 17915080.0, + "step": 471 + }, + { + "epoch": 0.06004325149472077, + "ewc_loss": 0.004099849611520767, + "ewc_loss_diag": 3.0547380447387695e-06, + "ewc_loss_parallel": 1.0480918717803434e-05, + "grad_norm": 4.208763599395752, + "learning_rate": 1.996608732513777e-07, + "loss": 0.633, + "mean_token_accuracy": 0.8048017621040344, + "num_tokens": 17957972.0, + "step": 472 + }, + { + "epoch": 0.060170461773311285, + "ewc_loss": 0.0041324784979224205, + "ewc_loss_diag": 3.069639205932617e-06, + "ewc_loss_parallel": 1.0654616744432133e-05, + "grad_norm": 3.7945609092712402, + "learning_rate": 2.0008478168715557e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.8380421996116638, + "num_tokens": 17998008.0, + "step": 473 + }, + { + "epoch": 0.06029767205190179, + "ewc_loss": 0.004114673472940922, + "ewc_loss_diag": 3.069639205932617e-06, + "ewc_loss_parallel": 1.0476565876160748e-05, + "grad_norm": 3.4469146728515625, + "learning_rate": 2.0050869012293344e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8352755308151245, + "num_tokens": 18032427.0, + "step": 474 + }, + { + "epoch": 0.060424882330492306, + "ewc_loss": 0.00410448107868433, + "ewc_loss_diag": 3.084540367126465e-06, + "ewc_loss_parallel": 1.022205651679542e-05, + "grad_norm": 2.9447643756866455, + "learning_rate": 2.009325985587113e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8291422128677368, + "num_tokens": 18069848.0, + "step": 475 + }, + { + "epoch": 0.060552092609082814, + "ewc_loss": 0.004075164906680584, + "ewc_loss_diag": 3.084540367126465e-06, + "ewc_loss_parallel": 9.928896361088846e-06, + "grad_norm": 3.4730727672576904, + "learning_rate": 2.0135650699448918e-07, + "loss": 0.6343, + "mean_token_accuracy": 0.8026447892189026, + "num_tokens": 18109240.0, + "step": 476 + }, + { + "epoch": 0.06067930288767332, + "ewc_loss": 0.004119587130844593, + "ewc_loss_diag": 3.0994415283203125e-06, + "ewc_loss_parallel": 1.022052856569644e-05, + "grad_norm": 3.451005220413208, + "learning_rate": 2.0178041543026706e-07, + "loss": 0.6045, + "mean_token_accuracy": 0.8140186071395874, + "num_tokens": 18146849.0, + "step": 477 + }, + { + "epoch": 0.060806513166263836, + "ewc_loss": 0.004133386537432671, + "ewc_loss_diag": 3.0994415283203125e-06, + "ewc_loss_parallel": 1.0358523468312342e-05, + "grad_norm": 3.368396759033203, + "learning_rate": 2.022043238660449e-07, + "loss": 0.6348, + "mean_token_accuracy": 0.8019322752952576, + "num_tokens": 18186872.0, + "step": 478 + }, + { + "epoch": 0.06093372344485434, + "ewc_loss": 0.004144738893955946, + "ewc_loss_diag": 3.11434268951416e-06, + "ewc_loss_parallel": 1.0319460670871194e-05, + "grad_norm": 3.1146676540374756, + "learning_rate": 2.026282323018228e-07, + "loss": 0.61, + "mean_token_accuracy": 0.8120254278182983, + "num_tokens": 18222299.0, + "step": 479 + }, + { + "epoch": 0.06106093372344486, + "ewc_loss": 0.0041636573150753975, + "ewc_loss_diag": 3.1441450119018555e-06, + "ewc_loss_parallel": 1.0203465535596479e-05, + "grad_norm": 3.2843241691589355, + "learning_rate": 2.0305214073760065e-07, + "loss": 0.614, + "mean_token_accuracy": 0.812829852104187, + "num_tokens": 18261645.0, + "step": 480 + }, + { + "epoch": 0.061188144002035365, + "ewc_loss": 0.004146510269492865, + "ewc_loss_diag": 3.11434268951416e-06, + "ewc_loss_parallel": 1.0337173989682924e-05, + "grad_norm": 3.233354091644287, + "learning_rate": 2.0347604917337855e-07, + "loss": 0.5846, + "mean_token_accuracy": 0.8176648616790771, + "num_tokens": 18292970.0, + "step": 481 + }, + { + "epoch": 0.06131535428062587, + "ewc_loss": 0.004151279106736183, + "ewc_loss_diag": 3.11434268951416e-06, + "ewc_loss_parallel": 1.038486425386509e-05, + "grad_norm": 3.0287647247314453, + "learning_rate": 2.038999576091564e-07, + "loss": 0.5326, + "mean_token_accuracy": 0.8350140452384949, + "num_tokens": 18331133.0, + "step": 482 + }, + { + "epoch": 0.06144256455921639, + "ewc_loss": 0.004159999545663595, + "ewc_loss_diag": 3.129243850708008e-06, + "ewc_loss_parallel": 1.031947886076523e-05, + "grad_norm": 3.1917834281921387, + "learning_rate": 2.043238660449343e-07, + "loss": 0.6184, + "mean_token_accuracy": 0.8088880777359009, + "num_tokens": 18368905.0, + "step": 483 + }, + { + "epoch": 0.061569774837806894, + "ewc_loss": 0.004176334012299776, + "ewc_loss_diag": 3.129243850708008e-06, + "ewc_loss_parallel": 1.0482822290214244e-05, + "grad_norm": 3.407635450363159, + "learning_rate": 2.0474777448071214e-07, + "loss": 0.5436, + "mean_token_accuracy": 0.832217812538147, + "num_tokens": 18406362.0, + "step": 484 + }, + { + "epoch": 0.0616969851163974, + "ewc_loss": 0.0042273676954209805, + "ewc_loss_diag": 3.159046173095703e-06, + "ewc_loss_parallel": 1.068798519554548e-05, + "grad_norm": 3.1286427974700928, + "learning_rate": 2.0517168291649004e-07, + "loss": 0.5792, + "mean_token_accuracy": 0.8208247423171997, + "num_tokens": 18439246.0, + "step": 485 + }, + { + "epoch": 0.061824195394987916, + "ewc_loss": 0.004231848753988743, + "ewc_loss_diag": 3.1739473342895508e-06, + "ewc_loss_parallel": 1.0580205525911879e-05, + "grad_norm": 3.418315887451172, + "learning_rate": 2.0559559135226788e-07, + "loss": 0.5838, + "mean_token_accuracy": 0.8165900707244873, + "num_tokens": 18473408.0, + "step": 486 + }, + { + "epoch": 0.06195140567357842, + "ewc_loss": 0.004250705242156982, + "ewc_loss_diag": 3.1739473342895508e-06, + "ewc_loss_parallel": 1.0768768333946355e-05, + "grad_norm": 3.125262975692749, + "learning_rate": 2.0601949978804578e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8300761580467224, + "num_tokens": 18513246.0, + "step": 487 + }, + { + "epoch": 0.06207861595216894, + "ewc_loss": 0.004241546615958214, + "ewc_loss_diag": 3.1739473342895508e-06, + "ewc_loss_parallel": 1.0677185855456628e-05, + "grad_norm": 3.5109851360321045, + "learning_rate": 2.0644340822382363e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8384723663330078, + "num_tokens": 18549125.0, + "step": 488 + }, + { + "epoch": 0.062205826230759445, + "ewc_loss": 0.004277925938367844, + "ewc_loss_diag": 3.1888484954833984e-06, + "ewc_loss_parallel": 1.0888392353081144e-05, + "grad_norm": 3.2670164108276367, + "learning_rate": 2.0686731665960153e-07, + "loss": 0.5602, + "mean_token_accuracy": 0.8304277658462524, + "num_tokens": 18585306.0, + "step": 489 + }, + { + "epoch": 0.06233303650934995, + "ewc_loss": 0.0042902142740786076, + "ewc_loss_diag": 3.203749656677246e-06, + "ewc_loss_parallel": 1.0858685527637135e-05, + "grad_norm": 3.227365255355835, + "learning_rate": 2.0729122509537937e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8385130763053894, + "num_tokens": 18624378.0, + "step": 490 + }, + { + "epoch": 0.06246024678794047, + "ewc_loss": 0.004289603792130947, + "ewc_loss_diag": 3.203749656677246e-06, + "ewc_loss_parallel": 1.0852580089704134e-05, + "grad_norm": 3.127368450164795, + "learning_rate": 2.0771513353115727e-07, + "loss": 0.5621, + "mean_token_accuracy": 0.8245905637741089, + "num_tokens": 18660814.0, + "step": 491 + }, + { + "epoch": 0.06258745706653097, + "ewc_loss": 0.004285466391593218, + "ewc_loss_diag": 3.203749656677246e-06, + "ewc_loss_parallel": 1.0811206266225781e-05, + "grad_norm": 2.9019508361816406, + "learning_rate": 2.0813904196693512e-07, + "loss": 0.532, + "mean_token_accuracy": 0.834060549736023, + "num_tokens": 18702129.0, + "step": 492 + }, + { + "epoch": 0.06271466734512149, + "ewc_loss": 0.0042900098487734795, + "ewc_loss_diag": 3.2186508178710938e-06, + "ewc_loss_parallel": 1.0704051419452298e-05, + "grad_norm": 3.5521159172058105, + "learning_rate": 2.0856295040271302e-07, + "loss": 0.6018, + "mean_token_accuracy": 0.8127787709236145, + "num_tokens": 18737496.0, + "step": 493 + }, + { + "epoch": 0.06284187762371199, + "ewc_loss": 0.004369438160210848, + "ewc_loss_diag": 3.248453140258789e-06, + "ewc_loss_parallel": 1.1193159480171744e-05, + "grad_norm": 2.8411049842834473, + "learning_rate": 2.0898685883849086e-07, + "loss": 0.5602, + "mean_token_accuracy": 0.8278156518936157, + "num_tokens": 18779332.0, + "step": 494 + }, + { + "epoch": 0.0629690879023025, + "ewc_loss": 0.004353826399892569, + "ewc_loss_diag": 3.2633543014526367e-06, + "ewc_loss_parallel": 1.0884456969506573e-05, + "grad_norm": 2.8933730125427246, + "learning_rate": 2.0941076727426874e-07, + "loss": 0.5458, + "mean_token_accuracy": 0.8292951583862305, + "num_tokens": 18817976.0, + "step": 495 + }, + { + "epoch": 0.06309629818089302, + "ewc_loss": 0.0043576499447226524, + "ewc_loss_diag": 3.2633543014526367e-06, + "ewc_loss_parallel": 1.0922691217274405e-05, + "grad_norm": 2.6161599159240723, + "learning_rate": 2.098346757100466e-07, + "loss": 0.5368, + "mean_token_accuracy": 0.8308672904968262, + "num_tokens": 18862860.0, + "step": 496 + }, + { + "epoch": 0.06322350845948353, + "ewc_loss": 0.004377068020403385, + "ewc_loss_diag": 3.293156623840332e-06, + "ewc_loss_parallel": 1.0811696483870037e-05, + "grad_norm": 3.3928136825561523, + "learning_rate": 2.1025858414582448e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.8189408779144287, + "num_tokens": 18899025.0, + "step": 497 + }, + { + "epoch": 0.06335071873807403, + "ewc_loss": 0.0044881016947329044, + "ewc_loss_diag": 3.337860107421875e-06, + "ewc_loss_parallel": 1.146426802733913e-05, + "grad_norm": 3.1952431201934814, + "learning_rate": 2.1068249258160238e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8235092759132385, + "num_tokens": 18933118.0, + "step": 498 + }, + { + "epoch": 0.06347792901666455, + "ewc_loss": 0.004491765983402729, + "ewc_loss_diag": 3.337860107421875e-06, + "ewc_loss_parallel": 1.150091065937886e-05, + "grad_norm": 3.3542239665985107, + "learning_rate": 2.1110640101738023e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8314931988716125, + "num_tokens": 18969165.0, + "step": 499 + }, + { + "epoch": 0.06360513929525506, + "ewc_loss": 0.004498458467423916, + "ewc_loss_diag": 3.337860107421875e-06, + "ewc_loss_parallel": 1.1567834008019418e-05, + "grad_norm": 3.3158152103424072, + "learning_rate": 2.1153030945315813e-07, + "loss": 0.5531, + "mean_token_accuracy": 0.8259056210517883, + "num_tokens": 19003882.0, + "step": 500 + }, + { + "epoch": 0.06373234957384556, + "ewc_loss": 0.0045118010602891445, + "ewc_loss_diag": 3.3527612686157227e-06, + "ewc_loss_parallel": 1.154867550212657e-05, + "grad_norm": 3.253206968307495, + "learning_rate": 2.1195421788893597e-07, + "loss": 0.5532, + "mean_token_accuracy": 0.8281538486480713, + "num_tokens": 19037540.0, + "step": 501 + }, + { + "epoch": 0.06385955985243608, + "ewc_loss": 0.0045569841749966145, + "ewc_loss_diag": 3.3974647521972656e-06, + "ewc_loss_parallel": 1.1542743777681608e-05, + "grad_norm": 3.254941463470459, + "learning_rate": 2.1237812632471387e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8191753029823303, + "num_tokens": 19077270.0, + "step": 502 + }, + { + "epoch": 0.06398677013102659, + "ewc_loss": 0.004554252605885267, + "ewc_loss_diag": 3.3974647521972656e-06, + "ewc_loss_parallel": 1.151542801380856e-05, + "grad_norm": 3.2468833923339844, + "learning_rate": 2.1280203476049172e-07, + "loss": 0.6513, + "mean_token_accuracy": 0.7942216396331787, + "num_tokens": 19117579.0, + "step": 503 + }, + { + "epoch": 0.06411398040961709, + "ewc_loss": 0.0045856693759560585, + "ewc_loss_diag": 3.427267074584961e-06, + "ewc_loss_parallel": 1.1524416549946181e-05, + "grad_norm": 2.976172685623169, + "learning_rate": 2.1322594319626962e-07, + "loss": 0.5912, + "mean_token_accuracy": 0.8151499032974243, + "num_tokens": 19156599.0, + "step": 504 + }, + { + "epoch": 0.0642411906882076, + "ewc_loss": 0.00459087872877717, + "ewc_loss_diag": 3.4421682357788086e-06, + "ewc_loss_parallel": 1.1423922842368484e-05, + "grad_norm": 3.579071044921875, + "learning_rate": 2.1364985163204746e-07, + "loss": 0.5821, + "mean_token_accuracy": 0.8176107406616211, + "num_tokens": 19187367.0, + "step": 505 + }, + { + "epoch": 0.06436840096679812, + "ewc_loss": 0.004642495885491371, + "ewc_loss_diag": 3.4570693969726562e-06, + "ewc_loss_parallel": 1.178750790131744e-05, + "grad_norm": 3.196610927581787, + "learning_rate": 2.1407376006782536e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8357176780700684, + "num_tokens": 19223916.0, + "step": 506 + }, + { + "epoch": 0.06449561124538863, + "ewc_loss": 0.0046555716544389725, + "ewc_loss_diag": 3.4868717193603516e-06, + "ewc_loss_parallel": 1.1613086826400831e-05, + "grad_norm": 3.159510612487793, + "learning_rate": 2.144976685036032e-07, + "loss": 0.532, + "mean_token_accuracy": 0.833937406539917, + "num_tokens": 19260336.0, + "step": 507 + }, + { + "epoch": 0.06462282152397913, + "ewc_loss": 0.004669299349188805, + "ewc_loss_diag": 3.516674041748047e-06, + "ewc_loss_parallel": 1.1597780940064695e-05, + "grad_norm": 3.1888558864593506, + "learning_rate": 2.149215769393811e-07, + "loss": 0.5934, + "mean_token_accuracy": 0.8167319893836975, + "num_tokens": 19299895.0, + "step": 508 + }, + { + "epoch": 0.06475003180256965, + "ewc_loss": 0.004671444185078144, + "ewc_loss_diag": 3.516674041748047e-06, + "ewc_loss_parallel": 1.1619225006143097e-05, + "grad_norm": 3.0439605712890625, + "learning_rate": 2.1534548537515895e-07, + "loss": 0.536, + "mean_token_accuracy": 0.830742597579956, + "num_tokens": 19336614.0, + "step": 509 + }, + { + "epoch": 0.06487724208116016, + "ewc_loss": 0.004679559729993343, + "ewc_loss_diag": 3.5315752029418945e-06, + "ewc_loss_parallel": 1.1547794201760553e-05, + "grad_norm": 2.8560597896575928, + "learning_rate": 2.1576939381093685e-07, + "loss": 0.5867, + "mean_token_accuracy": 0.8186249732971191, + "num_tokens": 19378683.0, + "step": 510 + }, + { + "epoch": 0.06500445235975066, + "ewc_loss": 0.004671809263527393, + "ewc_loss_diag": 3.5315752029418945e-06, + "ewc_loss_parallel": 1.1470287063275464e-05, + "grad_norm": 2.8991010189056396, + "learning_rate": 2.161933022467147e-07, + "loss": 0.5703, + "mean_token_accuracy": 0.8191018104553223, + "num_tokens": 19422598.0, + "step": 511 + }, + { + "epoch": 0.06513166263834118, + "ewc_loss": 0.004678045399487019, + "ewc_loss_diag": 3.5315752029418945e-06, + "ewc_loss_parallel": 1.1532653843460139e-05, + "grad_norm": 2.7477118968963623, + "learning_rate": 2.166172106824926e-07, + "loss": 0.5609, + "mean_token_accuracy": 0.8276160955429077, + "num_tokens": 19466972.0, + "step": 512 + }, + { + "epoch": 0.06525887291693169, + "ewc_loss": 0.004695701878517866, + "ewc_loss_diag": 3.546476364135742e-06, + "ewc_loss_parallel": 1.1556629033293575e-05, + "grad_norm": 3.3131494522094727, + "learning_rate": 2.1704111911827044e-07, + "loss": 0.5507, + "mean_token_accuracy": 0.8230851888656616, + "num_tokens": 19503809.0, + "step": 513 + }, + { + "epoch": 0.0653860831955222, + "ewc_loss": 0.004761293530464172, + "ewc_loss_diag": 3.56137752532959e-06, + "ewc_loss_parallel": 1.2059957953169942e-05, + "grad_norm": 2.848200798034668, + "learning_rate": 2.1746502755404831e-07, + "loss": 0.6067, + "mean_token_accuracy": 0.8176411986351013, + "num_tokens": 19545713.0, + "step": 514 + }, + { + "epoch": 0.06551329347411271, + "ewc_loss": 0.0047365291975438595, + "ewc_loss_diag": 3.56137752532959e-06, + "ewc_loss_parallel": 1.1812313459813595e-05, + "grad_norm": 2.9202520847320557, + "learning_rate": 2.178889359898262e-07, + "loss": 0.5931, + "mean_token_accuracy": 0.8189908266067505, + "num_tokens": 19582272.0, + "step": 515 + }, + { + "epoch": 0.06564050375270322, + "ewc_loss": 0.004739431198686361, + "ewc_loss_diag": 3.56137752532959e-06, + "ewc_loss_parallel": 1.1841333616757765e-05, + "grad_norm": 2.815220832824707, + "learning_rate": 2.1831284442560406e-07, + "loss": 0.6084, + "mean_token_accuracy": 0.8128532767295837, + "num_tokens": 19625574.0, + "step": 516 + }, + { + "epoch": 0.06576771403129372, + "ewc_loss": 0.00474054180085659, + "ewc_loss_diag": 3.56137752532959e-06, + "ewc_loss_parallel": 1.1852438547066413e-05, + "grad_norm": 3.1210713386535645, + "learning_rate": 2.1873675286138193e-07, + "loss": 0.5961, + "mean_token_accuracy": 0.8122321367263794, + "num_tokens": 19667791.0, + "step": 517 + }, + { + "epoch": 0.06589492430988424, + "ewc_loss": 0.004767624661326408, + "ewc_loss_diag": 3.56137752532959e-06, + "ewc_loss_parallel": 1.2123269698349759e-05, + "grad_norm": 2.9855220317840576, + "learning_rate": 2.191606612971598e-07, + "loss": 0.6241, + "mean_token_accuracy": 0.8064979314804077, + "num_tokens": 19708289.0, + "step": 518 + }, + { + "epoch": 0.06602213458847475, + "ewc_loss": 0.004796317778527737, + "ewc_loss_diag": 3.591179847717285e-06, + "ewc_loss_parallel": 1.210502341564279e-05, + "grad_norm": 2.8843204975128174, + "learning_rate": 2.1958456973293768e-07, + "loss": 0.6033, + "mean_token_accuracy": 0.8086353540420532, + "num_tokens": 19747444.0, + "step": 519 + }, + { + "epoch": 0.06614934486706527, + "ewc_loss": 0.004792113788425922, + "ewc_loss_diag": 3.591179847717285e-06, + "ewc_loss_parallel": 1.2062984751537442e-05, + "grad_norm": 3.214050531387329, + "learning_rate": 2.2000847816871555e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8408330082893372, + "num_tokens": 19787061.0, + "step": 520 + }, + { + "epoch": 0.06627655514565577, + "ewc_loss": 0.004818917252123356, + "ewc_loss_diag": 3.591179847717285e-06, + "ewc_loss_parallel": 1.2331020116107538e-05, + "grad_norm": 2.6865828037261963, + "learning_rate": 2.2043238660449342e-07, + "loss": 0.534, + "mean_token_accuracy": 0.8342264890670776, + "num_tokens": 19826893.0, + "step": 521 + }, + { + "epoch": 0.06640376542424628, + "ewc_loss": 0.004788665100932121, + "ewc_loss_diag": 3.591179847717285e-06, + "ewc_loss_parallel": 1.2028493983962107e-05, + "grad_norm": 2.866814374923706, + "learning_rate": 2.208562950402713e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8309491872787476, + "num_tokens": 19864285.0, + "step": 522 + }, + { + "epoch": 0.0665309757028368, + "ewc_loss": 0.004800030030310154, + "ewc_loss_diag": 3.591179847717285e-06, + "ewc_loss_parallel": 1.2142147170379758e-05, + "grad_norm": 2.7372219562530518, + "learning_rate": 2.2128020347604917e-07, + "loss": 0.5626, + "mean_token_accuracy": 0.8241504430770874, + "num_tokens": 19903806.0, + "step": 523 + }, + { + "epoch": 0.0666581859814273, + "ewc_loss": 0.004800897091627121, + "ewc_loss_diag": 3.591179847717285e-06, + "ewc_loss_parallel": 1.2150816473877057e-05, + "grad_norm": 2.7562921047210693, + "learning_rate": 2.2170411191182704e-07, + "loss": 0.5398, + "mean_token_accuracy": 0.8291096687316895, + "num_tokens": 19950382.0, + "step": 524 + }, + { + "epoch": 0.06678539626001781, + "ewc_loss": 0.004807163029909134, + "ewc_loss_diag": 3.591179847717285e-06, + "ewc_loss_parallel": 1.22134742923663e-05, + "grad_norm": 2.9386932849884033, + "learning_rate": 2.221280203476049e-07, + "loss": 0.5639, + "mean_token_accuracy": 0.8215228319168091, + "num_tokens": 19984765.0, + "step": 525 + }, + { + "epoch": 0.06691260653860832, + "ewc_loss": 0.004843891132622957, + "ewc_loss_diag": 3.606081008911133e-06, + "ewc_loss_parallel": 1.2428169611666817e-05, + "grad_norm": 2.9236512184143066, + "learning_rate": 2.2255192878338279e-07, + "loss": 0.6036, + "mean_token_accuracy": 0.814327597618103, + "num_tokens": 20021081.0, + "step": 526 + }, + { + "epoch": 0.06703981681719882, + "ewc_loss": 0.004869371652603149, + "ewc_loss_diag": 3.6209821701049805e-06, + "ewc_loss_parallel": 1.2530385902209673e-05, + "grad_norm": 3.283785581588745, + "learning_rate": 2.2297583721916066e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8198806047439575, + "num_tokens": 20055860.0, + "step": 527 + }, + { + "epoch": 0.06716702709578934, + "ewc_loss": 0.004929611925035715, + "ewc_loss_diag": 3.6507844924926758e-06, + "ewc_loss_parallel": 1.2827613318222575e-05, + "grad_norm": 2.791081190109253, + "learning_rate": 2.2339974565493853e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8342296481132507, + "num_tokens": 20095023.0, + "step": 528 + }, + { + "epoch": 0.06729423737437985, + "ewc_loss": 0.004890000447630882, + "ewc_loss_diag": 3.6507844924926758e-06, + "ewc_loss_parallel": 1.2431501090759411e-05, + "grad_norm": 2.911893367767334, + "learning_rate": 2.238236540907164e-07, + "loss": 0.585, + "mean_token_accuracy": 0.8175963163375854, + "num_tokens": 20134620.0, + "step": 529 + }, + { + "epoch": 0.06742144765297035, + "ewc_loss": 0.004898346960544586, + "ewc_loss_diag": 3.6507844924926758e-06, + "ewc_loss_parallel": 1.2514966329035815e-05, + "grad_norm": 3.1334922313690186, + "learning_rate": 2.2424756252649428e-07, + "loss": 0.5551, + "mean_token_accuracy": 0.8236579895019531, + "num_tokens": 20169690.0, + "step": 530 + }, + { + "epoch": 0.06754865793156087, + "ewc_loss": 0.004939364269375801, + "ewc_loss_diag": 3.6656856536865234e-06, + "ewc_loss_parallel": 1.277255159948254e-05, + "grad_norm": 3.215801239013672, + "learning_rate": 2.2467147096227215e-07, + "loss": 0.5473, + "mean_token_accuracy": 0.8282274007797241, + "num_tokens": 20203609.0, + "step": 531 + }, + { + "epoch": 0.06767586821015138, + "ewc_loss": 0.004969012923538685, + "ewc_loss_diag": 3.680586814880371e-06, + "ewc_loss_parallel": 1.2916446394228842e-05, + "grad_norm": 3.2767691612243652, + "learning_rate": 2.2509537939805002e-07, + "loss": 0.5539, + "mean_token_accuracy": 0.8237195014953613, + "num_tokens": 20234616.0, + "step": 532 + }, + { + "epoch": 0.0678030784887419, + "ewc_loss": 0.004961883649230003, + "ewc_loss_diag": 3.6656856536865234e-06, + "ewc_loss_parallel": 1.299774430663092e-05, + "grad_norm": 2.8769614696502686, + "learning_rate": 2.2551928783382787e-07, + "loss": 0.5453, + "mean_token_accuracy": 0.8252891302108765, + "num_tokens": 20271634.0, + "step": 533 + }, + { + "epoch": 0.0679302887673324, + "ewc_loss": 0.004958951845765114, + "ewc_loss_diag": 3.6954879760742188e-06, + "ewc_loss_parallel": 1.2663250345212873e-05, + "grad_norm": 3.068643093109131, + "learning_rate": 2.2594319626960577e-07, + "loss": 0.5731, + "mean_token_accuracy": 0.8237670660018921, + "num_tokens": 20309683.0, + "step": 534 + }, + { + "epoch": 0.06805749904592291, + "ewc_loss": 0.00498946662992239, + "ewc_loss_diag": 3.7103891372680664e-06, + "ewc_loss_parallel": 1.2815808076993562e-05, + "grad_norm": 2.811149835586548, + "learning_rate": 2.263671047053836e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8411378860473633, + "num_tokens": 20346013.0, + "step": 535 + }, + { + "epoch": 0.06818470932451343, + "ewc_loss": 0.0049765631556510925, + "ewc_loss_diag": 3.7103891372680664e-06, + "ewc_loss_parallel": 1.268677442567423e-05, + "grad_norm": 3.46946120262146, + "learning_rate": 2.267910131411615e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8321633338928223, + "num_tokens": 20387613.0, + "step": 536 + }, + { + "epoch": 0.06831191960310393, + "ewc_loss": 0.005044180899858475, + "ewc_loss_diag": 3.725290298461914e-06, + "ewc_loss_parallel": 1.3210365068516694e-05, + "grad_norm": 3.0420985221862793, + "learning_rate": 2.2721492157693936e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8289740681648254, + "num_tokens": 20417579.0, + "step": 537 + }, + { + "epoch": 0.06843912988169444, + "ewc_loss": 0.005021259188652039, + "ewc_loss_diag": 3.725290298461914e-06, + "ewc_loss_parallel": 1.2981147847312968e-05, + "grad_norm": 2.7711167335510254, + "learning_rate": 2.2763883001271726e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8343147039413452, + "num_tokens": 20455100.0, + "step": 538 + }, + { + "epoch": 0.06856634016028496, + "ewc_loss": 0.004992865491658449, + "ewc_loss_diag": 3.725290298461914e-06, + "ewc_loss_parallel": 1.2697210877377074e-05, + "grad_norm": 2.810494899749756, + "learning_rate": 2.280627384484951e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8178305625915527, + "num_tokens": 20496458.0, + "step": 539 + }, + { + "epoch": 0.06869355043887546, + "ewc_loss": 0.005029627121984959, + "ewc_loss_diag": 3.7550926208496094e-06, + "ewc_loss_parallel": 1.2759650417137891e-05, + "grad_norm": 3.258030652999878, + "learning_rate": 2.28486646884273e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.8245247602462769, + "num_tokens": 20533758.0, + "step": 540 + }, + { + "epoch": 0.06882076071746597, + "ewc_loss": 0.00509359547868371, + "ewc_loss_diag": 3.769993782043457e-06, + "ewc_loss_parallel": 1.3246746675577015e-05, + "grad_norm": 3.0839579105377197, + "learning_rate": 2.2891055532005085e-07, + "loss": 0.5969, + "mean_token_accuracy": 0.8161150217056274, + "num_tokens": 20567259.0, + "step": 541 + }, + { + "epoch": 0.06894797099605648, + "ewc_loss": 0.005116741172969341, + "ewc_loss_diag": 3.7997961044311523e-06, + "ewc_loss_parallel": 1.317302758252481e-05, + "grad_norm": 3.0727689266204834, + "learning_rate": 2.2933446375582875e-07, + "loss": 0.5781, + "mean_token_accuracy": 0.821346640586853, + "num_tokens": 20599333.0, + "step": 542 + }, + { + "epoch": 0.06907518127464699, + "ewc_loss": 0.005109344143420458, + "ewc_loss_diag": 3.7997961044311523e-06, + "ewc_loss_parallel": 1.3099055649945512e-05, + "grad_norm": 2.923403024673462, + "learning_rate": 2.297583721916066e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8381874561309814, + "num_tokens": 20640195.0, + "step": 543 + }, + { + "epoch": 0.0692023915532375, + "ewc_loss": 0.005099428817629814, + "ewc_loss_diag": 3.7997961044311523e-06, + "ewc_loss_parallel": 1.2999901628063526e-05, + "grad_norm": 2.8071422576904297, + "learning_rate": 2.301822806273845e-07, + "loss": 0.536, + "mean_token_accuracy": 0.8309260606765747, + "num_tokens": 20682617.0, + "step": 544 + }, + { + "epoch": 0.06932960183182801, + "ewc_loss": 0.00509191770106554, + "ewc_loss_diag": 3.7997961044311523e-06, + "ewc_loss_parallel": 1.2924793736601714e-05, + "grad_norm": 3.107135057449341, + "learning_rate": 2.3060618906316234e-07, + "loss": 0.5873, + "mean_token_accuracy": 0.8188874125480652, + "num_tokens": 20718032.0, + "step": 545 + }, + { + "epoch": 0.06945681211041853, + "ewc_loss": 0.005121465772390366, + "ewc_loss_diag": 3.7997961044311523e-06, + "ewc_loss_parallel": 1.322027401329251e-05, + "grad_norm": 2.9298598766326904, + "learning_rate": 2.3103009749894024e-07, + "loss": 0.5528, + "mean_token_accuracy": 0.8269228935241699, + "num_tokens": 20755190.0, + "step": 546 + }, + { + "epoch": 0.06958402238900903, + "ewc_loss": 0.00512780761346221, + "ewc_loss_diag": 3.814697265625e-06, + "ewc_loss_parallel": 1.3131102605257183e-05, + "grad_norm": 2.492551803588867, + "learning_rate": 2.3145400593471808e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8178929686546326, + "num_tokens": 20799088.0, + "step": 547 + }, + { + "epoch": 0.06971123266759954, + "ewc_loss": 0.005091476254165173, + "ewc_loss_diag": 3.814697265625e-06, + "ewc_loss_parallel": 1.2767787666234653e-05, + "grad_norm": 3.095738649368286, + "learning_rate": 2.3187791437049598e-07, + "loss": 0.6063, + "mean_token_accuracy": 0.8093296885490417, + "num_tokens": 20831819.0, + "step": 548 + }, + { + "epoch": 0.06983844294619006, + "ewc_loss": 0.00516175851225853, + "ewc_loss_diag": 3.814697265625e-06, + "ewc_loss_parallel": 1.3470609701471403e-05, + "grad_norm": 3.2084105014801025, + "learning_rate": 2.3230182280627383e-07, + "loss": 0.5354, + "mean_token_accuracy": 0.8323056101799011, + "num_tokens": 20861291.0, + "step": 549 + }, + { + "epoch": 0.06996565322478056, + "ewc_loss": 0.005183671601116657, + "ewc_loss_diag": 3.814697265625e-06, + "ewc_loss_parallel": 1.3689743354916573e-05, + "grad_norm": 2.662043333053589, + "learning_rate": 2.327257312420517e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8400939106941223, + "num_tokens": 20897446.0, + "step": 550 + }, + { + "epoch": 0.07009286350337107, + "ewc_loss": 0.005131653510034084, + "ewc_loss_diag": 3.814697265625e-06, + "ewc_loss_parallel": 1.3169560588721652e-05, + "grad_norm": 3.063638925552368, + "learning_rate": 2.3314963967782957e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8268373608589172, + "num_tokens": 20934423.0, + "step": 551 + }, + { + "epoch": 0.07022007378196159, + "ewc_loss": 0.005200683604925871, + "ewc_loss_diag": 3.844499588012695e-06, + "ewc_loss_parallel": 1.3554687939176802e-05, + "grad_norm": 2.719611406326294, + "learning_rate": 2.3357354811360745e-07, + "loss": 0.5553, + "mean_token_accuracy": 0.8285188674926758, + "num_tokens": 20978394.0, + "step": 552 + }, + { + "epoch": 0.07034728406055209, + "ewc_loss": 0.005209641996771097, + "ewc_loss_diag": 3.874301910400391e-06, + "ewc_loss_parallel": 1.3339096767595038e-05, + "grad_norm": 3.248955011367798, + "learning_rate": 2.3399745654938532e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8200815320014954, + "num_tokens": 21010669.0, + "step": 553 + }, + { + "epoch": 0.0704744943391426, + "ewc_loss": 0.0052634356543421745, + "ewc_loss_diag": 3.874301910400391e-06, + "ewc_loss_parallel": 1.387703468935797e-05, + "grad_norm": 2.782119035720825, + "learning_rate": 2.344213649851632e-07, + "loss": 0.5498, + "mean_token_accuracy": 0.8294008374214172, + "num_tokens": 21047696.0, + "step": 554 + }, + { + "epoch": 0.07060170461773312, + "ewc_loss": 0.005254702176898718, + "ewc_loss_diag": 3.904104232788086e-06, + "ewc_loss_parallel": 1.3484521332429722e-05, + "grad_norm": 2.7387826442718506, + "learning_rate": 2.3484527342094106e-07, + "loss": 0.6312, + "mean_token_accuracy": 0.8068549633026123, + "num_tokens": 21091437.0, + "step": 555 + }, + { + "epoch": 0.07072891489632362, + "ewc_loss": 0.005248439032584429, + "ewc_loss_diag": 3.904104232788086e-06, + "ewc_loss_parallel": 1.342188897979213e-05, + "grad_norm": 2.785888195037842, + "learning_rate": 2.3526918185671894e-07, + "loss": 0.5921, + "mean_token_accuracy": 0.8161791563034058, + "num_tokens": 21128880.0, + "step": 556 + }, + { + "epoch": 0.07085612517491413, + "ewc_loss": 0.005294754635542631, + "ewc_loss_diag": 3.933906555175781e-06, + "ewc_loss_parallel": 1.357987002847949e-05, + "grad_norm": 2.9560937881469727, + "learning_rate": 2.356930902924968e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8302665948867798, + "num_tokens": 21168523.0, + "step": 557 + }, + { + "epoch": 0.07098333545350465, + "ewc_loss": 0.005317331291735172, + "ewc_loss_diag": 3.933906555175781e-06, + "ewc_loss_parallel": 1.3805635717289988e-05, + "grad_norm": 3.1829347610473633, + "learning_rate": 2.3611699872827468e-07, + "loss": 0.6288, + "mean_token_accuracy": 0.8036680817604065, + "num_tokens": 21211376.0, + "step": 558 + }, + { + "epoch": 0.07111054573209516, + "ewc_loss": 0.005342424381524324, + "ewc_loss_diag": 3.933906555175781e-06, + "ewc_loss_parallel": 1.4056568033993244e-05, + "grad_norm": 2.5988030433654785, + "learning_rate": 2.3654090716405255e-07, + "loss": 0.5623, + "mean_token_accuracy": 0.8233951330184937, + "num_tokens": 21249162.0, + "step": 559 + }, + { + "epoch": 0.07123775601068566, + "ewc_loss": 0.0052848877385258675, + "ewc_loss_diag": 3.933906555175781e-06, + "ewc_loss_parallel": 1.3481200767273549e-05, + "grad_norm": 2.936584711074829, + "learning_rate": 2.3696481559983043e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.8147860169410706, + "num_tokens": 21282351.0, + "step": 560 + }, + { + "epoch": 0.07136496628927617, + "ewc_loss": 0.005320947151631117, + "ewc_loss_diag": 3.933906555175781e-06, + "ewc_loss_parallel": 1.3841796317137778e-05, + "grad_norm": 2.9951088428497314, + "learning_rate": 2.373887240356083e-07, + "loss": 0.5792, + "mean_token_accuracy": 0.8190667629241943, + "num_tokens": 21319508.0, + "step": 561 + }, + { + "epoch": 0.07149217656786669, + "ewc_loss": 0.005335552617907524, + "ewc_loss_diag": 3.933906555175781e-06, + "ewc_loss_parallel": 1.3987848433316685e-05, + "grad_norm": 2.7408297061920166, + "learning_rate": 2.3781263247138617e-07, + "loss": 0.5442, + "mean_token_accuracy": 0.8304173946380615, + "num_tokens": 21357111.0, + "step": 562 + }, + { + "epoch": 0.07161938684645719, + "ewc_loss": 0.00534325186163187, + "ewc_loss_diag": 3.9637088775634766e-06, + "ewc_loss_parallel": 1.3759665307588875e-05, + "grad_norm": 2.7038583755493164, + "learning_rate": 2.3823654090716404e-07, + "loss": 0.5454, + "mean_token_accuracy": 0.825931191444397, + "num_tokens": 21393222.0, + "step": 563 + }, + { + "epoch": 0.0717465971250477, + "ewc_loss": 0.005406002979725599, + "ewc_loss_diag": 4.023313522338867e-06, + "ewc_loss_parallel": 1.3776826563116629e-05, + "grad_norm": 2.655035972595215, + "learning_rate": 2.386604493429419e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8397406935691833, + "num_tokens": 21429595.0, + "step": 564 + }, + { + "epoch": 0.07187380740363822, + "ewc_loss": 0.0054140472784638405, + "ewc_loss_diag": 4.023313522338867e-06, + "ewc_loss_parallel": 1.3857272278983146e-05, + "grad_norm": 2.9762465953826904, + "learning_rate": 2.390843577787198e-07, + "loss": 0.5956, + "mean_token_accuracy": 0.8100783824920654, + "num_tokens": 21467341.0, + "step": 565 + }, + { + "epoch": 0.07200101768222872, + "ewc_loss": 0.005456398241221905, + "ewc_loss_diag": 4.023313522338867e-06, + "ewc_loss_parallel": 1.4280776667874306e-05, + "grad_norm": 2.6080403327941895, + "learning_rate": 2.3950826621449766e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.849816083908081, + "num_tokens": 21510927.0, + "step": 566 + }, + { + "epoch": 0.07212822796081923, + "ewc_loss": 0.005420655943453312, + "ewc_loss_diag": 4.023313522338867e-06, + "ewc_loss_parallel": 1.3923355254519265e-05, + "grad_norm": 2.6856822967529297, + "learning_rate": 2.3993217465027556e-07, + "loss": 0.5498, + "mean_token_accuracy": 0.8288774490356445, + "num_tokens": 21551604.0, + "step": 567 + }, + { + "epoch": 0.07225543823940975, + "ewc_loss": 0.005432005040347576, + "ewc_loss_diag": 4.023313522338867e-06, + "ewc_loss_parallel": 1.4036845641385298e-05, + "grad_norm": 2.9637370109558105, + "learning_rate": 2.403560830860534e-07, + "loss": 0.5996, + "mean_token_accuracy": 0.8170989751815796, + "num_tokens": 21591871.0, + "step": 568 + }, + { + "epoch": 0.07238264851800025, + "ewc_loss": 0.005473284982144833, + "ewc_loss_diag": 4.023313522338867e-06, + "ewc_loss_parallel": 1.4449646187131293e-05, + "grad_norm": 2.9563353061676025, + "learning_rate": 2.4077999152183125e-07, + "loss": 0.5977, + "mean_token_accuracy": 0.8136693239212036, + "num_tokens": 21629810.0, + "step": 569 + }, + { + "epoch": 0.07250985879659076, + "ewc_loss": 0.005509659182280302, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4508212188957259e-05, + "grad_norm": 2.6851813793182373, + "learning_rate": 2.4120389995760915e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8456557989120483, + "num_tokens": 21668706.0, + "step": 570 + }, + { + "epoch": 0.07263706907518128, + "ewc_loss": 0.005477031227201223, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4181933693180326e-05, + "grad_norm": 2.790642023086548, + "learning_rate": 2.41627808393387e-07, + "loss": 0.5396, + "mean_token_accuracy": 0.8283490538597107, + "num_tokens": 21712725.0, + "step": 571 + }, + { + "epoch": 0.07276427935377179, + "ewc_loss": 0.005490874405950308, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.432036606274778e-05, + "grad_norm": 2.8676202297210693, + "learning_rate": 2.420517168291649e-07, + "loss": 0.5507, + "mean_token_accuracy": 0.8302515745162964, + "num_tokens": 21746086.0, + "step": 572 + }, + { + "epoch": 0.07289148963236229, + "ewc_loss": 0.005512346047908068, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.453508320992114e-05, + "grad_norm": 2.8022687435150146, + "learning_rate": 2.4247562526494274e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.8363657593727112, + "num_tokens": 21779573.0, + "step": 573 + }, + { + "epoch": 0.0730186999109528, + "ewc_loss": 0.00550907151773572, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4502335034194402e-05, + "grad_norm": 2.6620535850524902, + "learning_rate": 2.4289953370072064e-07, + "loss": 0.5818, + "mean_token_accuracy": 0.8168144226074219, + "num_tokens": 21818766.0, + "step": 574 + }, + { + "epoch": 0.07314591018954332, + "ewc_loss": 0.0054972730576992035, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4384351743501611e-05, + "grad_norm": 2.7297918796539307, + "learning_rate": 2.433234421364985e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8340635299682617, + "num_tokens": 21855539.0, + "step": 575 + }, + { + "epoch": 0.07327312046813382, + "ewc_loss": 0.005513359326869249, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4545215890393592e-05, + "grad_norm": 2.6584839820861816, + "learning_rate": 2.437473505722764e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8386043310165405, + "num_tokens": 21892567.0, + "step": 576 + }, + { + "epoch": 0.07340033074672433, + "ewc_loss": 0.005514140240848064, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4553024811903015e-05, + "grad_norm": 2.796921730041504, + "learning_rate": 2.4417125900805423e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8279377222061157, + "num_tokens": 21928607.0, + "step": 577 + }, + { + "epoch": 0.07352754102531485, + "ewc_loss": 0.005529122427105904, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4702843145641964e-05, + "grad_norm": 2.792269468307495, + "learning_rate": 2.4459516744383213e-07, + "loss": 0.5693, + "mean_token_accuracy": 0.821999728679657, + "num_tokens": 21963781.0, + "step": 578 + }, + { + "epoch": 0.07365475130390535, + "ewc_loss": 0.005536889191716909, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4780512174183968e-05, + "grad_norm": 2.7024505138397217, + "learning_rate": 2.4501907587961e-07, + "loss": 0.5299, + "mean_token_accuracy": 0.8337200880050659, + "num_tokens": 21999140.0, + "step": 579 + }, + { + "epoch": 0.07378196158249586, + "ewc_loss": 0.00553264981135726, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4738120626134332e-05, + "grad_norm": 2.773829460144043, + "learning_rate": 2.454429843153879e-07, + "loss": 0.6284, + "mean_token_accuracy": 0.8028076887130737, + "num_tokens": 22038703.0, + "step": 580 + }, + { + "epoch": 0.07390917186108638, + "ewc_loss": 0.005537408869713545, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4785708117415197e-05, + "grad_norm": 3.042351007461548, + "learning_rate": 2.458668927511657e-07, + "loss": 0.5436, + "mean_token_accuracy": 0.8219327926635742, + "num_tokens": 22071052.0, + "step": 581 + }, + { + "epoch": 0.07403638213967688, + "ewc_loss": 0.005575221031904221, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.51638341776561e-05, + "grad_norm": 2.705437183380127, + "learning_rate": 2.462908011869436e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.839931845664978, + "num_tokens": 22109629.0, + "step": 582 + }, + { + "epoch": 0.0741635924182674, + "ewc_loss": 0.005535589065402746, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4767511856916826e-05, + "grad_norm": 2.534822463989258, + "learning_rate": 2.4671470962272147e-07, + "loss": 0.5203, + "mean_token_accuracy": 0.8362745046615601, + "num_tokens": 22151544.0, + "step": 583 + }, + { + "epoch": 0.07429080269685791, + "ewc_loss": 0.005519275553524494, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4604379430238623e-05, + "grad_norm": 2.6703755855560303, + "learning_rate": 2.4713861805849937e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.8396883606910706, + "num_tokens": 22189241.0, + "step": 584 + }, + { + "epoch": 0.07441801297544842, + "ewc_loss": 0.005545289721339941, + "ewc_loss_diag": 4.0531158447265625e-06, + "ewc_loss_parallel": 1.4864519471302629e-05, + "grad_norm": 2.545013427734375, + "learning_rate": 2.475625264942772e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8362098932266235, + "num_tokens": 22229138.0, + "step": 585 + }, + { + "epoch": 0.07454522325403892, + "ewc_loss": 0.005571904592216015, + "ewc_loss_diag": 4.082918167114258e-06, + "ewc_loss_parallel": 1.4825488506176043e-05, + "grad_norm": 2.7676002979278564, + "learning_rate": 2.479864349300551e-07, + "loss": 0.5515, + "mean_token_accuracy": 0.8280249834060669, + "num_tokens": 22264930.0, + "step": 586 + }, + { + "epoch": 0.07467243353262944, + "ewc_loss": 0.005631275475025177, + "ewc_loss_diag": 4.112720489501953e-06, + "ewc_loss_parallel": 1.5114022062334698e-05, + "grad_norm": 2.873009204864502, + "learning_rate": 2.4841034336583296e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8377248048782349, + "num_tokens": 22297162.0, + "step": 587 + }, + { + "epoch": 0.07479964381121995, + "ewc_loss": 0.005620215088129044, + "ewc_loss_diag": 4.082918167114258e-06, + "ewc_loss_parallel": 1.5308596630347893e-05, + "grad_norm": 2.7114083766937256, + "learning_rate": 2.488342518016108e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8162283897399902, + "num_tokens": 22334779.0, + "step": 588 + }, + { + "epoch": 0.07492685408981045, + "ewc_loss": 0.005629764869809151, + "ewc_loss_diag": 4.112720489501953e-06, + "ewc_loss_parallel": 1.5098917174327653e-05, + "grad_norm": 2.845000982284546, + "learning_rate": 2.492581602373887e-07, + "loss": 0.5353, + "mean_token_accuracy": 0.8326573967933655, + "num_tokens": 22372820.0, + "step": 589 + }, + { + "epoch": 0.07505406436840097, + "ewc_loss": 0.00564254354685545, + "ewc_loss_diag": 4.112720489501953e-06, + "ewc_loss_parallel": 1.5226704817905556e-05, + "grad_norm": 2.652567148208618, + "learning_rate": 2.4968206867316655e-07, + "loss": 0.5551, + "mean_token_accuracy": 0.8251084089279175, + "num_tokens": 22417841.0, + "step": 590 + }, + { + "epoch": 0.07518127464699148, + "ewc_loss": 0.0056210169568657875, + "ewc_loss_diag": 4.112720489501953e-06, + "ewc_loss_parallel": 1.5011440154921729e-05, + "grad_norm": 2.7437729835510254, + "learning_rate": 2.5010597710894445e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8510193824768066, + "num_tokens": 22449133.0, + "step": 591 + }, + { + "epoch": 0.07530848492558198, + "ewc_loss": 0.0056989435106515884, + "ewc_loss_diag": 4.172325134277344e-06, + "ewc_loss_parallel": 1.5180351510934997e-05, + "grad_norm": 6.953911781311035, + "learning_rate": 2.505298855447223e-07, + "loss": 0.535, + "mean_token_accuracy": 0.8324172496795654, + "num_tokens": 22491145.0, + "step": 592 + }, + { + "epoch": 0.0754356952041725, + "ewc_loss": 0.00595503905788064, + "ewc_loss_diag": 4.112720489501953e-06, + "ewc_loss_parallel": 1.8351660401094705e-05, + "grad_norm": 2.8908939361572266, + "learning_rate": 2.509537939805002e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.8294419050216675, + "num_tokens": 22531738.0, + "step": 593 + }, + { + "epoch": 0.07556290548276301, + "ewc_loss": 0.005822066217660904, + "ewc_loss_diag": 4.202127456665039e-06, + "ewc_loss_parallel": 1.6106401744764298e-05, + "grad_norm": 2.947782039642334, + "learning_rate": 2.513777024162781e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.835753321647644, + "num_tokens": 22563005.0, + "step": 594 + }, + { + "epoch": 0.07569011576135352, + "ewc_loss": 0.005738039966672659, + "ewc_loss_diag": 4.202127456665039e-06, + "ewc_loss_parallel": 1.5266141417669132e-05, + "grad_norm": 2.6264443397521973, + "learning_rate": 2.5180161085205594e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8405832052230835, + "num_tokens": 22601333.0, + "step": 595 + }, + { + "epoch": 0.07581732603994402, + "ewc_loss": 0.005717722699046135, + "ewc_loss_diag": 4.202127456665039e-06, + "ewc_loss_parallel": 1.5062971215229481e-05, + "grad_norm": 2.6045429706573486, + "learning_rate": 2.522255192878338e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.8232861757278442, + "num_tokens": 22639528.0, + "step": 596 + }, + { + "epoch": 0.07594453631853454, + "ewc_loss": 0.005754454992711544, + "ewc_loss_diag": 4.231929779052734e-06, + "ewc_loss_parallel": 1.5125117897696327e-05, + "grad_norm": 2.693627119064331, + "learning_rate": 2.526494277236117e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.828762948513031, + "num_tokens": 22675290.0, + "step": 597 + }, + { + "epoch": 0.07607174659712505, + "ewc_loss": 0.005774312652647495, + "ewc_loss_diag": 4.231929779052734e-06, + "ewc_loss_parallel": 1.532369242340792e-05, + "grad_norm": 2.8120946884155273, + "learning_rate": 2.530733361593896e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8347123861312866, + "num_tokens": 22712823.0, + "step": 598 + }, + { + "epoch": 0.07619895687571555, + "ewc_loss": 0.005789764691144228, + "ewc_loss_diag": 4.231929779052734e-06, + "ewc_loss_parallel": 1.5478211935260333e-05, + "grad_norm": 2.6781156063079834, + "learning_rate": 2.5349724459516743e-07, + "loss": 0.542, + "mean_token_accuracy": 0.8298423290252686, + "num_tokens": 22750916.0, + "step": 599 + }, + { + "epoch": 0.07632616715430607, + "ewc_loss": 0.005770013201981783, + "ewc_loss_diag": 4.231929779052734e-06, + "ewc_loss_parallel": 1.5280696970876306e-05, + "grad_norm": 3.005687952041626, + "learning_rate": 2.539211530309453e-07, + "loss": 0.4938, + "mean_token_accuracy": 0.841137170791626, + "num_tokens": 22782537.0, + "step": 600 + }, + { + "epoch": 0.07645337743289658, + "ewc_loss": 0.005813058000057936, + "ewc_loss_diag": 4.231929779052734e-06, + "ewc_loss_parallel": 1.5711146261310205e-05, + "grad_norm": 3.0424258708953857, + "learning_rate": 2.543450614667232e-07, + "loss": 0.5839, + "mean_token_accuracy": 0.818285346031189, + "num_tokens": 22814805.0, + "step": 601 + }, + { + "epoch": 0.07658058771148708, + "ewc_loss": 0.005849340930581093, + "ewc_loss_diag": 4.26173210144043e-06, + "ewc_loss_parallel": 1.5768800949444994e-05, + "grad_norm": 2.7269110679626465, + "learning_rate": 2.547689699025011e-07, + "loss": 0.5825, + "mean_token_accuracy": 0.8164546489715576, + "num_tokens": 22852654.0, + "step": 602 + }, + { + "epoch": 0.0767077979900776, + "ewc_loss": 0.005838315933942795, + "ewc_loss_diag": 4.291534423828125e-06, + "ewc_loss_parallel": 1.5353376511484385e-05, + "grad_norm": 2.8234992027282715, + "learning_rate": 2.551928783382789e-07, + "loss": 0.536, + "mean_token_accuracy": 0.8285054564476013, + "num_tokens": 22894205.0, + "step": 603 + }, + { + "epoch": 0.07683500826866811, + "ewc_loss": 0.005851395428180695, + "ewc_loss_diag": 4.291534423828125e-06, + "ewc_loss_parallel": 1.5484169125556946e-05, + "grad_norm": 2.7300868034362793, + "learning_rate": 2.5561678677405677e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8384872078895569, + "num_tokens": 22924095.0, + "step": 604 + }, + { + "epoch": 0.07696221854725861, + "ewc_loss": 0.005847651045769453, + "ewc_loss_diag": 4.291534423828125e-06, + "ewc_loss_parallel": 1.5446725228684954e-05, + "grad_norm": 2.6512093544006348, + "learning_rate": 2.5604069520983467e-07, + "loss": 0.5845, + "mean_token_accuracy": 0.8141365647315979, + "num_tokens": 22960857.0, + "step": 605 + }, + { + "epoch": 0.07708942882584913, + "ewc_loss": 0.005840213969349861, + "ewc_loss_diag": 4.291534423828125e-06, + "ewc_loss_parallel": 1.5372355846920982e-05, + "grad_norm": 2.563754081726074, + "learning_rate": 2.564646036456125e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8271360397338867, + "num_tokens": 23004670.0, + "step": 606 + }, + { + "epoch": 0.07721663910443964, + "ewc_loss": 0.0058703431859612465, + "ewc_loss_diag": 4.32133674621582e-06, + "ewc_loss_parallel": 1.536847230454441e-05, + "grad_norm": 2.826997995376587, + "learning_rate": 2.568885120813904e-07, + "loss": 0.5371, + "mean_token_accuracy": 0.8286688327789307, + "num_tokens": 23041999.0, + "step": 607 + }, + { + "epoch": 0.07734384938303016, + "ewc_loss": 0.005912602413445711, + "ewc_loss_diag": 4.32133674621582e-06, + "ewc_loss_parallel": 1.5791063560754992e-05, + "grad_norm": 2.6514689922332764, + "learning_rate": 2.5731242051716826e-07, + "loss": 0.5261, + "mean_token_accuracy": 0.8354514241218567, + "num_tokens": 23080704.0, + "step": 608 + }, + { + "epoch": 0.07747105966162066, + "ewc_loss": 0.005924514029175043, + "ewc_loss_diag": 4.351139068603516e-06, + "ewc_loss_parallel": 1.5605002772645094e-05, + "grad_norm": 2.703855037689209, + "learning_rate": 2.5773632895294616e-07, + "loss": 0.561, + "mean_token_accuracy": 0.8246555924415588, + "num_tokens": 23118662.0, + "step": 609 + }, + { + "epoch": 0.07759826994021117, + "ewc_loss": 0.0059338887222111225, + "ewc_loss_diag": 4.351139068603516e-06, + "ewc_loss_parallel": 1.5698751667514443e-05, + "grad_norm": 2.724449634552002, + "learning_rate": 2.58160237388724e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8429619073867798, + "num_tokens": 23150367.0, + "step": 610 + }, + { + "epoch": 0.07772548021880168, + "ewc_loss": 0.0059454720467329025, + "ewc_loss_diag": 4.351139068603516e-06, + "ewc_loss_parallel": 1.5814581274753436e-05, + "grad_norm": 2.738842248916626, + "learning_rate": 2.585841458245019e-07, + "loss": 0.5421, + "mean_token_accuracy": 0.8285900354385376, + "num_tokens": 23185836.0, + "step": 611 + }, + { + "epoch": 0.07785269049739219, + "ewc_loss": 0.005952848121523857, + "ewc_loss_diag": 4.351139068603516e-06, + "ewc_loss_parallel": 1.5888344933046028e-05, + "grad_norm": 2.5938880443573, + "learning_rate": 2.5900805426027975e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8454991579055786, + "num_tokens": 23219346.0, + "step": 612 + }, + { + "epoch": 0.0779799007759827, + "ewc_loss": 0.005932574160397053, + "ewc_loss_diag": 4.351139068603516e-06, + "ewc_loss_parallel": 1.5685605831095017e-05, + "grad_norm": 2.667797565460205, + "learning_rate": 2.5943196269605765e-07, + "loss": 0.5537, + "mean_token_accuracy": 0.8311323523521423, + "num_tokens": 23257569.0, + "step": 613 + }, + { + "epoch": 0.07810711105457321, + "ewc_loss": 0.005954449065029621, + "ewc_loss_diag": 4.351139068603516e-06, + "ewc_loss_parallel": 1.5904353858786635e-05, + "grad_norm": 2.461846113204956, + "learning_rate": 2.598558711318355e-07, + "loss": 0.5457, + "mean_token_accuracy": 0.8304030895233154, + "num_tokens": 23302944.0, + "step": 614 + }, + { + "epoch": 0.07823432133316371, + "ewc_loss": 0.005936247296631336, + "ewc_loss_diag": 4.351139068603516e-06, + "ewc_loss_parallel": 1.5722338503110223e-05, + "grad_norm": 2.595069646835327, + "learning_rate": 2.602797795676134e-07, + "loss": 0.5516, + "mean_token_accuracy": 0.8260966539382935, + "num_tokens": 23341660.0, + "step": 615 + }, + { + "epoch": 0.07836153161175423, + "ewc_loss": 0.0059927674010396, + "ewc_loss_diag": 4.380941390991211e-06, + "ewc_loss_parallel": 1.5982363038347103e-05, + "grad_norm": 2.7850301265716553, + "learning_rate": 2.6070368800339124e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.8220791220664978, + "num_tokens": 23375956.0, + "step": 616 + }, + { + "epoch": 0.07848874189034474, + "ewc_loss": 0.006031571887433529, + "ewc_loss_diag": 4.380941390991211e-06, + "ewc_loss_parallel": 1.6370404409826733e-05, + "grad_norm": 2.6590700149536133, + "learning_rate": 2.6112759643916914e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.839968204498291, + "num_tokens": 23407568.0, + "step": 617 + }, + { + "epoch": 0.07861595216893524, + "ewc_loss": 0.00603920454159379, + "ewc_loss_diag": 4.410743713378906e-06, + "ewc_loss_parallel": 1.614155735296663e-05, + "grad_norm": 2.717104911804199, + "learning_rate": 2.61551504874947e-07, + "loss": 0.5317, + "mean_token_accuracy": 0.8318856954574585, + "num_tokens": 23440261.0, + "step": 618 + }, + { + "epoch": 0.07874316244752576, + "ewc_loss": 0.006021374370902777, + "ewc_loss_diag": 4.380941390991211e-06, + "ewc_loss_parallel": 1.6268431863863952e-05, + "grad_norm": 2.63165545463562, + "learning_rate": 2.619754133107249e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8377112150192261, + "num_tokens": 23481706.0, + "step": 619 + }, + { + "epoch": 0.07887037272611627, + "ewc_loss": 0.006042882800102234, + "ewc_loss_diag": 4.410743713378906e-06, + "ewc_loss_parallel": 1.6178340956685133e-05, + "grad_norm": 2.784144639968872, + "learning_rate": 2.623993217465028e-07, + "loss": 0.575, + "mean_token_accuracy": 0.815623939037323, + "num_tokens": 23515410.0, + "step": 620 + }, + { + "epoch": 0.07899758300470679, + "ewc_loss": 0.0060681370086967945, + "ewc_loss_diag": 4.410743713378906e-06, + "ewc_loss_parallel": 1.6430882169515826e-05, + "grad_norm": 2.6121115684509277, + "learning_rate": 2.6282323018228063e-07, + "loss": 0.5586, + "mean_token_accuracy": 0.827308714389801, + "num_tokens": 23554627.0, + "step": 621 + }, + { + "epoch": 0.07912479328329729, + "ewc_loss": 0.006076600402593613, + "ewc_loss_diag": 4.470348358154297e-06, + "ewc_loss_parallel": 1.6210340618272312e-05, + "grad_norm": 2.5871920585632324, + "learning_rate": 2.632471386180585e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8156650066375732, + "num_tokens": 23593400.0, + "step": 622 + }, + { + "epoch": 0.0792520035618878, + "ewc_loss": 0.006111597642302513, + "ewc_loss_diag": 4.500150680541992e-06, + "ewc_loss_parallel": 1.6255135051324032e-05, + "grad_norm": 2.7571539878845215, + "learning_rate": 2.6367104705383637e-07, + "loss": 0.58, + "mean_token_accuracy": 0.816495418548584, + "num_tokens": 23628242.0, + "step": 623 + }, + { + "epoch": 0.07937921384047832, + "ewc_loss": 0.006144671700894833, + "ewc_loss_diag": 4.500150680541992e-06, + "ewc_loss_parallel": 1.6585876437602565e-05, + "grad_norm": 2.5198962688446045, + "learning_rate": 2.6409495548961427e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.8256683945655823, + "num_tokens": 23671041.0, + "step": 624 + }, + { + "epoch": 0.07950642411906882, + "ewc_loss": 0.0061121741309762, + "ewc_loss_diag": 4.500150680541992e-06, + "ewc_loss_parallel": 1.6260903066722676e-05, + "grad_norm": 2.829206943511963, + "learning_rate": 2.6451886392539206e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8311895728111267, + "num_tokens": 23701904.0, + "step": 625 + }, + { + "epoch": 0.07963363439765933, + "ewc_loss": 0.006258354522287846, + "ewc_loss_diag": 4.589557647705078e-06, + "ewc_loss_parallel": 1.680717832641676e-05, + "grad_norm": 2.5593018531799316, + "learning_rate": 2.6494277236116996e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8382618427276611, + "num_tokens": 23742374.0, + "step": 626 + }, + { + "epoch": 0.07976084467624985, + "ewc_loss": 0.006190669722855091, + "ewc_loss_diag": 4.559755325317383e-06, + "ewc_loss_parallel": 1.6435507859569043e-05, + "grad_norm": 2.706631898880005, + "learning_rate": 2.6536668079694786e-07, + "loss": 0.6127, + "mean_token_accuracy": 0.8087316751480103, + "num_tokens": 23780216.0, + "step": 627 + }, + { + "epoch": 0.07988805495484035, + "ewc_loss": 0.0062781465239822865, + "ewc_loss_diag": 4.6193599700927734e-06, + "ewc_loss_parallel": 1.669992161623668e-05, + "grad_norm": 2.846949577331543, + "learning_rate": 2.6579058923272576e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8522120714187622, + "num_tokens": 23813641.0, + "step": 628 + }, + { + "epoch": 0.08001526523343086, + "ewc_loss": 0.006301825866103172, + "ewc_loss_diag": 4.6193599700927734e-06, + "ewc_loss_parallel": 1.6936714018811472e-05, + "grad_norm": 2.631166696548462, + "learning_rate": 2.6621449766850356e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.826214075088501, + "num_tokens": 23850189.0, + "step": 629 + }, + { + "epoch": 0.08014247551202137, + "ewc_loss": 0.006266807205975056, + "ewc_loss_diag": 4.6193599700927734e-06, + "ewc_loss_parallel": 1.6586529454798438e-05, + "grad_norm": 2.5893328189849854, + "learning_rate": 2.6663840610428145e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8390181064605713, + "num_tokens": 23884410.0, + "step": 630 + }, + { + "epoch": 0.08026968579061187, + "ewc_loss": 0.006266633048653603, + "ewc_loss_diag": 4.6193599700927734e-06, + "ewc_loss_parallel": 1.6584788681939244e-05, + "grad_norm": 2.4910202026367188, + "learning_rate": 2.6706231454005935e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8366594314575195, + "num_tokens": 23930139.0, + "step": 631 + }, + { + "epoch": 0.08039689606920239, + "ewc_loss": 0.0062618074007332325, + "ewc_loss_diag": 4.6193599700927734e-06, + "ewc_loss_parallel": 1.6536532712052576e-05, + "grad_norm": 2.4287736415863037, + "learning_rate": 2.6748622297583725e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8369813561439514, + "num_tokens": 23973124.0, + "step": 632 + }, + { + "epoch": 0.0805241063477929, + "ewc_loss": 0.006291799247264862, + "ewc_loss_diag": 4.649162292480469e-06, + "ewc_loss_parallel": 1.653127583267633e-05, + "grad_norm": 2.720899820327759, + "learning_rate": 2.6791013141161505e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.83579421043396, + "num_tokens": 24008676.0, + "step": 633 + }, + { + "epoch": 0.08065131662638342, + "ewc_loss": 0.006322498433291912, + "ewc_loss_diag": 4.6193599700927734e-06, + "ewc_loss_parallel": 1.714343852654565e-05, + "grad_norm": 2.901794672012329, + "learning_rate": 2.6833403984739294e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8339544534683228, + "num_tokens": 24044880.0, + "step": 634 + }, + { + "epoch": 0.08077852690497392, + "ewc_loss": 0.0063749048858881, + "ewc_loss_diag": 4.649162292480469e-06, + "ewc_loss_parallel": 1.7362332073389553e-05, + "grad_norm": 2.472381830215454, + "learning_rate": 2.6875794828317084e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8405391573905945, + "num_tokens": 24090140.0, + "step": 635 + }, + { + "epoch": 0.08090573718356443, + "ewc_loss": 0.00630913395434618, + "ewc_loss_diag": 4.649162292480469e-06, + "ewc_loss_parallel": 1.6704618246876635e-05, + "grad_norm": 2.738886594772339, + "learning_rate": 2.6918185671894874e-07, + "loss": 0.596, + "mean_token_accuracy": 0.8163609504699707, + "num_tokens": 24123311.0, + "step": 636 + }, + { + "epoch": 0.08103294746215495, + "ewc_loss": 0.006355405785143375, + "ewc_loss_diag": 4.649162292480469e-06, + "ewc_loss_parallel": 1.716733822831884e-05, + "grad_norm": 2.7094812393188477, + "learning_rate": 2.6960576515472654e-07, + "loss": 0.6306, + "mean_token_accuracy": 0.8019572496414185, + "num_tokens": 24164019.0, + "step": 637 + }, + { + "epoch": 0.08116015774074545, + "ewc_loss": 0.006393025163561106, + "ewc_loss_diag": 4.678964614868164e-06, + "ewc_loss_parallel": 1.723835703160148e-05, + "grad_norm": 2.5914368629455566, + "learning_rate": 2.7002967359050443e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8425393104553223, + "num_tokens": 24205986.0, + "step": 638 + }, + { + "epoch": 0.08128736801933596, + "ewc_loss": 0.006370007060468197, + "ewc_loss_diag": 4.678964614868164e-06, + "ewc_loss_parallel": 1.7008176655508578e-05, + "grad_norm": 2.557645559310913, + "learning_rate": 2.7045358202628233e-07, + "loss": 0.5548, + "mean_token_accuracy": 0.822283923625946, + "num_tokens": 24251956.0, + "step": 639 + }, + { + "epoch": 0.08141457829792648, + "ewc_loss": 0.006336970254778862, + "ewc_loss_diag": 4.649162292480469e-06, + "ewc_loss_parallel": 1.698298547125887e-05, + "grad_norm": 2.6110363006591797, + "learning_rate": 2.7087749046206023e-07, + "loss": 0.502, + "mean_token_accuracy": 0.8388174772262573, + "num_tokens": 24284990.0, + "step": 640 + }, + { + "epoch": 0.08154178857651698, + "ewc_loss": 0.006355311721563339, + "ewc_loss_diag": 4.649162292480469e-06, + "ewc_loss_parallel": 1.7166395991807804e-05, + "grad_norm": 2.6720213890075684, + "learning_rate": 2.71301398897838e-07, + "loss": 0.5577, + "mean_token_accuracy": 0.8233961462974548, + "num_tokens": 24323305.0, + "step": 641 + }, + { + "epoch": 0.08166899885510749, + "ewc_loss": 0.006402531638741493, + "ewc_loss_diag": 4.678964614868164e-06, + "ewc_loss_parallel": 1.733341923682019e-05, + "grad_norm": 2.554684638977051, + "learning_rate": 2.717253073336159e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8488264083862305, + "num_tokens": 24365847.0, + "step": 642 + }, + { + "epoch": 0.081796209133698, + "ewc_loss": 0.00634837057441473, + "ewc_loss_diag": 4.649162292480469e-06, + "ewc_loss_parallel": 1.70969869941473e-05, + "grad_norm": 2.547968864440918, + "learning_rate": 2.721492157693938e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8477078676223755, + "num_tokens": 24407269.0, + "step": 643 + }, + { + "epoch": 0.0819234194122885, + "ewc_loss": 0.006383213214576244, + "ewc_loss_diag": 4.678964614868164e-06, + "ewc_loss_parallel": 1.7140238924184814e-05, + "grad_norm": 2.6084587574005127, + "learning_rate": 2.7257312420517167e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8313010334968567, + "num_tokens": 24444990.0, + "step": 644 + }, + { + "epoch": 0.08205062969087902, + "ewc_loss": 0.006398245692253113, + "ewc_loss_diag": 4.678964614868164e-06, + "ewc_loss_parallel": 1.729056384647265e-05, + "grad_norm": 2.484252452850342, + "learning_rate": 2.729970326409495e-07, + "loss": 0.5615, + "mean_token_accuracy": 0.8252276182174683, + "num_tokens": 24487293.0, + "step": 645 + }, + { + "epoch": 0.08217783996946953, + "ewc_loss": 0.006439918652176857, + "ewc_loss_diag": 4.738569259643555e-06, + "ewc_loss_parallel": 1.7096937881433405e-05, + "grad_norm": 2.7102766036987305, + "learning_rate": 2.734209410767274e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.8345855474472046, + "num_tokens": 24521403.0, + "step": 646 + }, + { + "epoch": 0.08230505024806005, + "ewc_loss": 0.006487012840807438, + "ewc_loss_diag": 4.738569259643555e-06, + "ewc_loss_parallel": 1.756788515194785e-05, + "grad_norm": 2.614475727081299, + "learning_rate": 2.738448495125053e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8337790369987488, + "num_tokens": 24562740.0, + "step": 647 + }, + { + "epoch": 0.08243226052665055, + "ewc_loss": 0.006501801311969757, + "ewc_loss_diag": 4.76837158203125e-06, + "ewc_loss_parallel": 1.7410591681255028e-05, + "grad_norm": 2.6987340450286865, + "learning_rate": 2.7426875794828316e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8254110217094421, + "num_tokens": 24597507.0, + "step": 648 + }, + { + "epoch": 0.08255947080524106, + "ewc_loss": 0.0065137241035699844, + "ewc_loss_diag": 4.76837158203125e-06, + "ewc_loss_parallel": 1.752981734171044e-05, + "grad_norm": 2.6022305488586426, + "learning_rate": 2.74692666384061e-07, + "loss": 0.503, + "mean_token_accuracy": 0.8379186391830444, + "num_tokens": 24635629.0, + "step": 649 + }, + { + "epoch": 0.08268668108383158, + "ewc_loss": 0.006506957113742828, + "ewc_loss_diag": 4.76837158203125e-06, + "ewc_loss_parallel": 1.7462147297919728e-05, + "grad_norm": 2.534175157546997, + "learning_rate": 2.751165748198389e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8307693600654602, + "num_tokens": 24676087.0, + "step": 650 + }, + { + "epoch": 0.08281389136242208, + "ewc_loss": 0.006501343101263046, + "ewc_loss_diag": 4.76837158203125e-06, + "ewc_loss_parallel": 1.7406009646947496e-05, + "grad_norm": 2.6734931468963623, + "learning_rate": 2.755404832556168e-07, + "loss": 0.5452, + "mean_token_accuracy": 0.8276430368423462, + "num_tokens": 24713652.0, + "step": 651 + }, + { + "epoch": 0.0829411016410126, + "ewc_loss": 0.006628274451941252, + "ewc_loss_diag": 4.857778549194336e-06, + "ewc_loss_parallel": 1.775979580997955e-05, + "grad_norm": 2.6798336505889893, + "learning_rate": 2.7596439169139465e-07, + "loss": 0.5289, + "mean_token_accuracy": 0.834001898765564, + "num_tokens": 24750130.0, + "step": 652 + }, + { + "epoch": 0.08306831191960311, + "ewc_loss": 0.006656446494162083, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.773634176061023e-05, + "grad_norm": 2.6879658699035645, + "learning_rate": 2.763883001271725e-07, + "loss": 0.5781, + "mean_token_accuracy": 0.8180346488952637, + "num_tokens": 24787337.0, + "step": 653 + }, + { + "epoch": 0.08319552219819361, + "ewc_loss": 0.006656257435679436, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.7734451830619946e-05, + "grad_norm": 2.8353145122528076, + "learning_rate": 2.768122085629504e-07, + "loss": 0.5704, + "mean_token_accuracy": 0.8145996332168579, + "num_tokens": 24822736.0, + "step": 654 + }, + { + "epoch": 0.08332273247678412, + "ewc_loss": 0.00671259593218565, + "ewc_loss_diag": 4.9173831939697266e-06, + "ewc_loss_parallel": 1.7992659195442684e-05, + "grad_norm": 2.7386302947998047, + "learning_rate": 2.772361169987283e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8402331471443176, + "num_tokens": 24858743.0, + "step": 655 + }, + { + "epoch": 0.08344994275537464, + "ewc_loss": 0.006667397450655699, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.78458503796719e-05, + "grad_norm": 2.570638418197632, + "learning_rate": 2.7766002543450614e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8325998783111572, + "num_tokens": 24901286.0, + "step": 656 + }, + { + "epoch": 0.08357715303396514, + "ewc_loss": 0.006637755315750837, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.7549429685459472e-05, + "grad_norm": 2.647562265396118, + "learning_rate": 2.78083933870284e-07, + "loss": 0.5884, + "mean_token_accuracy": 0.8171637654304504, + "num_tokens": 24943457.0, + "step": 657 + }, + { + "epoch": 0.08370436331255565, + "ewc_loss": 0.0066632432863116264, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.7804310118663125e-05, + "grad_norm": 2.765723705291748, + "learning_rate": 2.785078423060619e-07, + "loss": 0.5289, + "mean_token_accuracy": 0.8304177522659302, + "num_tokens": 24979247.0, + "step": 658 + }, + { + "epoch": 0.08383157359114617, + "ewc_loss": 0.00668887747451663, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.8060651200357825e-05, + "grad_norm": 2.6087234020233154, + "learning_rate": 2.789317507418398e-07, + "loss": 0.5423, + "mean_token_accuracy": 0.8280234932899475, + "num_tokens": 25017456.0, + "step": 659 + }, + { + "epoch": 0.08395878386973668, + "ewc_loss": 0.006690670736134052, + "ewc_loss_diag": 4.9173831939697266e-06, + "ewc_loss_parallel": 1.7773409126675688e-05, + "grad_norm": 2.629517078399658, + "learning_rate": 2.7935565917761763e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.839697003364563, + "num_tokens": 25054682.0, + "step": 660 + }, + { + "epoch": 0.08408599414832718, + "ewc_loss": 0.0066679310984909534, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.7851187294581905e-05, + "grad_norm": 2.8340630531311035, + "learning_rate": 2.797795676133955e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8175370693206787, + "num_tokens": 25092501.0, + "step": 661 + }, + { + "epoch": 0.0842132044269177, + "ewc_loss": 0.006739096716046333, + "ewc_loss_diag": 4.9173831939697266e-06, + "ewc_loss_parallel": 1.8257664123666473e-05, + "grad_norm": 2.5751349925994873, + "learning_rate": 2.802034760491734e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8587406277656555, + "num_tokens": 25129253.0, + "step": 662 + }, + { + "epoch": 0.08434041470550821, + "ewc_loss": 0.0066602290607988834, + "ewc_loss_diag": 4.887580871582031e-06, + "ewc_loss_parallel": 1.7774165826267563e-05, + "grad_norm": 2.4812910556793213, + "learning_rate": 2.806273844849512e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8473666906356812, + "num_tokens": 25169119.0, + "step": 663 + }, + { + "epoch": 0.08446762498409871, + "ewc_loss": 0.006682340055704117, + "ewc_loss_diag": 4.9173831939697266e-06, + "ewc_loss_parallel": 1.769010123098269e-05, + "grad_norm": 2.6541600227355957, + "learning_rate": 2.810512929207291e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8209236860275269, + "num_tokens": 25207654.0, + "step": 664 + }, + { + "epoch": 0.08459483526268922, + "ewc_loss": 0.006760620046406984, + "ewc_loss_diag": 4.947185516357422e-06, + "ewc_loss_parallel": 1.8167722373618744e-05, + "grad_norm": 2.526790142059326, + "learning_rate": 2.8147520135650697e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.839158296585083, + "num_tokens": 25243258.0, + "step": 665 + }, + { + "epoch": 0.08472204554127974, + "ewc_loss": 0.006742131430655718, + "ewc_loss_diag": 4.947185516357422e-06, + "ewc_loss_parallel": 1.7982836652663536e-05, + "grad_norm": 2.5373849868774414, + "learning_rate": 2.8189910979228487e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8325753211975098, + "num_tokens": 25281301.0, + "step": 666 + }, + { + "epoch": 0.08484925581987024, + "ewc_loss": 0.006780270952731371, + "ewc_loss_diag": 4.976987838745117e-06, + "ewc_loss_parallel": 1.805905776564032e-05, + "grad_norm": 2.7062923908233643, + "learning_rate": 2.823230182280627e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8452003002166748, + "num_tokens": 25313500.0, + "step": 667 + }, + { + "epoch": 0.08497646609846075, + "ewc_loss": 0.006850294768810272, + "ewc_loss_diag": 5.0067901611328125e-06, + "ewc_loss_parallel": 1.845412225520704e-05, + "grad_norm": 2.535991668701172, + "learning_rate": 2.827469266638406e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8331999778747559, + "num_tokens": 25355121.0, + "step": 668 + }, + { + "epoch": 0.08510367637705127, + "ewc_loss": 0.006821158807724714, + "ewc_loss_diag": 5.0067901611328125e-06, + "ewc_loss_parallel": 1.8162758351536468e-05, + "grad_norm": 2.585434913635254, + "learning_rate": 2.8317083509961846e-07, + "loss": 0.539, + "mean_token_accuracy": 0.829940140247345, + "num_tokens": 25398868.0, + "step": 669 + }, + { + "epoch": 0.08523088665564178, + "ewc_loss": 0.006867096293717623, + "ewc_loss_diag": 5.036592483520508e-06, + "ewc_loss_parallel": 1.8316957721253857e-05, + "grad_norm": 2.515592098236084, + "learning_rate": 2.8359474353539636e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8345080614089966, + "num_tokens": 25438670.0, + "step": 670 + }, + { + "epoch": 0.08535809693423228, + "ewc_loss": 0.0068655312061309814, + "ewc_loss_diag": 5.036592483520508e-06, + "ewc_loss_parallel": 1.830130713642575e-05, + "grad_norm": 2.6635425090789795, + "learning_rate": 2.840186519711742e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8251276612281799, + "num_tokens": 25479243.0, + "step": 671 + }, + { + "epoch": 0.0854853072128228, + "ewc_loss": 0.006960656959563494, + "ewc_loss_diag": 5.0961971282958984e-06, + "ewc_loss_parallel": 1.8642213035491295e-05, + "grad_norm": 2.8095414638519287, + "learning_rate": 2.844425604069521e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8254145383834839, + "num_tokens": 25514615.0, + "step": 672 + }, + { + "epoch": 0.08561251749141331, + "ewc_loss": 0.006986748427152634, + "ewc_loss_diag": 5.0961971282958984e-06, + "ewc_loss_parallel": 1.890312705654651e-05, + "grad_norm": 2.7922048568725586, + "learning_rate": 2.8486646884272995e-07, + "loss": 0.5254, + "mean_token_accuracy": 0.8326021432876587, + "num_tokens": 25552753.0, + "step": 673 + }, + { + "epoch": 0.08573972777000381, + "ewc_loss": 0.007003795355558395, + "ewc_loss_diag": 5.125999450683594e-06, + "ewc_loss_parallel": 1.8768419977277517e-05, + "grad_norm": 2.652418851852417, + "learning_rate": 2.8529037727850785e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8496137857437134, + "num_tokens": 25587415.0, + "step": 674 + }, + { + "epoch": 0.08586693804859433, + "ewc_loss": 0.007012790534645319, + "ewc_loss_diag": 5.155801773071289e-06, + "ewc_loss_parallel": 1.8553198970039375e-05, + "grad_norm": 2.5327510833740234, + "learning_rate": 2.857142857142857e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.8456611633300781, + "num_tokens": 25625962.0, + "step": 675 + }, + { + "epoch": 0.08599414832718484, + "ewc_loss": 0.006998097524046898, + "ewc_loss_diag": 5.155801773071289e-06, + "ewc_loss_parallel": 1.8406268281978555e-05, + "grad_norm": 2.7790541648864746, + "learning_rate": 2.861381941500636e-07, + "loss": 0.5408, + "mean_token_accuracy": 0.8280887603759766, + "num_tokens": 25669680.0, + "step": 676 + }, + { + "epoch": 0.08612135860577534, + "ewc_loss": 0.007046788930892944, + "ewc_loss_diag": 5.155801773071289e-06, + "ewc_loss_parallel": 1.889318082248792e-05, + "grad_norm": 2.612616777420044, + "learning_rate": 2.8656210258584144e-07, + "loss": 0.5745, + "mean_token_accuracy": 0.8147197961807251, + "num_tokens": 25709221.0, + "step": 677 + }, + { + "epoch": 0.08624856888436586, + "ewc_loss": 0.007019192911684513, + "ewc_loss_diag": 5.155801773071289e-06, + "ewc_loss_parallel": 1.8617223759065382e-05, + "grad_norm": 2.696122884750366, + "learning_rate": 2.869860110216193e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8146566152572632, + "num_tokens": 25741880.0, + "step": 678 + }, + { + "epoch": 0.08637577916295637, + "ewc_loss": 0.007092002779245377, + "ewc_loss_diag": 5.21540641784668e-06, + "ewc_loss_parallel": 1.873496876214631e-05, + "grad_norm": 2.511716365814209, + "learning_rate": 2.874099194573972e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8380110263824463, + "num_tokens": 25786282.0, + "step": 679 + }, + { + "epoch": 0.08650298944154687, + "ewc_loss": 0.007061145734041929, + "ewc_loss_diag": 5.21540641784668e-06, + "ewc_loss_parallel": 1.8426399037707597e-05, + "grad_norm": 2.6509976387023926, + "learning_rate": 2.878338278931751e-07, + "loss": 0.5547, + "mean_token_accuracy": 0.8265821933746338, + "num_tokens": 25820229.0, + "step": 680 + }, + { + "epoch": 0.08663019972013739, + "ewc_loss": 0.007100103422999382, + "ewc_loss_diag": 5.21540641784668e-06, + "ewc_loss_parallel": 1.8815977455233224e-05, + "grad_norm": 2.595766544342041, + "learning_rate": 2.8825773632895293e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8474661111831665, + "num_tokens": 25856465.0, + "step": 681 + }, + { + "epoch": 0.0867574099987279, + "ewc_loss": 0.0070936717092990875, + "ewc_loss_diag": 5.21540641784668e-06, + "ewc_loss_parallel": 1.8751659808913246e-05, + "grad_norm": 2.5516998767852783, + "learning_rate": 2.886816447647308e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8434672355651855, + "num_tokens": 25897407.0, + "step": 682 + }, + { + "epoch": 0.08688462027731841, + "ewc_loss": 0.007119514048099518, + "ewc_loss_diag": 5.245208740234375e-06, + "ewc_loss_parallel": 1.8704906324273907e-05, + "grad_norm": 2.652757406234741, + "learning_rate": 2.891055532005087e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8370634317398071, + "num_tokens": 25938825.0, + "step": 683 + }, + { + "epoch": 0.08701183055590891, + "ewc_loss": 0.007145070470869541, + "ewc_loss_diag": 5.245208740234375e-06, + "ewc_loss_parallel": 1.896046887850389e-05, + "grad_norm": 2.7097184658050537, + "learning_rate": 2.8952946163628657e-07, + "loss": 0.5423, + "mean_token_accuracy": 0.825762927532196, + "num_tokens": 25974944.0, + "step": 684 + }, + { + "epoch": 0.08713904083449943, + "ewc_loss": 0.0071859294548630714, + "ewc_loss_diag": 5.27501106262207e-06, + "ewc_loss_parallel": 1.9063882064074278e-05, + "grad_norm": 2.6537299156188965, + "learning_rate": 2.899533700720644e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8383961319923401, + "num_tokens": 26012864.0, + "step": 685 + }, + { + "epoch": 0.08726625111308994, + "ewc_loss": 0.007175374310463667, + "ewc_loss_diag": 5.27501106262207e-06, + "ewc_loss_parallel": 1.8958331565954722e-05, + "grad_norm": 2.6045801639556885, + "learning_rate": 2.9037727850784227e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.826172947883606, + "num_tokens": 26049779.0, + "step": 686 + }, + { + "epoch": 0.08739346139168044, + "ewc_loss": 0.007169805467128754, + "ewc_loss_diag": 5.27501106262207e-06, + "ewc_loss_parallel": 1.8902643205365166e-05, + "grad_norm": 2.614950180053711, + "learning_rate": 2.9080118694362016e-07, + "loss": 0.5185, + "mean_token_accuracy": 0.8360446691513062, + "num_tokens": 26089095.0, + "step": 687 + }, + { + "epoch": 0.08752067167027096, + "ewc_loss": 0.007172151934355497, + "ewc_loss_diag": 5.27501106262207e-06, + "ewc_loss_parallel": 1.892610998766031e-05, + "grad_norm": 2.4658098220825195, + "learning_rate": 2.9122509537939806e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8363182544708252, + "num_tokens": 26132553.0, + "step": 688 + }, + { + "epoch": 0.08764788194886147, + "ewc_loss": 0.007156402338296175, + "ewc_loss_diag": 5.27501106262207e-06, + "ewc_loss_parallel": 1.8768614609143697e-05, + "grad_norm": 2.6210219860076904, + "learning_rate": 2.916490038151759e-07, + "loss": 0.569, + "mean_token_accuracy": 0.8163555860519409, + "num_tokens": 26171301.0, + "step": 689 + }, + { + "epoch": 0.08777509222745197, + "ewc_loss": 0.007223784923553467, + "ewc_loss_diag": 5.304813385009766e-06, + "ewc_loss_parallel": 1.913726191560272e-05, + "grad_norm": 2.611367702484131, + "learning_rate": 2.9207291225095376e-07, + "loss": 0.5551, + "mean_token_accuracy": 0.8224893808364868, + "num_tokens": 26217871.0, + "step": 690 + }, + { + "epoch": 0.08790230250604249, + "ewc_loss": 0.0072257863357663155, + "ewc_loss_diag": 5.304813385009766e-06, + "ewc_loss_parallel": 1.9157279893988743e-05, + "grad_norm": 2.5362279415130615, + "learning_rate": 2.9249682068673166e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8336071968078613, + "num_tokens": 26255927.0, + "step": 691 + }, + { + "epoch": 0.088029512784633, + "ewc_loss": 0.007213209290057421, + "ewc_loss_diag": 5.304813385009766e-06, + "ewc_loss_parallel": 1.903150769066997e-05, + "grad_norm": 2.521512985229492, + "learning_rate": 2.9292072912250955e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8419111371040344, + "num_tokens": 26296041.0, + "step": 692 + }, + { + "epoch": 0.0881567230632235, + "ewc_loss": 0.007278612814843655, + "ewc_loss_diag": 5.364418029785156e-06, + "ewc_loss_parallel": 1.9075192540185526e-05, + "grad_norm": 2.706256628036499, + "learning_rate": 2.933446375582874e-07, + "loss": 0.5826, + "mean_token_accuracy": 0.8166229724884033, + "num_tokens": 26333785.0, + "step": 693 + }, + { + "epoch": 0.08828393334181402, + "ewc_loss": 0.0073180971667170525, + "ewc_loss_diag": 5.364418029785156e-06, + "ewc_loss_parallel": 1.9470031475066207e-05, + "grad_norm": 2.6026716232299805, + "learning_rate": 2.9376854599406525e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8404359221458435, + "num_tokens": 26368837.0, + "step": 694 + }, + { + "epoch": 0.08841114362040453, + "ewc_loss": 0.0073235780000686646, + "ewc_loss_diag": 5.3942203521728516e-06, + "ewc_loss_parallel": 1.9219663954572752e-05, + "grad_norm": 2.7125144004821777, + "learning_rate": 2.9419245442984315e-07, + "loss": 0.5645, + "mean_token_accuracy": 0.821792721748352, + "num_tokens": 26402151.0, + "step": 695 + }, + { + "epoch": 0.08853835389899505, + "ewc_loss": 0.007316680625081062, + "ewc_loss_diag": 5.364418029785156e-06, + "ewc_loss_parallel": 1.94558688235702e-05, + "grad_norm": 2.6508402824401855, + "learning_rate": 2.9461636286562104e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.832588791847229, + "num_tokens": 26435550.0, + "step": 696 + }, + { + "epoch": 0.08866556417758555, + "ewc_loss": 0.007368593476712704, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.9364642867003568e-05, + "grad_norm": 2.4813971519470215, + "learning_rate": 2.9504027130139884e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.8389729857444763, + "num_tokens": 26479299.0, + "step": 697 + }, + { + "epoch": 0.08879277445617606, + "ewc_loss": 0.007344711571931839, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.9125824110233225e-05, + "grad_norm": 2.5799174308776855, + "learning_rate": 2.9546417973717674e-07, + "loss": 0.5457, + "mean_token_accuracy": 0.8273953199386597, + "num_tokens": 26519380.0, + "step": 698 + }, + { + "epoch": 0.08891998473476657, + "ewc_loss": 0.007378892507404089, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.9467635866021737e-05, + "grad_norm": 2.5616230964660645, + "learning_rate": 2.9588808817295464e-07, + "loss": 0.4713, + "mean_token_accuracy": 0.8472744822502136, + "num_tokens": 26557306.0, + "step": 699 + }, + { + "epoch": 0.08904719501335707, + "ewc_loss": 0.007374372333288193, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.9422435798333026e-05, + "grad_norm": 2.724752426147461, + "learning_rate": 2.9631199660873253e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.82911217212677, + "num_tokens": 26593549.0, + "step": 700 + }, + { + "epoch": 0.08917440529194759, + "ewc_loss": 0.007413368206471205, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.981239256565459e-05, + "grad_norm": 2.615154981613159, + "learning_rate": 2.9673590504451033e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8515609502792358, + "num_tokens": 26629926.0, + "step": 701 + }, + { + "epoch": 0.0893016155705381, + "ewc_loss": 0.007386759389191866, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.9546305338735692e-05, + "grad_norm": 2.5167746543884277, + "learning_rate": 2.9715981348028823e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.8428855538368225, + "num_tokens": 26668652.0, + "step": 702 + }, + { + "epoch": 0.0894288258491286, + "ewc_loss": 0.007378048729151487, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.9459199393168092e-05, + "grad_norm": 2.6463265419006348, + "learning_rate": 2.975837219160661e-07, + "loss": 0.5593, + "mean_token_accuracy": 0.8220473527908325, + "num_tokens": 26708084.0, + "step": 703 + }, + { + "epoch": 0.08955603612771912, + "ewc_loss": 0.007416890934109688, + "ewc_loss_diag": 5.424022674560547e-06, + "ewc_loss_parallel": 1.9847620933433063e-05, + "grad_norm": 2.565347671508789, + "learning_rate": 2.98007630351844e-07, + "loss": 0.5261, + "mean_token_accuracy": 0.8331052660942078, + "num_tokens": 26750134.0, + "step": 704 + }, + { + "epoch": 0.08968324640630963, + "ewc_loss": 0.007460210472345352, + "ewc_loss_diag": 5.4836273193359375e-06, + "ewc_loss_parallel": 1.9670462279464118e-05, + "grad_norm": 2.7165417671203613, + "learning_rate": 2.984315387876218e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8302900195121765, + "num_tokens": 26785410.0, + "step": 705 + }, + { + "epoch": 0.08981045668490013, + "ewc_loss": 0.007491124793887138, + "ewc_loss_diag": 5.4836273193359375e-06, + "ewc_loss_parallel": 1.9979610442533158e-05, + "grad_norm": 2.675349473953247, + "learning_rate": 2.988554472233997e-07, + "loss": 0.5397, + "mean_token_accuracy": 0.8313932418823242, + "num_tokens": 26825226.0, + "step": 706 + }, + { + "epoch": 0.08993766696349065, + "ewc_loss": 0.007485567592084408, + "ewc_loss_diag": 5.4836273193359375e-06, + "ewc_loss_parallel": 1.9924036678276025e-05, + "grad_norm": 2.507127285003662, + "learning_rate": 2.992793556591776e-07, + "loss": 0.5315, + "mean_token_accuracy": 0.8312628865242004, + "num_tokens": 26865401.0, + "step": 707 + }, + { + "epoch": 0.09006487724208116, + "ewc_loss": 0.00745102996006608, + "ewc_loss_diag": 5.4836273193359375e-06, + "ewc_loss_parallel": 1.957865788426716e-05, + "grad_norm": 2.6362087726593018, + "learning_rate": 2.997032640949555e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8301952481269836, + "num_tokens": 26905365.0, + "step": 708 + }, + { + "epoch": 0.09019208752067168, + "ewc_loss": 0.0074937548488378525, + "ewc_loss_diag": 5.4836273193359375e-06, + "ewc_loss_parallel": 2.0005911210319027e-05, + "grad_norm": 2.5880045890808105, + "learning_rate": 3.001271725307333e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8446332216262817, + "num_tokens": 26941348.0, + "step": 709 + }, + { + "epoch": 0.09031929779926218, + "ewc_loss": 0.007516656070947647, + "ewc_loss_diag": 5.513429641723633e-06, + "ewc_loss_parallel": 1.9929741029045545e-05, + "grad_norm": 2.8952929973602295, + "learning_rate": 3.005510809665112e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8353230953216553, + "num_tokens": 26975803.0, + "step": 710 + }, + { + "epoch": 0.09044650807785269, + "ewc_loss": 0.007575712166726589, + "ewc_loss_diag": 5.513429641723633e-06, + "ewc_loss_parallel": 2.052030868071597e-05, + "grad_norm": 2.6440210342407227, + "learning_rate": 3.009749894022891e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8512282371520996, + "num_tokens": 27014931.0, + "step": 711 + }, + { + "epoch": 0.0905737183564432, + "ewc_loss": 0.007514864671975374, + "ewc_loss_diag": 5.513429641723633e-06, + "ewc_loss_parallel": 1.991182944038883e-05, + "grad_norm": 2.6151413917541504, + "learning_rate": 3.01398897838067e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8368490934371948, + "num_tokens": 27055505.0, + "step": 712 + }, + { + "epoch": 0.0907009286350337, + "ewc_loss": 0.007509232498705387, + "ewc_loss_diag": 5.513429641723633e-06, + "ewc_loss_parallel": 1.9855506252497435e-05, + "grad_norm": 2.6113877296447754, + "learning_rate": 3.018228062738448e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.8296002745628357, + "num_tokens": 27092121.0, + "step": 713 + }, + { + "epoch": 0.09082813891362422, + "ewc_loss": 0.007523961365222931, + "ewc_loss_diag": 5.513429641723633e-06, + "ewc_loss_parallel": 2.000279891944956e-05, + "grad_norm": 2.5360634326934814, + "learning_rate": 3.022467147096227e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8448922634124756, + "num_tokens": 27129827.0, + "step": 714 + }, + { + "epoch": 0.09095534919221474, + "ewc_loss": 0.007548147346824408, + "ewc_loss_diag": 5.543231964111328e-06, + "ewc_loss_parallel": 1.9939481717301533e-05, + "grad_norm": 2.462017774581909, + "learning_rate": 3.026706231454006e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.8390816450119019, + "num_tokens": 27174977.0, + "step": 715 + }, + { + "epoch": 0.09108255947080524, + "ewc_loss": 0.007537911646068096, + "ewc_loss_diag": 5.543231964111328e-06, + "ewc_loss_parallel": 1.98371235455852e-05, + "grad_norm": 2.6622323989868164, + "learning_rate": 3.0309453158117844e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.8389180898666382, + "num_tokens": 27211472.0, + "step": 716 + }, + { + "epoch": 0.09120976974939575, + "ewc_loss": 0.007590801455080509, + "ewc_loss_diag": 5.543231964111328e-06, + "ewc_loss_parallel": 2.0366025637486018e-05, + "grad_norm": 2.6363117694854736, + "learning_rate": 3.035184400169563e-07, + "loss": 0.5871, + "mean_token_accuracy": 0.8178431987762451, + "num_tokens": 27249160.0, + "step": 717 + }, + { + "epoch": 0.09133698002798626, + "ewc_loss": 0.007610669359564781, + "ewc_loss_diag": 5.5730342864990234e-06, + "ewc_loss_parallel": 2.0259525626897812e-05, + "grad_norm": 2.7832205295562744, + "learning_rate": 3.039423484527342e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8400900959968567, + "num_tokens": 27287946.0, + "step": 718 + }, + { + "epoch": 0.09146419030657676, + "ewc_loss": 0.007634901441633701, + "ewc_loss_diag": 5.5730342864990234e-06, + "ewc_loss_parallel": 2.0501847757259384e-05, + "grad_norm": 2.712961196899414, + "learning_rate": 3.043662568885121e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8275337219238281, + "num_tokens": 27323388.0, + "step": 719 + }, + { + "epoch": 0.09159140058516728, + "ewc_loss": 0.007649412844330072, + "ewc_loss_diag": 5.602836608886719e-06, + "ewc_loss_parallel": 2.0341783965704963e-05, + "grad_norm": 2.822378396987915, + "learning_rate": 3.0479016532428993e-07, + "loss": 0.5806, + "mean_token_accuracy": 0.8152880072593689, + "num_tokens": 27357524.0, + "step": 720 + }, + { + "epoch": 0.0917186108637578, + "ewc_loss": 0.007645704783499241, + "ewc_loss_diag": 5.5730342864990234e-06, + "ewc_loss_parallel": 2.060988117591478e-05, + "grad_norm": 2.6477267742156982, + "learning_rate": 3.052140737600678e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8422410488128662, + "num_tokens": 27392037.0, + "step": 721 + }, + { + "epoch": 0.09184582114234831, + "ewc_loss": 0.007613432127982378, + "ewc_loss_diag": 5.5730342864990234e-06, + "ewc_loss_parallel": 2.028715425694827e-05, + "grad_norm": 2.5827465057373047, + "learning_rate": 3.056379821958457e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8403858542442322, + "num_tokens": 27429149.0, + "step": 722 + }, + { + "epoch": 0.09197303142093881, + "ewc_loss": 0.0076072197407484055, + "ewc_loss_diag": 5.5730342864990234e-06, + "ewc_loss_parallel": 2.0225030311848968e-05, + "grad_norm": 2.6934282779693604, + "learning_rate": 3.060618906316236e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.840749979019165, + "num_tokens": 27465023.0, + "step": 723 + }, + { + "epoch": 0.09210024169952932, + "ewc_loss": 0.007670237682759762, + "ewc_loss_diag": 5.602836608886719e-06, + "ewc_loss_parallel": 2.0550032786559314e-05, + "grad_norm": 2.594852924346924, + "learning_rate": 3.064857990674014e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8416339159011841, + "num_tokens": 27501268.0, + "step": 724 + }, + { + "epoch": 0.09222745197811984, + "ewc_loss": 0.007650325074791908, + "ewc_loss_diag": 5.602836608886719e-06, + "ewc_loss_parallel": 2.0350909835542552e-05, + "grad_norm": 2.6289288997650146, + "learning_rate": 3.0690970750317927e-07, + "loss": 0.5478, + "mean_token_accuracy": 0.8272777795791626, + "num_tokens": 27539937.0, + "step": 725 + }, + { + "epoch": 0.09235466225671034, + "ewc_loss": 0.00766304787248373, + "ewc_loss_diag": 5.602836608886719e-06, + "ewc_loss_parallel": 2.0478131773415953e-05, + "grad_norm": 2.6695709228515625, + "learning_rate": 3.0733361593895717e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.837217390537262, + "num_tokens": 27574576.0, + "step": 726 + }, + { + "epoch": 0.09248187253530085, + "ewc_loss": 0.007705959491431713, + "ewc_loss_diag": 5.632638931274414e-06, + "ewc_loss_parallel": 2.0602077711373568e-05, + "grad_norm": 2.5703063011169434, + "learning_rate": 3.0775752437473507e-07, + "loss": 0.519, + "mean_token_accuracy": 0.830467164516449, + "num_tokens": 27616036.0, + "step": 727 + }, + { + "epoch": 0.09260908281389137, + "ewc_loss": 0.007687930949032307, + "ewc_loss_diag": 5.632638931274414e-06, + "ewc_loss_parallel": 2.0421790395630524e-05, + "grad_norm": 2.542145013809204, + "learning_rate": 3.081814328105129e-07, + "loss": 0.5413, + "mean_token_accuracy": 0.8287261128425598, + "num_tokens": 27662040.0, + "step": 728 + }, + { + "epoch": 0.09273629309248187, + "ewc_loss": 0.00769943930208683, + "ewc_loss_diag": 5.632638931274414e-06, + "ewc_loss_parallel": 2.0536876036203466e-05, + "grad_norm": 2.649503707885742, + "learning_rate": 3.0860534124629076e-07, + "loss": 0.5707, + "mean_token_accuracy": 0.8191495537757874, + "num_tokens": 27705136.0, + "step": 729 + }, + { + "epoch": 0.09286350337107238, + "ewc_loss": 0.007725202478468418, + "ewc_loss_diag": 5.632638931274414e-06, + "ewc_loss_parallel": 2.0794504962395877e-05, + "grad_norm": 2.693436622619629, + "learning_rate": 3.0902924968206866e-07, + "loss": 0.5198, + "mean_token_accuracy": 0.8360300064086914, + "num_tokens": 27740564.0, + "step": 730 + }, + { + "epoch": 0.0929907136496629, + "ewc_loss": 0.007735081948339939, + "ewc_loss_diag": 5.632638931274414e-06, + "ewc_loss_parallel": 2.089329791488126e-05, + "grad_norm": 2.83612060546875, + "learning_rate": 3.0945315811784656e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8399829864501953, + "num_tokens": 27774788.0, + "step": 731 + }, + { + "epoch": 0.0931179239282534, + "ewc_loss": 0.007792503573000431, + "ewc_loss_diag": 5.662441253662109e-06, + "ewc_loss_parallel": 2.116234099958092e-05, + "grad_norm": 2.5937917232513428, + "learning_rate": 3.098770665536244e-07, + "loss": 0.6378, + "mean_token_accuracy": 0.7968612313270569, + "num_tokens": 27818300.0, + "step": 732 + }, + { + "epoch": 0.09324513420684391, + "ewc_loss": 0.007768399082124233, + "ewc_loss_diag": 5.692243576049805e-06, + "ewc_loss_parallel": 2.0616116671590135e-05, + "grad_norm": 2.7324297428131104, + "learning_rate": 3.1030097498940225e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8445966243743896, + "num_tokens": 27857739.0, + "step": 733 + }, + { + "epoch": 0.09337234448543442, + "ewc_loss": 0.007799104787409306, + "ewc_loss_diag": 5.692243576049805e-06, + "ewc_loss_parallel": 2.09231748158345e-05, + "grad_norm": 2.617690086364746, + "learning_rate": 3.1072488342518015e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8295875787734985, + "num_tokens": 27893897.0, + "step": 734 + }, + { + "epoch": 0.09349955476402494, + "ewc_loss": 0.007783136330544949, + "ewc_loss_diag": 5.692243576049805e-06, + "ewc_loss_parallel": 2.0763489374076016e-05, + "grad_norm": 2.6657707691192627, + "learning_rate": 3.11148791860958e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8364214897155762, + "num_tokens": 27930511.0, + "step": 735 + }, + { + "epoch": 0.09362676504261544, + "ewc_loss": 0.007800180930644274, + "ewc_loss_diag": 5.692243576049805e-06, + "ewc_loss_parallel": 2.093393777613528e-05, + "grad_norm": 2.8720974922180176, + "learning_rate": 3.115727002967359e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8337434530258179, + "num_tokens": 27961128.0, + "step": 736 + }, + { + "epoch": 0.09375397532120595, + "ewc_loss": 0.007841652259230614, + "ewc_loss_diag": 5.692243576049805e-06, + "ewc_loss_parallel": 2.1348650989239104e-05, + "grad_norm": 2.5711867809295654, + "learning_rate": 3.1199660873251374e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8341577649116516, + "num_tokens": 28000748.0, + "step": 737 + }, + { + "epoch": 0.09388118559979647, + "ewc_loss": 0.00780778331682086, + "ewc_loss_diag": 5.7220458984375e-06, + "ewc_loss_parallel": 2.070478512905538e-05, + "grad_norm": 2.498197078704834, + "learning_rate": 3.1242051716829164e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8215175867080688, + "num_tokens": 28046455.0, + "step": 738 + }, + { + "epoch": 0.09400839587838697, + "ewc_loss": 0.0078085726127028465, + "ewc_loss_diag": 5.7220458984375e-06, + "ewc_loss_parallel": 2.0712677724077366e-05, + "grad_norm": 2.6949985027313232, + "learning_rate": 3.128444256040695e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8313426971435547, + "num_tokens": 28087026.0, + "step": 739 + }, + { + "epoch": 0.09413560615697748, + "ewc_loss": 0.007865656167268753, + "ewc_loss_diag": 5.7220458984375e-06, + "ewc_loss_parallel": 2.128351479768753e-05, + "grad_norm": 2.59397029876709, + "learning_rate": 3.132683340398474e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8503503203392029, + "num_tokens": 28126382.0, + "step": 740 + }, + { + "epoch": 0.094262816435568, + "ewc_loss": 0.007834228686988354, + "ewc_loss_diag": 5.751848220825195e-06, + "ewc_loss_parallel": 2.09692407224793e-05, + "grad_norm": 2.587968587875366, + "learning_rate": 3.1369224247562523e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8460047245025635, + "num_tokens": 28161198.0, + "step": 741 + }, + { + "epoch": 0.0943900267141585, + "ewc_loss": 0.007872327230870724, + "ewc_loss_diag": 5.781650543212891e-06, + "ewc_loss_parallel": 2.1045047105872072e-05, + "grad_norm": 2.490278482437134, + "learning_rate": 3.1411615091140313e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8542046546936035, + "num_tokens": 28204612.0, + "step": 742 + }, + { + "epoch": 0.09451723699274901, + "ewc_loss": 0.007865477353334427, + "ewc_loss_diag": 5.781650543212891e-06, + "ewc_loss_parallel": 2.0976554878870957e-05, + "grad_norm": 2.574031114578247, + "learning_rate": 3.14540059347181e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8380498886108398, + "num_tokens": 28241050.0, + "step": 743 + }, + { + "epoch": 0.09464444727133953, + "ewc_loss": 0.007894812151789665, + "ewc_loss_diag": 5.781650543212891e-06, + "ewc_loss_parallel": 2.1269896024023183e-05, + "grad_norm": 2.6362264156341553, + "learning_rate": 3.149639677829589e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8489091396331787, + "num_tokens": 28278032.0, + "step": 744 + }, + { + "epoch": 0.09477165754993004, + "ewc_loss": 0.007908333092927933, + "ewc_loss_diag": 5.781650543212891e-06, + "ewc_loss_parallel": 2.1405110601335764e-05, + "grad_norm": 2.661731481552124, + "learning_rate": 3.153878762187368e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8280216455459595, + "num_tokens": 28315599.0, + "step": 745 + }, + { + "epoch": 0.09489886782852054, + "ewc_loss": 0.007911917753517628, + "ewc_loss_diag": 5.781650543212891e-06, + "ewc_loss_parallel": 2.1440957425511442e-05, + "grad_norm": 2.6122968196868896, + "learning_rate": 3.158117846545146e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8183304071426392, + "num_tokens": 28358504.0, + "step": 746 + }, + { + "epoch": 0.09502607810711106, + "ewc_loss": 0.007930075749754906, + "ewc_loss_diag": 5.811452865600586e-06, + "ewc_loss_parallel": 2.1317353457561694e-05, + "grad_norm": 2.8343424797058105, + "learning_rate": 3.1623569309029247e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8413820266723633, + "num_tokens": 28389626.0, + "step": 747 + }, + { + "epoch": 0.09515328838570157, + "ewc_loss": 0.008012941107153893, + "ewc_loss_diag": 5.841255187988281e-06, + "ewc_loss_parallel": 2.1840834961039945e-05, + "grad_norm": 3.0117948055267334, + "learning_rate": 3.1665960152607037e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8452808856964111, + "num_tokens": 28430361.0, + "step": 748 + }, + { + "epoch": 0.09528049866429207, + "ewc_loss": 0.008060756139457226, + "ewc_loss_diag": 5.8710575103759766e-06, + "ewc_loss_parallel": 2.2013809939380735e-05, + "grad_norm": 2.549412965774536, + "learning_rate": 3.1708350996184826e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.8342442512512207, + "num_tokens": 28469792.0, + "step": 749 + }, + { + "epoch": 0.09540770894288259, + "ewc_loss": 0.007958327420055866, + "ewc_loss_diag": 5.8710575103759766e-06, + "ewc_loss_parallel": 2.098952427331824e-05, + "grad_norm": 2.6110048294067383, + "learning_rate": 3.175074183976261e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8445879817008972, + "num_tokens": 28507569.0, + "step": 750 + }, + { + "epoch": 0.0955349192214731, + "ewc_loss": 0.007986525073647499, + "ewc_loss_diag": 5.8710575103759766e-06, + "ewc_loss_parallel": 2.127150582964532e-05, + "grad_norm": 2.669461488723755, + "learning_rate": 3.1793132683340396e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8350608348846436, + "num_tokens": 28542336.0, + "step": 751 + }, + { + "epoch": 0.0956621295000636, + "ewc_loss": 0.008047716692090034, + "ewc_loss_diag": 5.900859832763672e-06, + "ewc_loss_parallel": 2.157824201276526e-05, + "grad_norm": 2.6502039432525635, + "learning_rate": 3.1835523526918186e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.848234236240387, + "num_tokens": 28578587.0, + "step": 752 + }, + { + "epoch": 0.09578933977865411, + "ewc_loss": 0.008030892349779606, + "ewc_loss_diag": 5.900859832763672e-06, + "ewc_loss_parallel": 2.1410000044852495e-05, + "grad_norm": 2.6489431858062744, + "learning_rate": 3.1877914370495975e-07, + "loss": 0.5502, + "mean_token_accuracy": 0.8258622884750366, + "num_tokens": 28617505.0, + "step": 753 + }, + { + "epoch": 0.09591655005724463, + "ewc_loss": 0.008034388534724712, + "ewc_loss_diag": 5.900859832763672e-06, + "ewc_loss_parallel": 2.144495738320984e-05, + "grad_norm": 2.593441963195801, + "learning_rate": 3.1920305214073755e-07, + "loss": 0.5543, + "mean_token_accuracy": 0.825078010559082, + "num_tokens": 28658526.0, + "step": 754 + }, + { + "epoch": 0.09604376033583513, + "ewc_loss": 0.008065013214945793, + "ewc_loss_diag": 5.930662155151367e-06, + "ewc_loss_parallel": 2.1446028767968528e-05, + "grad_norm": 2.608336925506592, + "learning_rate": 3.1962696057651545e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8428063988685608, + "num_tokens": 28698825.0, + "step": 755 + }, + { + "epoch": 0.09617097061442564, + "ewc_loss": 0.008072298020124435, + "ewc_loss_diag": 5.930662155151367e-06, + "ewc_loss_parallel": 2.1518882931559347e-05, + "grad_norm": 2.6826956272125244, + "learning_rate": 3.2005086901229335e-07, + "loss": 0.5341, + "mean_token_accuracy": 0.8273690938949585, + "num_tokens": 28733238.0, + "step": 756 + }, + { + "epoch": 0.09629818089301616, + "ewc_loss": 0.008121136575937271, + "ewc_loss_diag": 5.9604644775390625e-06, + "ewc_loss_parallel": 2.1702091544284485e-05, + "grad_norm": 2.54278826713562, + "learning_rate": 3.2047477744807125e-07, + "loss": 0.534, + "mean_token_accuracy": 0.8321740031242371, + "num_tokens": 28771552.0, + "step": 757 + }, + { + "epoch": 0.09642539117160667, + "ewc_loss": 0.008100520819425583, + "ewc_loss_diag": 5.9604644775390625e-06, + "ewc_loss_parallel": 2.1495936380233616e-05, + "grad_norm": 2.592994213104248, + "learning_rate": 3.2089868588384904e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8187298774719238, + "num_tokens": 28810827.0, + "step": 758 + }, + { + "epoch": 0.09655260145019717, + "ewc_loss": 0.008090173825621605, + "ewc_loss_diag": 5.930662155151367e-06, + "ewc_loss_parallel": 2.1697635020245798e-05, + "grad_norm": 2.6197381019592285, + "learning_rate": 3.2132259431962694e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8464555144309998, + "num_tokens": 28849017.0, + "step": 759 + }, + { + "epoch": 0.09667981172878769, + "ewc_loss": 0.00816483236849308, + "ewc_loss_diag": 5.990266799926758e-06, + "ewc_loss_parallel": 2.1833868231624365e-05, + "grad_norm": 2.6388742923736572, + "learning_rate": 3.2174650275540484e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.8372611999511719, + "num_tokens": 28887659.0, + "step": 760 + }, + { + "epoch": 0.0968070220073782, + "ewc_loss": 0.008171076886355877, + "ewc_loss_diag": 5.990266799926758e-06, + "ewc_loss_parallel": 2.1896314137848094e-05, + "grad_norm": 2.5140461921691895, + "learning_rate": 3.2217041119118274e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8415478467941284, + "num_tokens": 28928983.0, + "step": 761 + }, + { + "epoch": 0.0969342322859687, + "ewc_loss": 0.008146964013576508, + "ewc_loss_diag": 5.990266799926758e-06, + "ewc_loss_parallel": 2.1655187083524652e-05, + "grad_norm": 2.671562910079956, + "learning_rate": 3.2259431962696053e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8337080478668213, + "num_tokens": 28962195.0, + "step": 762 + }, + { + "epoch": 0.09706144256455922, + "ewc_loss": 0.008196627721190453, + "ewc_loss_diag": 5.990266799926758e-06, + "ewc_loss_parallel": 2.215182212239597e-05, + "grad_norm": 2.6911404132843018, + "learning_rate": 3.2301822806273843e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8359889984130859, + "num_tokens": 28997351.0, + "step": 763 + }, + { + "epoch": 0.09718865284314973, + "ewc_loss": 0.00822925753891468, + "ewc_loss_diag": 6.020069122314453e-06, + "ewc_loss_parallel": 2.2172944227349944e-05, + "grad_norm": 2.646703004837036, + "learning_rate": 3.2344213649851633e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8294880390167236, + "num_tokens": 29040988.0, + "step": 764 + }, + { + "epoch": 0.09731586312174023, + "ewc_loss": 0.008189631626009941, + "ewc_loss_diag": 5.990266799926758e-06, + "ewc_loss_parallel": 2.2081858332967386e-05, + "grad_norm": 2.55439829826355, + "learning_rate": 3.238660449342942e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.8388888835906982, + "num_tokens": 29081873.0, + "step": 765 + }, + { + "epoch": 0.09744307340033075, + "ewc_loss": 0.008204245008528233, + "ewc_loss_diag": 6.020069122314453e-06, + "ewc_loss_parallel": 2.192281863244716e-05, + "grad_norm": 2.7052574157714844, + "learning_rate": 3.24289953370072e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8444457054138184, + "num_tokens": 29115360.0, + "step": 766 + }, + { + "epoch": 0.09757028367892126, + "ewc_loss": 0.008254002779722214, + "ewc_loss_diag": 6.020069122314453e-06, + "ewc_loss_parallel": 2.2420395907829516e-05, + "grad_norm": 2.5269739627838135, + "learning_rate": 3.247138618058499e-07, + "loss": 0.5168, + "mean_token_accuracy": 0.8312041759490967, + "num_tokens": 29160139.0, + "step": 767 + }, + { + "epoch": 0.09769749395751176, + "ewc_loss": 0.008213400840759277, + "ewc_loss_diag": 6.020069122314453e-06, + "ewc_loss_parallel": 2.2014379283064045e-05, + "grad_norm": 2.557737112045288, + "learning_rate": 3.251377702416278e-07, + "loss": 0.5456, + "mean_token_accuracy": 0.8283448815345764, + "num_tokens": 29202872.0, + "step": 768 + }, + { + "epoch": 0.09782470423610228, + "ewc_loss": 0.008259080350399017, + "ewc_loss_diag": 6.0498714447021484e-06, + "ewc_loss_parallel": 2.21659956878284e-05, + "grad_norm": 2.5581793785095215, + "learning_rate": 3.255616786774057e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8413722515106201, + "num_tokens": 29246965.0, + "step": 769 + }, + { + "epoch": 0.09795191451469279, + "ewc_loss": 0.008270762860774994, + "ewc_loss_diag": 6.0498714447021484e-06, + "ewc_loss_parallel": 2.228282937721815e-05, + "grad_norm": 2.537001609802246, + "learning_rate": 3.259855871131835e-07, + "loss": 0.4331, + "mean_token_accuracy": 0.8601912260055542, + "num_tokens": 29289531.0, + "step": 770 + }, + { + "epoch": 0.0980791247932833, + "ewc_loss": 0.008260088972747326, + "ewc_loss_diag": 6.0498714447021484e-06, + "ewc_loss_parallel": 2.2176085622049868e-05, + "grad_norm": 2.547708749771118, + "learning_rate": 3.264094955489614e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8431332111358643, + "num_tokens": 29330484.0, + "step": 771 + }, + { + "epoch": 0.0982063350718738, + "ewc_loss": 0.00827416218817234, + "ewc_loss_diag": 6.0498714447021484e-06, + "ewc_loss_parallel": 2.2316813556244597e-05, + "grad_norm": 2.550172805786133, + "learning_rate": 3.268334039847393e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8345460295677185, + "num_tokens": 29369595.0, + "step": 772 + }, + { + "epoch": 0.09833354535046432, + "ewc_loss": 0.008304986171424389, + "ewc_loss_diag": 6.079673767089844e-06, + "ewc_loss_parallel": 2.231988219136838e-05, + "grad_norm": 2.5554730892181396, + "learning_rate": 3.2725731242051715e-07, + "loss": 0.5039, + "mean_token_accuracy": 0.8421350717544556, + "num_tokens": 29411856.0, + "step": 773 + }, + { + "epoch": 0.09846075562905483, + "ewc_loss": 0.008311985991895199, + "ewc_loss_diag": 6.079673767089844e-06, + "ewc_loss_parallel": 2.2389882360585034e-05, + "grad_norm": 2.5833535194396973, + "learning_rate": 3.27681220856295e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.8450192213058472, + "num_tokens": 29454754.0, + "step": 774 + }, + { + "epoch": 0.09858796590764533, + "ewc_loss": 0.008316539227962494, + "ewc_loss_diag": 6.079673767089844e-06, + "ewc_loss_parallel": 2.2435415303334594e-05, + "grad_norm": 2.680482864379883, + "learning_rate": 3.281051292920729e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8429188132286072, + "num_tokens": 29489131.0, + "step": 775 + }, + { + "epoch": 0.09871517618623585, + "ewc_loss": 0.008344831876456738, + "ewc_loss_diag": 6.079673767089844e-06, + "ewc_loss_parallel": 2.2718339096172713e-05, + "grad_norm": 2.5722525119781494, + "learning_rate": 3.285290377278508e-07, + "loss": 0.5369, + "mean_token_accuracy": 0.8279837965965271, + "num_tokens": 29528292.0, + "step": 776 + }, + { + "epoch": 0.09884238646482636, + "ewc_loss": 0.008316470310091972, + "ewc_loss_diag": 6.079673767089844e-06, + "ewc_loss_parallel": 2.243472590635065e-05, + "grad_norm": 2.642929792404175, + "learning_rate": 3.2895294616362864e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.850973904132843, + "num_tokens": 29564878.0, + "step": 777 + }, + { + "epoch": 0.09896959674341686, + "ewc_loss": 0.008373241871595383, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.2697260646964423e-05, + "grad_norm": 2.642634153366089, + "learning_rate": 3.293768545994065e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8406140804290771, + "num_tokens": 29602826.0, + "step": 778 + }, + { + "epoch": 0.09909680702200738, + "ewc_loss": 0.008371788077056408, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.268272146466188e-05, + "grad_norm": 2.658413887023926, + "learning_rate": 3.298007630351844e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.824781596660614, + "num_tokens": 29640323.0, + "step": 779 + }, + { + "epoch": 0.09922401730059789, + "ewc_loss": 0.008378841914236546, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.2753260054741986e-05, + "grad_norm": 2.5274617671966553, + "learning_rate": 3.302246714709623e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8423029184341431, + "num_tokens": 29683302.0, + "step": 780 + }, + { + "epoch": 0.09935122757918839, + "ewc_loss": 0.008345983922481537, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.2424685084843077e-05, + "grad_norm": 2.612771987915039, + "learning_rate": 3.3064857990674013e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.839094877243042, + "num_tokens": 29721085.0, + "step": 781 + }, + { + "epoch": 0.0994784378577789, + "ewc_loss": 0.008382929489016533, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.2794143660576083e-05, + "grad_norm": 2.733567237854004, + "learning_rate": 3.31072488342518e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8479887247085571, + "num_tokens": 29753810.0, + "step": 782 + }, + { + "epoch": 0.09960564813636942, + "ewc_loss": 0.008405003696680069, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.3014879843685776e-05, + "grad_norm": 2.559495687484741, + "learning_rate": 3.314963967782959e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8438812494277954, + "num_tokens": 29796488.0, + "step": 783 + }, + { + "epoch": 0.09973285841495994, + "ewc_loss": 0.008357291109859943, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.2537757104146294e-05, + "grad_norm": 2.5537753105163574, + "learning_rate": 3.319203052140738e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8386191129684448, + "num_tokens": 29838181.0, + "step": 784 + }, + { + "epoch": 0.09986006869355044, + "ewc_loss": 0.008375771343708038, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.272255915158894e-05, + "grad_norm": 2.8133773803710938, + "learning_rate": 3.323442136498516e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.845060408115387, + "num_tokens": 29868893.0, + "step": 785 + }, + { + "epoch": 0.09998727897214095, + "ewc_loss": 0.008436977863311768, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.3334621801041067e-05, + "grad_norm": 2.7472991943359375, + "learning_rate": 3.3276812208562947e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.837737500667572, + "num_tokens": 29903571.0, + "step": 786 + }, + { + "epoch": 0.10011448925073146, + "ewc_loss": 0.008406942710280418, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.303427208971698e-05, + "grad_norm": 2.669059991836548, + "learning_rate": 3.3319203052140737e-07, + "loss": 0.5576, + "mean_token_accuracy": 0.8296943306922913, + "num_tokens": 29940438.0, + "step": 787 + }, + { + "epoch": 0.10024169952932196, + "ewc_loss": 0.008388943038880825, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.2854272174299695e-05, + "grad_norm": 2.6463205814361572, + "learning_rate": 3.336159389571852e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.85047447681427, + "num_tokens": 29976538.0, + "step": 788 + }, + { + "epoch": 0.10036890980791248, + "ewc_loss": 0.00839464645832777, + "ewc_loss_diag": 6.109476089477539e-06, + "ewc_loss_parallel": 2.2911310225026682e-05, + "grad_norm": 2.5518910884857178, + "learning_rate": 3.340398473929631e-07, + "loss": 0.5501, + "mean_token_accuracy": 0.8258598446846008, + "num_tokens": 30019814.0, + "step": 789 + }, + { + "epoch": 0.100496120086503, + "ewc_loss": 0.008411810733377934, + "ewc_loss_diag": 6.139278411865234e-06, + "ewc_loss_parallel": 2.277777821291238e-05, + "grad_norm": 3.0865917205810547, + "learning_rate": 3.3446375582874096e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8411203026771545, + "num_tokens": 30057578.0, + "step": 790 + }, + { + "epoch": 0.1006233303650935, + "ewc_loss": 0.008541588671505451, + "ewc_loss_diag": 6.139278411865234e-06, + "ewc_loss_parallel": 2.407555621175561e-05, + "grad_norm": 2.7050046920776367, + "learning_rate": 3.3488766426451886e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8264821171760559, + "num_tokens": 30094869.0, + "step": 791 + }, + { + "epoch": 0.10075054064368401, + "ewc_loss": 0.0084222462028265, + "ewc_loss_diag": 6.139278411865234e-06, + "ewc_loss_parallel": 2.288213545398321e-05, + "grad_norm": 2.5421297550201416, + "learning_rate": 3.353115727002967e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8306554555892944, + "num_tokens": 30136446.0, + "step": 792 + }, + { + "epoch": 0.10087775092227452, + "ewc_loss": 0.008406627923250198, + "ewc_loss_diag": 6.139278411865234e-06, + "ewc_loss_parallel": 2.272594974783715e-05, + "grad_norm": 2.577070713043213, + "learning_rate": 3.357354811360746e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8282399773597717, + "num_tokens": 30179226.0, + "step": 793 + }, + { + "epoch": 0.10100496120086502, + "ewc_loss": 0.008441293612122536, + "ewc_loss_diag": 6.139278411865234e-06, + "ewc_loss_parallel": 2.3072607291396707e-05, + "grad_norm": 2.6594736576080322, + "learning_rate": 3.3615938957185245e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8332881927490234, + "num_tokens": 30213890.0, + "step": 794 + }, + { + "epoch": 0.10113217147945554, + "ewc_loss": 0.008461760357022285, + "ewc_loss_diag": 6.139278411865234e-06, + "ewc_loss_parallel": 2.3277269065147266e-05, + "grad_norm": 2.824608087539673, + "learning_rate": 3.3658329800763035e-07, + "loss": 0.5677, + "mean_token_accuracy": 0.81656813621521, + "num_tokens": 30245691.0, + "step": 795 + }, + { + "epoch": 0.10125938175804605, + "ewc_loss": 0.008531627245247364, + "ewc_loss_diag": 6.16908073425293e-06, + "ewc_loss_parallel": 2.3670767404837534e-05, + "grad_norm": 2.652714252471924, + "learning_rate": 3.370072064434082e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.826127290725708, + "num_tokens": 30281738.0, + "step": 796 + }, + { + "epoch": 0.10138659203663657, + "ewc_loss": 0.008514507673680782, + "ewc_loss_diag": 6.198883056640625e-06, + "ewc_loss_parallel": 2.3194390450953506e-05, + "grad_norm": 2.549816608428955, + "learning_rate": 3.374311148791861e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.8414066433906555, + "num_tokens": 30320707.0, + "step": 797 + }, + { + "epoch": 0.10151380231522707, + "ewc_loss": 0.008505206555128098, + "ewc_loss_diag": 6.198883056640625e-06, + "ewc_loss_parallel": 2.3101380065781996e-05, + "grad_norm": 2.841911554336548, + "learning_rate": 3.3785502331496394e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8390330076217651, + "num_tokens": 30351379.0, + "step": 798 + }, + { + "epoch": 0.10164101259381758, + "ewc_loss": 0.008563975803554058, + "ewc_loss_diag": 6.16908073425293e-06, + "ewc_loss_parallel": 2.399424738541711e-05, + "grad_norm": 2.8422863483428955, + "learning_rate": 3.3827893175074184e-07, + "loss": 0.5198, + "mean_token_accuracy": 0.8324953317642212, + "num_tokens": 30382395.0, + "step": 799 + }, + { + "epoch": 0.1017682228724081, + "ewc_loss": 0.00857490859925747, + "ewc_loss_diag": 6.198883056640625e-06, + "ewc_loss_parallel": 2.3798400434316136e-05, + "grad_norm": 2.651181221008301, + "learning_rate": 3.387028401865197e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.8363356590270996, + "num_tokens": 30419758.0, + "step": 800 + }, + { + "epoch": 0.1018954331509986, + "ewc_loss": 0.008563963696360588, + "ewc_loss_diag": 6.22868537902832e-06, + "ewc_loss_parallel": 2.3383779989671893e-05, + "grad_norm": 2.6411187648773193, + "learning_rate": 3.391267486222976e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8340634107589722, + "num_tokens": 30458115.0, + "step": 801 + }, + { + "epoch": 0.10202264342958911, + "ewc_loss": 0.008583340793848038, + "ewc_loss_diag": 6.22868537902832e-06, + "ewc_loss_parallel": 2.357754419790581e-05, + "grad_norm": 2.842219114303589, + "learning_rate": 3.3955065705807543e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.824825644493103, + "num_tokens": 30489679.0, + "step": 802 + }, + { + "epoch": 0.10214985370817962, + "ewc_loss": 0.008663492277264595, + "ewc_loss_diag": 6.258487701416016e-06, + "ewc_loss_parallel": 2.4073891836451367e-05, + "grad_norm": 2.701596260070801, + "learning_rate": 3.3997456549385333e-07, + "loss": 0.4825, + "mean_token_accuracy": 0.8466613292694092, + "num_tokens": 30524874.0, + "step": 803 + }, + { + "epoch": 0.10227706398677013, + "ewc_loss": 0.008625578135251999, + "ewc_loss_diag": 6.258487701416016e-06, + "ewc_loss_parallel": 2.3694745323155075e-05, + "grad_norm": 2.7905707359313965, + "learning_rate": 3.403984739296312e-07, + "loss": 0.5538, + "mean_token_accuracy": 0.8241166472434998, + "num_tokens": 30558354.0, + "step": 804 + }, + { + "epoch": 0.10240427426536064, + "ewc_loss": 0.008648738265037537, + "ewc_loss_diag": 6.258487701416016e-06, + "ewc_loss_parallel": 2.392634814896155e-05, + "grad_norm": 2.638239622116089, + "learning_rate": 3.408223823654091e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8437426686286926, + "num_tokens": 30594827.0, + "step": 805 + }, + { + "epoch": 0.10253148454395115, + "ewc_loss": 0.008617019280791283, + "ewc_loss_diag": 6.258487701416016e-06, + "ewc_loss_parallel": 2.3609161871718243e-05, + "grad_norm": 2.7029268741607666, + "learning_rate": 3.412462908011869e-07, + "loss": 0.528, + "mean_token_accuracy": 0.8376647233963013, + "num_tokens": 30635259.0, + "step": 806 + }, + { + "epoch": 0.10265869482254165, + "ewc_loss": 0.008674872107803822, + "ewc_loss_diag": 6.288290023803711e-06, + "ewc_loss_parallel": 2.3882510504336096e-05, + "grad_norm": 2.590625762939453, + "learning_rate": 3.4167019923696477e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8397361040115356, + "num_tokens": 30681850.0, + "step": 807 + }, + { + "epoch": 0.10278590510113217, + "ewc_loss": 0.008653547614812851, + "ewc_loss_diag": 6.288290023803711e-06, + "ewc_loss_parallel": 2.3669259462621994e-05, + "grad_norm": 2.6170880794525146, + "learning_rate": 3.4209410767274267e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8473678231239319, + "num_tokens": 30721302.0, + "step": 808 + }, + { + "epoch": 0.10291311537972268, + "ewc_loss": 0.008667385205626488, + "ewc_loss_diag": 6.288290023803711e-06, + "ewc_loss_parallel": 2.3807635443517938e-05, + "grad_norm": 2.7080464363098145, + "learning_rate": 3.4251801610852057e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.8190284967422485, + "num_tokens": 30761975.0, + "step": 809 + }, + { + "epoch": 0.1030403256583132, + "ewc_loss": 0.008723730221390724, + "ewc_loss_diag": 6.318092346191406e-06, + "ewc_loss_parallel": 2.4065910110948607e-05, + "grad_norm": 2.7545065879821777, + "learning_rate": 3.429419245442984e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8347157835960388, + "num_tokens": 30797754.0, + "step": 810 + }, + { + "epoch": 0.1031675359369037, + "ewc_loss": 0.008733228780329227, + "ewc_loss_diag": 6.318092346191406e-06, + "ewc_loss_parallel": 2.4160903194569983e-05, + "grad_norm": 2.6289517879486084, + "learning_rate": 3.4336583298007626e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.845268726348877, + "num_tokens": 30834183.0, + "step": 811 + }, + { + "epoch": 0.10329474621549421, + "ewc_loss": 0.008705250918865204, + "ewc_loss_diag": 6.318092346191406e-06, + "ewc_loss_parallel": 2.388112261542119e-05, + "grad_norm": 2.5461254119873047, + "learning_rate": 3.4378974141585416e-07, + "loss": 0.5156, + "mean_token_accuracy": 0.8357340097427368, + "num_tokens": 30874160.0, + "step": 812 + }, + { + "epoch": 0.10342195649408473, + "ewc_loss": 0.008696042001247406, + "ewc_loss_diag": 6.318092346191406e-06, + "ewc_loss_parallel": 2.3789036276866682e-05, + "grad_norm": 2.533229351043701, + "learning_rate": 3.4421364985163206e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8528903722763062, + "num_tokens": 30916827.0, + "step": 813 + }, + { + "epoch": 0.10354916677267523, + "ewc_loss": 0.008744563907384872, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.3969081667019054e-05, + "grad_norm": 2.5733518600463867, + "learning_rate": 3.446375582874099e-07, + "loss": 0.5242, + "mean_token_accuracy": 0.8349275588989258, + "num_tokens": 30960063.0, + "step": 814 + }, + { + "epoch": 0.10367637705126574, + "ewc_loss": 0.008765341714024544, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.417685027467087e-05, + "grad_norm": 2.7255215644836426, + "learning_rate": 3.4506146672318775e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8459571003913879, + "num_tokens": 30995172.0, + "step": 815 + }, + { + "epoch": 0.10380358732985626, + "ewc_loss": 0.008802102878689766, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4544460757169873e-05, + "grad_norm": 2.689845561981201, + "learning_rate": 3.4548537515896565e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.8313716650009155, + "num_tokens": 31035089.0, + "step": 816 + }, + { + "epoch": 0.10393079760844676, + "ewc_loss": 0.00877574272453785, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4280863726744428e-05, + "grad_norm": 2.5794506072998047, + "learning_rate": 3.4590928359474355e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.8347490429878235, + "num_tokens": 31074836.0, + "step": 817 + }, + { + "epoch": 0.10405800788703727, + "ewc_loss": 0.008764401078224182, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.416744791844394e-05, + "grad_norm": 2.6262335777282715, + "learning_rate": 3.463331920305214e-07, + "loss": 0.533, + "mean_token_accuracy": 0.8307114243507385, + "num_tokens": 31114700.0, + "step": 818 + }, + { + "epoch": 0.10418521816562779, + "ewc_loss": 0.008750551380217075, + "ewc_loss_diag": 6.318092346191406e-06, + "ewc_loss_parallel": 2.433412373648025e-05, + "grad_norm": 2.794914484024048, + "learning_rate": 3.4675710046629924e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.845443844795227, + "num_tokens": 31148258.0, + "step": 819 + }, + { + "epoch": 0.1043124284442183, + "ewc_loss": 0.008827460929751396, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4798044250928797e-05, + "grad_norm": 2.6926910877227783, + "learning_rate": 3.4718100890207714e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8358312845230103, + "num_tokens": 31185720.0, + "step": 820 + }, + { + "epoch": 0.1044396387228088, + "ewc_loss": 0.008787340484559536, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.439684067212511e-05, + "grad_norm": 2.7467219829559326, + "learning_rate": 3.4760491733785504e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.8281036019325256, + "num_tokens": 31220353.0, + "step": 821 + }, + { + "epoch": 0.10456684900139931, + "ewc_loss": 0.00880733598023653, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4596798539278097e-05, + "grad_norm": 2.6711513996124268, + "learning_rate": 3.480288257736329e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8346061110496521, + "num_tokens": 31256477.0, + "step": 822 + }, + { + "epoch": 0.10469405927998983, + "ewc_loss": 0.008825540542602539, + "ewc_loss_diag": 6.377696990966797e-06, + "ewc_loss_parallel": 2.4473672965541482e-05, + "grad_norm": 2.699352264404297, + "learning_rate": 3.4845273420941073e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.839282751083374, + "num_tokens": 31294365.0, + "step": 823 + }, + { + "epoch": 0.10482126955858033, + "ewc_loss": 0.008808162994682789, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4605067665106617e-05, + "grad_norm": 2.804182529449463, + "learning_rate": 3.4887664264518863e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8360263705253601, + "num_tokens": 31329357.0, + "step": 824 + }, + { + "epoch": 0.10494847983717084, + "ewc_loss": 0.008830400183796883, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4827439119690098e-05, + "grad_norm": 2.5638089179992676, + "learning_rate": 3.4930055108096653e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.8379216194152832, + "num_tokens": 31372180.0, + "step": 825 + }, + { + "epoch": 0.10507569011576136, + "ewc_loss": 0.008776688016951084, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4290317014674656e-05, + "grad_norm": 2.6159329414367676, + "learning_rate": 3.497244595167443e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8413066864013672, + "num_tokens": 31414866.0, + "step": 826 + }, + { + "epoch": 0.10520290039435186, + "ewc_loss": 0.008804390206933022, + "ewc_loss_diag": 6.3478946685791016e-06, + "ewc_loss_parallel": 2.4567336367908865e-05, + "grad_norm": 2.790079116821289, + "learning_rate": 3.501483679525222e-07, + "loss": 0.5582, + "mean_token_accuracy": 0.8218954205513, + "num_tokens": 31450583.0, + "step": 827 + }, + { + "epoch": 0.10533011067294237, + "ewc_loss": 0.008884532377123833, + "ewc_loss_diag": 6.377696990966797e-06, + "ewc_loss_parallel": 2.5063587600016035e-05, + "grad_norm": 2.6522881984710693, + "learning_rate": 3.505722763883001e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8411634564399719, + "num_tokens": 31490371.0, + "step": 828 + }, + { + "epoch": 0.10545732095153289, + "ewc_loss": 0.00883690919727087, + "ewc_loss_diag": 6.377696990966797e-06, + "ewc_loss_parallel": 2.458735616528429e-05, + "grad_norm": 2.6945009231567383, + "learning_rate": 3.50996184824078e-07, + "loss": 0.5319, + "mean_token_accuracy": 0.828818142414093, + "num_tokens": 31525951.0, + "step": 829 + }, + { + "epoch": 0.10558453123012339, + "ewc_loss": 0.008856810629367828, + "ewc_loss_diag": 6.377696990966797e-06, + "ewc_loss_parallel": 2.4786368157947436e-05, + "grad_norm": 2.607915163040161, + "learning_rate": 3.514200932598558e-07, + "loss": 0.5716, + "mean_token_accuracy": 0.8194581866264343, + "num_tokens": 31570109.0, + "step": 830 + }, + { + "epoch": 0.1057117415087139, + "ewc_loss": 0.008875974453985691, + "ewc_loss_diag": 6.407499313354492e-06, + "ewc_loss_parallel": 2.467283047735691e-05, + "grad_norm": 2.6607232093811035, + "learning_rate": 3.518440016956337e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8355769515037537, + "num_tokens": 31610569.0, + "step": 831 + }, + { + "epoch": 0.10583895178730442, + "ewc_loss": 0.00890000257641077, + "ewc_loss_diag": 6.407499313354492e-06, + "ewc_loss_parallel": 2.4913113520597108e-05, + "grad_norm": 2.6491012573242188, + "learning_rate": 3.522679101314116e-07, + "loss": 0.5269, + "mean_token_accuracy": 0.8351877927780151, + "num_tokens": 31653585.0, + "step": 832 + }, + { + "epoch": 0.10596616206589493, + "ewc_loss": 0.008937424048781395, + "ewc_loss_diag": 6.4373016357421875e-06, + "ewc_loss_parallel": 2.498214780644048e-05, + "grad_norm": 2.7021403312683105, + "learning_rate": 3.526918185671895e-07, + "loss": 0.5364, + "mean_token_accuracy": 0.8294344544410706, + "num_tokens": 31696353.0, + "step": 833 + }, + { + "epoch": 0.10609337234448543, + "ewc_loss": 0.008949991315603256, + "ewc_loss_diag": 6.4373016357421875e-06, + "ewc_loss_parallel": 2.510781996534206e-05, + "grad_norm": 2.728440999984741, + "learning_rate": 3.531157270029673e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8502870798110962, + "num_tokens": 31730620.0, + "step": 834 + }, + { + "epoch": 0.10622058262307595, + "ewc_loss": 0.008955038152635098, + "ewc_loss_diag": 6.4373016357421875e-06, + "ewc_loss_parallel": 2.5158289645332843e-05, + "grad_norm": 2.6650354862213135, + "learning_rate": 3.535396354387452e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8439505696296692, + "num_tokens": 31770538.0, + "step": 835 + }, + { + "epoch": 0.10634779290166646, + "ewc_loss": 0.008925799280405045, + "ewc_loss_diag": 6.4373016357421875e-06, + "ewc_loss_parallel": 2.4865907107596286e-05, + "grad_norm": 2.67965030670166, + "learning_rate": 3.539635438745231e-07, + "loss": 0.5255, + "mean_token_accuracy": 0.8315849900245667, + "num_tokens": 31808687.0, + "step": 836 + }, + { + "epoch": 0.10647500318025696, + "ewc_loss": 0.008951004594564438, + "ewc_loss_diag": 6.4373016357421875e-06, + "ewc_loss_parallel": 2.5117959012277424e-05, + "grad_norm": 2.5683979988098145, + "learning_rate": 3.54387452310301e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8386123180389404, + "num_tokens": 31852310.0, + "step": 837 + }, + { + "epoch": 0.10660221345884748, + "ewc_loss": 0.008922233246266842, + "ewc_loss_diag": 6.4373016357421875e-06, + "ewc_loss_parallel": 2.483024036337156e-05, + "grad_norm": 2.6199638843536377, + "learning_rate": 3.548113607460788e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8504118919372559, + "num_tokens": 31887897.0, + "step": 838 + }, + { + "epoch": 0.10672942373743799, + "ewc_loss": 0.00900947954505682, + "ewc_loss_diag": 6.496906280517578e-06, + "ewc_loss_parallel": 2.5092354917433113e-05, + "grad_norm": 2.71390438079834, + "learning_rate": 3.552352691818567e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.8422518372535706, + "num_tokens": 31926159.0, + "step": 839 + }, + { + "epoch": 0.10685663401602849, + "ewc_loss": 0.009041013196110725, + "ewc_loss_diag": 6.496906280517578e-06, + "ewc_loss_parallel": 2.5407691282453015e-05, + "grad_norm": 2.603778600692749, + "learning_rate": 3.556591776176346e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8483316898345947, + "num_tokens": 31968432.0, + "step": 840 + }, + { + "epoch": 0.106983844294619, + "ewc_loss": 0.009033041074872017, + "ewc_loss_diag": 6.5267086029052734e-06, + "ewc_loss_parallel": 2.502279130567331e-05, + "grad_norm": 2.868135452270508, + "learning_rate": 3.560830860534125e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8413782119750977, + "num_tokens": 32000949.0, + "step": 841 + }, + { + "epoch": 0.10711105457320952, + "ewc_loss": 0.009108472615480423, + "ewc_loss_diag": 6.5267086029052734e-06, + "ewc_loss_parallel": 2.5777109840419143e-05, + "grad_norm": 2.822340488433838, + "learning_rate": 3.565069944891903e-07, + "loss": 0.5477, + "mean_token_accuracy": 0.825205385684967, + "num_tokens": 32033862.0, + "step": 842 + }, + { + "epoch": 0.10723826485180002, + "ewc_loss": 0.009118902496993542, + "ewc_loss_diag": 6.556510925292969e-06, + "ewc_loss_parallel": 2.5576229745638557e-05, + "grad_norm": 2.720144510269165, + "learning_rate": 3.569309029249682e-07, + "loss": 0.5113, + "mean_token_accuracy": 0.8381936550140381, + "num_tokens": 32068623.0, + "step": 843 + }, + { + "epoch": 0.10736547513039053, + "ewc_loss": 0.009086621925234795, + "ewc_loss_diag": 6.556510925292969e-06, + "ewc_loss_parallel": 2.5253428248106502e-05, + "grad_norm": 2.7013847827911377, + "learning_rate": 3.573548113607461e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8418033123016357, + "num_tokens": 32106658.0, + "step": 844 + }, + { + "epoch": 0.10749268540898105, + "ewc_loss": 0.009097038768231869, + "ewc_loss_diag": 6.556510925292969e-06, + "ewc_loss_parallel": 2.5357598133268766e-05, + "grad_norm": 2.685537576675415, + "learning_rate": 3.577787197965239e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8276568651199341, + "num_tokens": 32142031.0, + "step": 845 + }, + { + "epoch": 0.10761989568757156, + "ewc_loss": 0.009167074225842953, + "ewc_loss_diag": 6.616115570068359e-06, + "ewc_loss_parallel": 2.5447598090977408e-05, + "grad_norm": 2.5678744316101074, + "learning_rate": 3.5820262823230177e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.852778434753418, + "num_tokens": 32182686.0, + "step": 846 + }, + { + "epoch": 0.10774710596616206, + "ewc_loss": 0.009111574850976467, + "ewc_loss_diag": 6.586313247680664e-06, + "ewc_loss_parallel": 2.5197781724273227e-05, + "grad_norm": 2.7971835136413574, + "learning_rate": 3.5862653666807967e-07, + "loss": 0.5672, + "mean_token_accuracy": 0.8194170594215393, + "num_tokens": 32218027.0, + "step": 847 + }, + { + "epoch": 0.10787431624475258, + "ewc_loss": 0.009185152128338814, + "ewc_loss_diag": 6.586313247680664e-06, + "ewc_loss_parallel": 2.59335502050817e-05, + "grad_norm": 2.6867880821228027, + "learning_rate": 3.5905044510385757e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.8365799188613892, + "num_tokens": 32257854.0, + "step": 848 + }, + { + "epoch": 0.10800152652334309, + "ewc_loss": 0.009112754836678505, + "ewc_loss_diag": 6.556510925292969e-06, + "ewc_loss_parallel": 2.551474972278811e-05, + "grad_norm": 2.6625256538391113, + "learning_rate": 3.594743535396354e-07, + "loss": 0.5063, + "mean_token_accuracy": 0.83905428647995, + "num_tokens": 32295019.0, + "step": 849 + }, + { + "epoch": 0.10812873680193359, + "ewc_loss": 0.009121181443333626, + "ewc_loss_diag": 6.556510925292969e-06, + "ewc_loss_parallel": 2.559901804488618e-05, + "grad_norm": 2.648120880126953, + "learning_rate": 3.5989826197541326e-07, + "loss": 0.5549, + "mean_token_accuracy": 0.8243465423583984, + "num_tokens": 32335652.0, + "step": 850 + }, + { + "epoch": 0.1082559470805241, + "ewc_loss": 0.009161500260233879, + "ewc_loss_diag": 6.586313247680664e-06, + "ewc_loss_parallel": 2.569703065091744e-05, + "grad_norm": 2.6098053455352783, + "learning_rate": 3.6032217041119116e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8359440565109253, + "num_tokens": 32377249.0, + "step": 851 + }, + { + "epoch": 0.10838315735911462, + "ewc_loss": 0.009151693433523178, + "ewc_loss_diag": 6.586313247680664e-06, + "ewc_loss_parallel": 2.559896711318288e-05, + "grad_norm": 2.586638927459717, + "learning_rate": 3.6074607884696906e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8504011631011963, + "num_tokens": 32422467.0, + "step": 852 + }, + { + "epoch": 0.10851036763770512, + "ewc_loss": 0.009160943329334259, + "ewc_loss_diag": 6.586313247680664e-06, + "ewc_loss_parallel": 2.56914700003108e-05, + "grad_norm": 2.60337495803833, + "learning_rate": 3.611699872827469e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8494198322296143, + "num_tokens": 32462120.0, + "step": 853 + }, + { + "epoch": 0.10863757791629564, + "ewc_loss": 0.00917091779410839, + "ewc_loss_diag": 6.586313247680664e-06, + "ewc_loss_parallel": 2.5791214284254238e-05, + "grad_norm": 2.8093817234039307, + "learning_rate": 3.6159389571852475e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.8173922300338745, + "num_tokens": 32498484.0, + "step": 854 + }, + { + "epoch": 0.10876478819488615, + "ewc_loss": 0.00922534428536892, + "ewc_loss_diag": 6.586313247680664e-06, + "ewc_loss_parallel": 2.6335468646720983e-05, + "grad_norm": 2.648076057434082, + "learning_rate": 3.6201780415430265e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8348366618156433, + "num_tokens": 32541536.0, + "step": 855 + }, + { + "epoch": 0.10889199847347665, + "ewc_loss": 0.009231306612491608, + "ewc_loss_diag": 6.645917892456055e-06, + "ewc_loss_parallel": 2.578474595793523e-05, + "grad_norm": 2.695823907852173, + "learning_rate": 3.6244171259008055e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8314698338508606, + "num_tokens": 32579750.0, + "step": 856 + }, + { + "epoch": 0.10901920875206716, + "ewc_loss": 0.00925514753907919, + "ewc_loss_diag": 6.645917892456055e-06, + "ewc_loss_parallel": 2.6023155442089774e-05, + "grad_norm": 2.687720537185669, + "learning_rate": 3.628656210258584e-07, + "loss": 0.5512, + "mean_token_accuracy": 0.8232021331787109, + "num_tokens": 32617518.0, + "step": 857 + }, + { + "epoch": 0.10914641903065768, + "ewc_loss": 0.009251773357391357, + "ewc_loss_diag": 6.645917892456055e-06, + "ewc_loss_parallel": 2.5989411369664595e-05, + "grad_norm": 2.6522557735443115, + "learning_rate": 3.6328952946163624e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8420554995536804, + "num_tokens": 32653931.0, + "step": 858 + }, + { + "epoch": 0.1092736293092482, + "ewc_loss": 0.009245837107300758, + "ewc_loss_diag": 6.645917892456055e-06, + "ewc_loss_parallel": 2.593004501250107e-05, + "grad_norm": 2.658895254135132, + "learning_rate": 3.6371343789741414e-07, + "loss": 0.481, + "mean_token_accuracy": 0.845951497554779, + "num_tokens": 32693797.0, + "step": 859 + }, + { + "epoch": 0.1094008395878387, + "ewc_loss": 0.009259478189051151, + "ewc_loss_diag": 6.645917892456055e-06, + "ewc_loss_parallel": 2.606646012281999e-05, + "grad_norm": 2.679198741912842, + "learning_rate": 3.6413734633319204e-07, + "loss": 0.5277, + "mean_token_accuracy": 0.8333169221878052, + "num_tokens": 32732432.0, + "step": 860 + }, + { + "epoch": 0.10952804986642921, + "ewc_loss": 0.009303191676735878, + "ewc_loss_diag": 6.67572021484375e-06, + "ewc_loss_parallel": 2.6198425985057838e-05, + "grad_norm": 2.7779107093811035, + "learning_rate": 3.645612547689699e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8413677215576172, + "num_tokens": 32769225.0, + "step": 861 + }, + { + "epoch": 0.10965526014501972, + "ewc_loss": 0.009318198077380657, + "ewc_loss_diag": 6.67572021484375e-06, + "ewc_loss_parallel": 2.6348483515903354e-05, + "grad_norm": 2.6144516468048096, + "learning_rate": 3.6498516320474773e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8485841155052185, + "num_tokens": 32812027.0, + "step": 862 + }, + { + "epoch": 0.10978247042361022, + "ewc_loss": 0.009271730668842793, + "ewc_loss_diag": 6.67572021484375e-06, + "ewc_loss_parallel": 2.588380993984174e-05, + "grad_norm": 2.8995237350463867, + "learning_rate": 3.6540907164052563e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8546723127365112, + "num_tokens": 32849334.0, + "step": 863 + }, + { + "epoch": 0.10990968070220074, + "ewc_loss": 0.009426157921552658, + "ewc_loss_diag": 6.735324859619141e-06, + "ewc_loss_parallel": 2.6817731850314885e-05, + "grad_norm": 2.781134843826294, + "learning_rate": 3.658329800763035e-07, + "loss": 0.5481, + "mean_token_accuracy": 0.8235636353492737, + "num_tokens": 32888422.0, + "step": 864 + }, + { + "epoch": 0.11003689098079125, + "ewc_loss": 0.00936727412045002, + "ewc_loss_diag": 6.735324859619141e-06, + "ewc_loss_parallel": 2.622888860059902e-05, + "grad_norm": 2.767921209335327, + "learning_rate": 3.662568885120814e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8382214307785034, + "num_tokens": 32922672.0, + "step": 865 + }, + { + "epoch": 0.11016410125938175, + "ewc_loss": 0.009361060336232185, + "ewc_loss_diag": 6.735324859619141e-06, + "ewc_loss_parallel": 2.61667555605527e-05, + "grad_norm": 2.7215657234191895, + "learning_rate": 3.666807969478592e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8262618780136108, + "num_tokens": 32961694.0, + "step": 866 + }, + { + "epoch": 0.11029131153797227, + "ewc_loss": 0.009387973695993423, + "ewc_loss_diag": 6.765127182006836e-06, + "ewc_loss_parallel": 2.6130708647542633e-05, + "grad_norm": 2.673344850540161, + "learning_rate": 3.671047053836371e-07, + "loss": 0.526, + "mean_token_accuracy": 0.8296123743057251, + "num_tokens": 32998595.0, + "step": 867 + }, + { + "epoch": 0.11041852181656278, + "ewc_loss": 0.009381071664392948, + "ewc_loss_diag": 6.765127182006836e-06, + "ewc_loss_parallel": 2.6061696189572103e-05, + "grad_norm": 2.6511943340301514, + "learning_rate": 3.6752861381941497e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8426785469055176, + "num_tokens": 33035884.0, + "step": 868 + }, + { + "epoch": 0.11054573209515328, + "ewc_loss": 0.00939332414418459, + "ewc_loss_diag": 6.765127182006836e-06, + "ewc_loss_parallel": 2.618421785882674e-05, + "grad_norm": 3.0493245124816895, + "learning_rate": 3.6795252225519287e-07, + "loss": 0.548, + "mean_token_accuracy": 0.8288952112197876, + "num_tokens": 33073335.0, + "step": 869 + }, + { + "epoch": 0.1106729423737438, + "ewc_loss": 0.009494089521467686, + "ewc_loss_diag": 6.765127182006836e-06, + "ewc_loss_parallel": 2.719187250477262e-05, + "grad_norm": 2.6418395042419434, + "learning_rate": 3.6837643069097077e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8543515801429749, + "num_tokens": 33111443.0, + "step": 870 + }, + { + "epoch": 0.11080015265233431, + "ewc_loss": 0.009355672635138035, + "ewc_loss_diag": 6.765127182006836e-06, + "ewc_loss_parallel": 2.5807699785218574e-05, + "grad_norm": 2.6180338859558105, + "learning_rate": 3.688003391267486e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.841157853603363, + "num_tokens": 33156362.0, + "step": 871 + }, + { + "epoch": 0.11092736293092482, + "ewc_loss": 0.00941961258649826, + "ewc_loss_diag": 6.794929504394531e-06, + "ewc_loss_parallel": 2.6141929993173108e-05, + "grad_norm": 2.5893208980560303, + "learning_rate": 3.6922424756252646e-07, + "loss": 0.4783, + "mean_token_accuracy": 0.8461257219314575, + "num_tokens": 33204968.0, + "step": 872 + }, + { + "epoch": 0.11105457320951533, + "ewc_loss": 0.009432004764676094, + "ewc_loss_diag": 6.794929504394531e-06, + "ewc_loss_parallel": 2.6265852284268476e-05, + "grad_norm": 2.8044707775115967, + "learning_rate": 3.6964815599830436e-07, + "loss": 0.5239, + "mean_token_accuracy": 0.8308970928192139, + "num_tokens": 33239265.0, + "step": 873 + }, + { + "epoch": 0.11118178348810584, + "ewc_loss": 0.009519847109913826, + "ewc_loss_diag": 6.8247318267822266e-06, + "ewc_loss_parallel": 2.683909951883834e-05, + "grad_norm": 2.7813029289245605, + "learning_rate": 3.7007206443408226e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.853672981262207, + "num_tokens": 33275897.0, + "step": 874 + }, + { + "epoch": 0.11130899376669635, + "ewc_loss": 0.009492321871221066, + "ewc_loss_diag": 6.8247318267822266e-06, + "ewc_loss_parallel": 2.6563846404314972e-05, + "grad_norm": 2.697824001312256, + "learning_rate": 3.704959728698601e-07, + "loss": 0.5792, + "mean_token_accuracy": 0.8167681694030762, + "num_tokens": 33313966.0, + "step": 875 + }, + { + "epoch": 0.11143620404528685, + "ewc_loss": 0.009452796541154385, + "ewc_loss_diag": 6.794929504394531e-06, + "ewc_loss_parallel": 2.647376459208317e-05, + "grad_norm": 2.7231650352478027, + "learning_rate": 3.7091988130563795e-07, + "loss": 0.5269, + "mean_token_accuracy": 0.8336833715438843, + "num_tokens": 33352909.0, + "step": 876 + }, + { + "epoch": 0.11156341432387737, + "ewc_loss": 0.009531856514513493, + "ewc_loss_diag": 6.854534149169922e-06, + "ewc_loss_parallel": 2.6654015528038144e-05, + "grad_norm": 2.615086317062378, + "learning_rate": 3.7134378974141585e-07, + "loss": 0.5098, + "mean_token_accuracy": 0.8398805856704712, + "num_tokens": 33400120.0, + "step": 877 + }, + { + "epoch": 0.11169062460246788, + "ewc_loss": 0.009537940844893456, + "ewc_loss_diag": 6.884336471557617e-06, + "ewc_loss_parallel": 2.640968705236446e-05, + "grad_norm": 2.795069456100464, + "learning_rate": 3.7176769817719375e-07, + "loss": 0.5349, + "mean_token_accuracy": 0.8307166695594788, + "num_tokens": 33436703.0, + "step": 878 + }, + { + "epoch": 0.11181783488105838, + "ewc_loss": 0.009511927142739296, + "ewc_loss_diag": 6.794929504394531e-06, + "ewc_loss_parallel": 2.7065074391430244e-05, + "grad_norm": 2.7392988204956055, + "learning_rate": 3.7219160661297154e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8574267625808716, + "num_tokens": 33473515.0, + "step": 879 + }, + { + "epoch": 0.1119450451596489, + "ewc_loss": 0.009482716210186481, + "ewc_loss_diag": 6.794929504394531e-06, + "ewc_loss_parallel": 2.677296106412541e-05, + "grad_norm": 2.7093565464019775, + "learning_rate": 3.7261551504874944e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8409071564674377, + "num_tokens": 33507659.0, + "step": 880 + }, + { + "epoch": 0.11207225543823941, + "ewc_loss": 0.00947977602481842, + "ewc_loss_diag": 6.794929504394531e-06, + "ewc_loss_parallel": 2.674356619536411e-05, + "grad_norm": 2.7062909603118896, + "learning_rate": 3.7303942348452734e-07, + "loss": 0.453, + "mean_token_accuracy": 0.852502167224884, + "num_tokens": 33542430.0, + "step": 881 + }, + { + "epoch": 0.11219946571682991, + "ewc_loss": 0.009492547251284122, + "ewc_loss_diag": 6.794929504394531e-06, + "ewc_loss_parallel": 2.6871271984418854e-05, + "grad_norm": 2.6679532527923584, + "learning_rate": 3.7346333192030524e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8551496267318726, + "num_tokens": 33582579.0, + "step": 882 + }, + { + "epoch": 0.11232667599542043, + "ewc_loss": 0.009546589106321335, + "ewc_loss_diag": 6.854534149169922e-06, + "ewc_loss_parallel": 2.6801340936799534e-05, + "grad_norm": 2.620492696762085, + "learning_rate": 3.7388724035608303e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8483209013938904, + "num_tokens": 33625928.0, + "step": 883 + }, + { + "epoch": 0.11245388627401094, + "ewc_loss": 0.009505156427621841, + "ewc_loss_diag": 6.8247318267822266e-06, + "ewc_loss_parallel": 2.6692192477639765e-05, + "grad_norm": 2.7392897605895996, + "learning_rate": 3.7431114879186093e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.8339810371398926, + "num_tokens": 33661325.0, + "step": 884 + }, + { + "epoch": 0.11258109655260146, + "ewc_loss": 0.009581652469933033, + "ewc_loss_diag": 6.854534149169922e-06, + "ewc_loss_parallel": 2.7151974791195244e-05, + "grad_norm": 2.7649240493774414, + "learning_rate": 3.7473505722763883e-07, + "loss": 0.5467, + "mean_token_accuracy": 0.8265842795372009, + "num_tokens": 33696472.0, + "step": 885 + }, + { + "epoch": 0.11270830683119196, + "ewc_loss": 0.00958256796002388, + "ewc_loss_diag": 6.854534149169922e-06, + "ewc_loss_parallel": 2.7161127945873886e-05, + "grad_norm": 2.7226648330688477, + "learning_rate": 3.7515896566341673e-07, + "loss": 0.5001, + "mean_token_accuracy": 0.8374384641647339, + "num_tokens": 33732637.0, + "step": 886 + }, + { + "epoch": 0.11283551710978247, + "ewc_loss": 0.009630812332034111, + "ewc_loss_diag": 6.9141387939453125e-06, + "ewc_loss_parallel": 2.7033222067984752e-05, + "grad_norm": 2.6765763759613037, + "learning_rate": 3.755828740991945e-07, + "loss": 0.5253, + "mean_token_accuracy": 0.832754373550415, + "num_tokens": 33775106.0, + "step": 887 + }, + { + "epoch": 0.11296272738837299, + "ewc_loss": 0.00965933408588171, + "ewc_loss_diag": 6.943941116333008e-06, + "ewc_loss_parallel": 2.7013262297259644e-05, + "grad_norm": 2.7256908416748047, + "learning_rate": 3.760067825349724e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8455896377563477, + "num_tokens": 33811297.0, + "step": 888 + }, + { + "epoch": 0.11308993766696349, + "ewc_loss": 0.00971222948282957, + "ewc_loss_diag": 6.973743438720703e-06, + "ewc_loss_parallel": 2.7237038011662662e-05, + "grad_norm": 2.5898683071136475, + "learning_rate": 3.764306909707503e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8545900583267212, + "num_tokens": 33854038.0, + "step": 889 + }, + { + "epoch": 0.113217147945554, + "ewc_loss": 0.009678898379206657, + "ewc_loss_diag": 6.973743438720703e-06, + "ewc_loss_parallel": 2.690372639335692e-05, + "grad_norm": 2.8151895999908447, + "learning_rate": 3.768545994065282e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8374578952789307, + "num_tokens": 33884929.0, + "step": 890 + }, + { + "epoch": 0.11334435822414451, + "ewc_loss": 0.009758153930306435, + "ewc_loss_diag": 6.973743438720703e-06, + "ewc_loss_parallel": 2.7696289180312306e-05, + "grad_norm": 2.7023706436157227, + "learning_rate": 3.77278507842306e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8365994691848755, + "num_tokens": 33926065.0, + "step": 891 + }, + { + "epoch": 0.11347156850273502, + "ewc_loss": 0.009710384532809258, + "ewc_loss_diag": 6.973743438720703e-06, + "ewc_loss_parallel": 2.7218593459110707e-05, + "grad_norm": 2.6748106479644775, + "learning_rate": 3.777024162780839e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8478862047195435, + "num_tokens": 33965557.0, + "step": 892 + }, + { + "epoch": 0.11359877878132553, + "ewc_loss": 0.009682307951152325, + "ewc_loss_diag": 6.943941116333008e-06, + "ewc_loss_parallel": 2.7243000658927485e-05, + "grad_norm": 2.719282865524292, + "learning_rate": 3.781263247138618e-07, + "loss": 0.475, + "mean_token_accuracy": 0.846487283706665, + "num_tokens": 34004132.0, + "step": 893 + }, + { + "epoch": 0.11372598905991604, + "ewc_loss": 0.009710079059004784, + "ewc_loss_diag": 6.943941116333008e-06, + "ewc_loss_parallel": 2.7520707590156235e-05, + "grad_norm": 2.7735817432403564, + "learning_rate": 3.785502331496397e-07, + "loss": 0.502, + "mean_token_accuracy": 0.8332573175430298, + "num_tokens": 34039781.0, + "step": 894 + }, + { + "epoch": 0.11385319933850654, + "ewc_loss": 0.009717958979308605, + "ewc_loss_diag": 6.943941116333008e-06, + "ewc_loss_parallel": 2.759950984909665e-05, + "grad_norm": 2.698561668395996, + "learning_rate": 3.789741415854175e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8312194347381592, + "num_tokens": 34080487.0, + "step": 895 + }, + { + "epoch": 0.11398040961709706, + "ewc_loss": 0.009694906882941723, + "ewc_loss_diag": 6.943941116333008e-06, + "ewc_loss_parallel": 2.7368991140974686e-05, + "grad_norm": 2.678433895111084, + "learning_rate": 3.793980500211954e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8382591605186462, + "num_tokens": 34119913.0, + "step": 896 + }, + { + "epoch": 0.11410761989568757, + "ewc_loss": 0.009705722332000732, + "ewc_loss_diag": 6.943941116333008e-06, + "ewc_loss_parallel": 2.7477148250909522e-05, + "grad_norm": 2.7028160095214844, + "learning_rate": 3.798219584569733e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8577316999435425, + "num_tokens": 34155811.0, + "step": 897 + }, + { + "epoch": 0.11423483017427809, + "ewc_loss": 0.009750045835971832, + "ewc_loss_diag": 6.973743438720703e-06, + "ewc_loss_parallel": 2.761520227068104e-05, + "grad_norm": 2.8011515140533447, + "learning_rate": 3.8024586689275115e-07, + "loss": 0.5599, + "mean_token_accuracy": 0.8270871639251709, + "num_tokens": 34195022.0, + "step": 898 + }, + { + "epoch": 0.11436204045286859, + "ewc_loss": 0.0098046213388443, + "ewc_loss_diag": 7.033348083496094e-06, + "ewc_loss_parallel": 2.785577999020461e-05, + "grad_norm": 2.7297492027282715, + "learning_rate": 3.80669775328529e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8398466110229492, + "num_tokens": 34229182.0, + "step": 899 + }, + { + "epoch": 0.1144892507314591, + "ewc_loss": 0.00978110171854496, + "ewc_loss_diag": 7.033348083496094e-06, + "ewc_loss_parallel": 2.7620588298304938e-05, + "grad_norm": 2.693666696548462, + "learning_rate": 3.810936837643069e-07, + "loss": 0.457, + "mean_token_accuracy": 0.851717472076416, + "num_tokens": 34266931.0, + "step": 900 + }, + { + "epoch": 0.11461646101004962, + "ewc_loss": 0.009786194190382957, + "ewc_loss_diag": 7.033348083496094e-06, + "ewc_loss_parallel": 2.767151454463601e-05, + "grad_norm": 2.716820478439331, + "learning_rate": 3.815175922000848e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.853653073310852, + "num_tokens": 34301705.0, + "step": 901 + }, + { + "epoch": 0.11474367128864012, + "ewc_loss": 0.009801194071769714, + "ewc_loss_diag": 7.033348083496094e-06, + "ewc_loss_parallel": 2.782151204883121e-05, + "grad_norm": 2.7230899333953857, + "learning_rate": 3.8194150063586264e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8346850872039795, + "num_tokens": 34340975.0, + "step": 902 + }, + { + "epoch": 0.11487088156723063, + "ewc_loss": 0.009796185418963432, + "ewc_loss_diag": 7.033348083496094e-06, + "ewc_loss_parallel": 2.7771427994593978e-05, + "grad_norm": 2.7148873805999756, + "learning_rate": 3.823654090716405e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8383709192276001, + "num_tokens": 34380024.0, + "step": 903 + }, + { + "epoch": 0.11499809184582115, + "ewc_loss": 0.009802693501114845, + "ewc_loss_diag": 7.033348083496094e-06, + "ewc_loss_parallel": 2.7836500521516427e-05, + "grad_norm": 2.761847734451294, + "learning_rate": 3.827893175074184e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8406660556793213, + "num_tokens": 34419654.0, + "step": 904 + }, + { + "epoch": 0.11512530212441165, + "ewc_loss": 0.009815791621804237, + "ewc_loss_diag": 7.033348083496094e-06, + "ewc_loss_parallel": 2.7967485948465765e-05, + "grad_norm": 2.763209819793701, + "learning_rate": 3.832132259431963e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8564378023147583, + "num_tokens": 34453034.0, + "step": 905 + }, + { + "epoch": 0.11525251240300216, + "ewc_loss": 0.009880405850708485, + "ewc_loss_diag": 7.092952728271484e-06, + "ewc_loss_parallel": 2.800327456498053e-05, + "grad_norm": 3.064443588256836, + "learning_rate": 3.8363713437897413e-07, + "loss": 0.5546, + "mean_token_accuracy": 0.8224054574966431, + "num_tokens": 34493694.0, + "step": 906 + }, + { + "epoch": 0.11537972268159268, + "ewc_loss": 0.009958966635167599, + "ewc_loss_diag": 7.092952728271484e-06, + "ewc_loss_parallel": 2.878888335544616e-05, + "grad_norm": 2.7996530532836914, + "learning_rate": 3.8406104281475197e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8279938101768494, + "num_tokens": 34531636.0, + "step": 907 + }, + { + "epoch": 0.11550693296018319, + "ewc_loss": 0.009826458990573883, + "ewc_loss_diag": 7.063150405883789e-06, + "ewc_loss_parallel": 2.7768985091825016e-05, + "grad_norm": 2.736585855484009, + "learning_rate": 3.8448495125052987e-07, + "loss": 0.5319, + "mean_token_accuracy": 0.8284282684326172, + "num_tokens": 34570851.0, + "step": 908 + }, + { + "epoch": 0.11563414323877369, + "ewc_loss": 0.009828547015786171, + "ewc_loss_diag": 7.063150405883789e-06, + "ewc_loss_parallel": 2.7789868909167126e-05, + "grad_norm": 2.6891181468963623, + "learning_rate": 3.8490885968630777e-07, + "loss": 0.484, + "mean_token_accuracy": 0.846188485622406, + "num_tokens": 34613314.0, + "step": 909 + }, + { + "epoch": 0.1157613535173642, + "ewc_loss": 0.009893232956528664, + "ewc_loss_diag": 7.12275505065918e-06, + "ewc_loss_parallel": 2.7826372388517484e-05, + "grad_norm": 2.681407928466797, + "learning_rate": 3.853327681220856e-07, + "loss": 0.5, + "mean_token_accuracy": 0.8386249542236328, + "num_tokens": 34656701.0, + "step": 910 + }, + { + "epoch": 0.11588856379595472, + "ewc_loss": 0.00990261696279049, + "ewc_loss_diag": 7.12275505065918e-06, + "ewc_loss_parallel": 2.7920217689825222e-05, + "grad_norm": 2.87451434135437, + "learning_rate": 3.8575667655786346e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8488689661026001, + "num_tokens": 34688595.0, + "step": 911 + }, + { + "epoch": 0.11601577407454522, + "ewc_loss": 0.009965503588318825, + "ewc_loss_diag": 7.12275505065918e-06, + "ewc_loss_parallel": 2.8549075068440288e-05, + "grad_norm": 2.6938483715057373, + "learning_rate": 3.8618058499364136e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8426438570022583, + "num_tokens": 34726609.0, + "step": 912 + }, + { + "epoch": 0.11614298435313573, + "ewc_loss": 0.00989608746021986, + "ewc_loss_diag": 7.12275505065918e-06, + "ewc_loss_parallel": 2.7854914151248522e-05, + "grad_norm": 2.7422261238098145, + "learning_rate": 3.8660449342941926e-07, + "loss": 0.5486, + "mean_token_accuracy": 0.8292853832244873, + "num_tokens": 34769213.0, + "step": 913 + }, + { + "epoch": 0.11627019463172625, + "ewc_loss": 0.009933278895914555, + "ewc_loss_diag": 7.12275505065918e-06, + "ewc_loss_parallel": 2.8226833819644526e-05, + "grad_norm": 2.7865958213806152, + "learning_rate": 3.870284018651971e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8345494270324707, + "num_tokens": 34806722.0, + "step": 914 + }, + { + "epoch": 0.11639740491031675, + "ewc_loss": 0.009984385222196579, + "ewc_loss_diag": 7.152557373046875e-06, + "ewc_loss_parallel": 2.843271977326367e-05, + "grad_norm": 2.705744981765747, + "learning_rate": 3.8745231030097495e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8492992520332336, + "num_tokens": 34846408.0, + "step": 915 + }, + { + "epoch": 0.11652461518890726, + "ewc_loss": 0.009981407783925533, + "ewc_loss_diag": 7.18235969543457e-06, + "ewc_loss_parallel": 2.8097769245505333e-05, + "grad_norm": 2.747349500656128, + "learning_rate": 3.8787621873675285e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8489869236946106, + "num_tokens": 34883791.0, + "step": 916 + }, + { + "epoch": 0.11665182546749778, + "ewc_loss": 0.010016880929470062, + "ewc_loss_diag": 7.18235969543457e-06, + "ewc_loss_parallel": 2.845249582605902e-05, + "grad_norm": 2.7109599113464355, + "learning_rate": 3.883001271725307e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8513054251670837, + "num_tokens": 34922768.0, + "step": 917 + }, + { + "epoch": 0.11677903574608828, + "ewc_loss": 0.010000629350543022, + "ewc_loss_diag": 7.18235969543457e-06, + "ewc_loss_parallel": 2.828998003678862e-05, + "grad_norm": 2.6588034629821777, + "learning_rate": 3.887240356083086e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8568843603134155, + "num_tokens": 34962184.0, + "step": 918 + }, + { + "epoch": 0.11690624602467879, + "ewc_loss": 0.009998946450650692, + "ewc_loss_diag": 7.18235969543457e-06, + "ewc_loss_parallel": 2.8273152565816417e-05, + "grad_norm": 2.920654773712158, + "learning_rate": 3.8914794404408644e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.843337893486023, + "num_tokens": 34991455.0, + "step": 919 + }, + { + "epoch": 0.1170334563032693, + "ewc_loss": 0.01012017298489809, + "ewc_loss_diag": 7.212162017822266e-06, + "ewc_loss_parallel": 2.918024438258726e-05, + "grad_norm": 2.9091193675994873, + "learning_rate": 3.8957185247986434e-07, + "loss": 0.5156, + "mean_token_accuracy": 0.8346143364906311, + "num_tokens": 35025044.0, + "step": 920 + }, + { + "epoch": 0.11716066658185982, + "ewc_loss": 0.010117615573108196, + "ewc_loss_diag": 7.241964340209961e-06, + "ewc_loss_parallel": 2.8849495720351115e-05, + "grad_norm": 2.7299389839172363, + "learning_rate": 3.899957609156422e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8425707221031189, + "num_tokens": 35064564.0, + "step": 921 + }, + { + "epoch": 0.11728787686045032, + "ewc_loss": 0.010007469914853573, + "ewc_loss_diag": 7.18235969543457e-06, + "ewc_loss_parallel": 2.8358388590277173e-05, + "grad_norm": 2.6823043823242188, + "learning_rate": 3.904196693514201e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.8397601246833801, + "num_tokens": 35109786.0, + "step": 922 + }, + { + "epoch": 0.11741508713904084, + "ewc_loss": 0.010050238110125065, + "ewc_loss_diag": 7.212162017822266e-06, + "ewc_loss_parallel": 2.8480897526605986e-05, + "grad_norm": 2.7734906673431396, + "learning_rate": 3.9084357778719793e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.8201642036437988, + "num_tokens": 35148351.0, + "step": 923 + }, + { + "epoch": 0.11754229741763135, + "ewc_loss": 0.01014654990285635, + "ewc_loss_diag": 7.271766662597656e-06, + "ewc_loss_parallel": 2.8833663236582652e-05, + "grad_norm": 2.8060405254364014, + "learning_rate": 3.9126748622297583e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8316663503646851, + "num_tokens": 35183464.0, + "step": 924 + }, + { + "epoch": 0.11766950769622185, + "ewc_loss": 0.010145909152925014, + "ewc_loss_diag": 7.271766662597656e-06, + "ewc_loss_parallel": 2.882725493691396e-05, + "grad_norm": 2.7449841499328613, + "learning_rate": 3.916913946587537e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8335294723510742, + "num_tokens": 35218561.0, + "step": 925 + }, + { + "epoch": 0.11779671797481236, + "ewc_loss": 0.010125608183443546, + "ewc_loss_diag": 7.271766662597656e-06, + "ewc_loss_parallel": 2.862424480554182e-05, + "grad_norm": 2.7349870204925537, + "learning_rate": 3.921153030945316e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8306019306182861, + "num_tokens": 35257740.0, + "step": 926 + }, + { + "epoch": 0.11792392825340288, + "ewc_loss": 0.010136190801858902, + "ewc_loss_diag": 7.271766662597656e-06, + "ewc_loss_parallel": 2.873006997106131e-05, + "grad_norm": 2.827152967453003, + "learning_rate": 3.925392115303094e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8449142575263977, + "num_tokens": 35291632.0, + "step": 927 + }, + { + "epoch": 0.11805113853199338, + "ewc_loss": 0.010167454369366169, + "ewc_loss_diag": 7.271766662597656e-06, + "ewc_loss_parallel": 2.904270513681695e-05, + "grad_norm": 2.7713778018951416, + "learning_rate": 3.929631199660873e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8396506309509277, + "num_tokens": 35327686.0, + "step": 928 + }, + { + "epoch": 0.1181783488105839, + "ewc_loss": 0.010151797905564308, + "ewc_loss_diag": 7.271766662597656e-06, + "ewc_loss_parallel": 2.8886148356832564e-05, + "grad_norm": 2.7299373149871826, + "learning_rate": 3.9338702840186517e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8403297662734985, + "num_tokens": 35364480.0, + "step": 929 + }, + { + "epoch": 0.11830555908917441, + "ewc_loss": 0.010184433311223984, + "ewc_loss_diag": 7.3015689849853516e-06, + "ewc_loss_parallel": 2.8907326850458048e-05, + "grad_norm": 2.641592025756836, + "learning_rate": 3.9381093683764307e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8408191800117493, + "num_tokens": 35406017.0, + "step": 930 + }, + { + "epoch": 0.11843276936776491, + "ewc_loss": 0.010173003189265728, + "ewc_loss_diag": 7.3015689849853516e-06, + "ewc_loss_parallel": 2.8793019737349823e-05, + "grad_norm": 2.7220559120178223, + "learning_rate": 3.942348452734209e-07, + "loss": 0.555, + "mean_token_accuracy": 0.823093056678772, + "num_tokens": 35448447.0, + "step": 931 + }, + { + "epoch": 0.11855997964635542, + "ewc_loss": 0.010274345986545086, + "ewc_loss_diag": 7.361173629760742e-06, + "ewc_loss_parallel": 2.9196093237260357e-05, + "grad_norm": 2.7792956829071045, + "learning_rate": 3.946587537091988e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8370701670646667, + "num_tokens": 35488120.0, + "step": 932 + }, + { + "epoch": 0.11868718992494594, + "ewc_loss": 0.010321632027626038, + "ewc_loss_diag": 7.3909759521484375e-06, + "ewc_loss_parallel": 2.936377677542623e-05, + "grad_norm": 2.64987850189209, + "learning_rate": 3.9508266214497666e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8455173969268799, + "num_tokens": 35531759.0, + "step": 933 + }, + { + "epoch": 0.11881440020353645, + "ewc_loss": 0.010269448161125183, + "ewc_loss_diag": 7.3909759521484375e-06, + "ewc_loss_parallel": 2.884194145735819e-05, + "grad_norm": 2.705073118209839, + "learning_rate": 3.9550657058075456e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8529841303825378, + "num_tokens": 35571476.0, + "step": 934 + }, + { + "epoch": 0.11894161048212695, + "ewc_loss": 0.01036739256232977, + "ewc_loss_diag": 7.450580596923828e-06, + "ewc_loss_parallel": 2.921103805419989e-05, + "grad_norm": 2.815373182296753, + "learning_rate": 3.959304790165324e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8402823805809021, + "num_tokens": 35608440.0, + "step": 935 + }, + { + "epoch": 0.11906882076071747, + "ewc_loss": 0.010433454066514969, + "ewc_loss_diag": 7.4803829193115234e-06, + "ewc_loss_parallel": 2.9566475859610364e-05, + "grad_norm": 2.8526690006256104, + "learning_rate": 3.9635438745231025e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.8415261507034302, + "num_tokens": 35641861.0, + "step": 936 + }, + { + "epoch": 0.11919603103930798, + "ewc_loss": 0.010422918014228344, + "ewc_loss_diag": 7.4803829193115234e-06, + "ewc_loss_parallel": 2.946111089840997e-05, + "grad_norm": 2.7405190467834473, + "learning_rate": 3.9677829588808815e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.832999587059021, + "num_tokens": 35684626.0, + "step": 937 + }, + { + "epoch": 0.11932324131789848, + "ewc_loss": 0.01039212103933096, + "ewc_loss_diag": 7.4803829193115234e-06, + "ewc_loss_parallel": 2.9153145078453235e-05, + "grad_norm": 2.816298723220825, + "learning_rate": 3.9720220432386605e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8309621810913086, + "num_tokens": 35718125.0, + "step": 938 + }, + { + "epoch": 0.119450451596489, + "ewc_loss": 0.01055234670639038, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 2.9534698114730418e-05, + "grad_norm": 11.718361854553223, + "learning_rate": 3.976261127596439e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8511856198310852, + "num_tokens": 35752818.0, + "step": 939 + }, + { + "epoch": 0.11957766187507951, + "ewc_loss": 0.011669714003801346, + "ewc_loss_diag": 7.510185241699219e-06, + "ewc_loss_parallel": 4.162390177953057e-05, + "grad_norm": 3.424267292022705, + "learning_rate": 3.9805002119542174e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8546762466430664, + "num_tokens": 35787097.0, + "step": 940 + }, + { + "epoch": 0.11970487215367001, + "ewc_loss": 0.010892266407608986, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 3.293389454483986e-05, + "grad_norm": 2.6854782104492188, + "learning_rate": 3.9847392963119964e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.8292605876922607, + "num_tokens": 35825887.0, + "step": 941 + }, + { + "epoch": 0.11983208243226053, + "ewc_loss": 0.010498368181288242, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 2.899491119023878e-05, + "grad_norm": 2.93672251701355, + "learning_rate": 3.9889783806697754e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.849243700504303, + "num_tokens": 35863592.0, + "step": 942 + }, + { + "epoch": 0.11995929271085104, + "ewc_loss": 0.010799828916788101, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 3.200952414772473e-05, + "grad_norm": 2.973167657852173, + "learning_rate": 3.993217465027554e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8282678723335266, + "num_tokens": 35904483.0, + "step": 943 + }, + { + "epoch": 0.12008650298944154, + "ewc_loss": 0.010691690258681774, + "ewc_loss_diag": 7.569789886474609e-06, + "ewc_loss_parallel": 3.1233310437528417e-05, + "grad_norm": 2.8759098052978516, + "learning_rate": 3.9974565493853323e-07, + "loss": 0.44, + "mean_token_accuracy": 0.8586419820785522, + "num_tokens": 35938662.0, + "step": 944 + }, + { + "epoch": 0.12021371326803205, + "ewc_loss": 0.010585508309304714, + "ewc_loss_diag": 7.569789886474609e-06, + "ewc_loss_parallel": 3.017148992512375e-05, + "grad_norm": 2.8540303707122803, + "learning_rate": 4.0016956337431113e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8387198448181152, + "num_tokens": 35975176.0, + "step": 945 + }, + { + "epoch": 0.12034092354662257, + "ewc_loss": 0.010588640347123146, + "ewc_loss_diag": 7.569789886474609e-06, + "ewc_loss_parallel": 3.020281292265281e-05, + "grad_norm": 2.926251173019409, + "learning_rate": 4.0059347181008903e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8428507447242737, + "num_tokens": 36008893.0, + "step": 946 + }, + { + "epoch": 0.12046813382521308, + "ewc_loss": 0.010598675347864628, + "ewc_loss_diag": 7.569789886474609e-06, + "ewc_loss_parallel": 3.0303157473099418e-05, + "grad_norm": 2.8070321083068848, + "learning_rate": 4.010173802458669e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8575148582458496, + "num_tokens": 36049882.0, + "step": 947 + }, + { + "epoch": 0.12059534410380358, + "ewc_loss": 0.010535111650824547, + "ewc_loss_diag": 7.569789886474609e-06, + "ewc_loss_parallel": 2.966752253996674e-05, + "grad_norm": 2.8274102210998535, + "learning_rate": 4.014412886816447e-07, + "loss": 0.5612, + "mean_token_accuracy": 0.8247431516647339, + "num_tokens": 36090615.0, + "step": 948 + }, + { + "epoch": 0.1207225543823941, + "ewc_loss": 0.01055297814309597, + "ewc_loss_diag": 7.569789886474609e-06, + "ewc_loss_parallel": 2.9846185498172417e-05, + "grad_norm": 2.8102550506591797, + "learning_rate": 4.018651971174226e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8418751955032349, + "num_tokens": 36128159.0, + "step": 949 + }, + { + "epoch": 0.12084976466098461, + "ewc_loss": 0.01057041622698307, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 2.971539470308926e-05, + "grad_norm": 2.722163677215576, + "learning_rate": 4.022891055532005e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8385661244392395, + "num_tokens": 36172851.0, + "step": 950 + }, + { + "epoch": 0.12097697493957511, + "ewc_loss": 0.010552993975579739, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 2.954116826003883e-05, + "grad_norm": 2.78196382522583, + "learning_rate": 4.0271301398897837e-07, + "loss": 0.5564, + "mean_token_accuracy": 0.8263104557991028, + "num_tokens": 36214282.0, + "step": 951 + }, + { + "epoch": 0.12110418521816563, + "ewc_loss": 0.010573236271739006, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 2.9743590857833624e-05, + "grad_norm": 2.861051321029663, + "learning_rate": 4.031369224247562e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8421692848205566, + "num_tokens": 36248619.0, + "step": 952 + }, + { + "epoch": 0.12123139549675614, + "ewc_loss": 0.010593172162771225, + "ewc_loss_diag": 7.599592208862305e-06, + "ewc_loss_parallel": 2.9942948458483443e-05, + "grad_norm": 2.7091660499572754, + "learning_rate": 4.035608308605341e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8343148827552795, + "num_tokens": 36289070.0, + "step": 953 + }, + { + "epoch": 0.12135860577534664, + "ewc_loss": 0.010578539222478867, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 2.9491451641661115e-05, + "grad_norm": 2.8741796016693115, + "learning_rate": 4.03984739296312e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8382161855697632, + "num_tokens": 36330883.0, + "step": 954 + }, + { + "epoch": 0.12148581605393716, + "ewc_loss": 0.010647861286997795, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.018466850335244e-05, + "grad_norm": 2.7388484477996826, + "learning_rate": 4.044086477320898e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8430162072181702, + "num_tokens": 36371507.0, + "step": 955 + }, + { + "epoch": 0.12161302633252767, + "ewc_loss": 0.010582194663584232, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 2.952799877675716e-05, + "grad_norm": 2.8423171043395996, + "learning_rate": 4.048325561678677e-07, + "loss": 0.517, + "mean_token_accuracy": 0.8353707790374756, + "num_tokens": 36407262.0, + "step": 956 + }, + { + "epoch": 0.12174023661111817, + "ewc_loss": 0.010639414191246033, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.0100201911409386e-05, + "grad_norm": 4.353466987609863, + "learning_rate": 4.052564646036456e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8365030884742737, + "num_tokens": 36448089.0, + "step": 957 + }, + { + "epoch": 0.12186744688970869, + "ewc_loss": 0.01104973815381527, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.420343273319304e-05, + "grad_norm": 2.961392879486084, + "learning_rate": 4.056803730394235e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.830345869064331, + "num_tokens": 36484022.0, + "step": 958 + }, + { + "epoch": 0.1219946571682992, + "ewc_loss": 0.010584305971860886, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 2.9549117243732326e-05, + "grad_norm": 2.70674729347229, + "learning_rate": 4.061042814752013e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8410626649856567, + "num_tokens": 36519385.0, + "step": 959 + }, + { + "epoch": 0.12212186744688971, + "ewc_loss": 0.010580982081592083, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 2.951587703137193e-05, + "grad_norm": 3.003410577774048, + "learning_rate": 4.065281899109792e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8366059064865112, + "num_tokens": 36551943.0, + "step": 960 + }, + { + "epoch": 0.12224907772548022, + "ewc_loss": 0.01074952445924282, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.120129622402601e-05, + "grad_norm": 2.7363390922546387, + "learning_rate": 4.069520983467571e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.8550376296043396, + "num_tokens": 36590927.0, + "step": 961 + }, + { + "epoch": 0.12237628800407073, + "ewc_loss": 0.010593918152153492, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 2.9645234462805092e-05, + "grad_norm": 2.8018276691436768, + "learning_rate": 4.07376006782535e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8434193134307861, + "num_tokens": 36625199.0, + "step": 962 + }, + { + "epoch": 0.12250349828266124, + "ewc_loss": 0.010655668564140797, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.026273952855263e-05, + "grad_norm": 2.847437620162964, + "learning_rate": 4.077999152183128e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8557744026184082, + "num_tokens": 36658981.0, + "step": 963 + }, + { + "epoch": 0.12263070856125174, + "ewc_loss": 0.01066911406815052, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.039719013031572e-05, + "grad_norm": 2.6633732318878174, + "learning_rate": 4.082238236540907e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8413042426109314, + "num_tokens": 36704214.0, + "step": 964 + }, + { + "epoch": 0.12275791883984226, + "ewc_loss": 0.010668466798961163, + "ewc_loss_diag": 7.68899917602539e-06, + "ewc_loss_parallel": 2.978037082357332e-05, + "grad_norm": 2.7751259803771973, + "learning_rate": 4.086477320898686e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8455736637115479, + "num_tokens": 36744782.0, + "step": 965 + }, + { + "epoch": 0.12288512911843277, + "ewc_loss": 0.010677450336515903, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.048055623366963e-05, + "grad_norm": 2.774240016937256, + "learning_rate": 4.090716405256465e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8350167274475098, + "num_tokens": 36783645.0, + "step": 966 + }, + { + "epoch": 0.12301233939702327, + "ewc_loss": 0.01066621020436287, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.0368160878424533e-05, + "grad_norm": 2.8461952209472656, + "learning_rate": 4.094955489614243e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8446640968322754, + "num_tokens": 36817539.0, + "step": 967 + }, + { + "epoch": 0.12313954967561379, + "ewc_loss": 0.01074978057295084, + "ewc_loss_diag": 7.68899917602539e-06, + "ewc_loss_parallel": 3.059350638068281e-05, + "grad_norm": 2.7368721961975098, + "learning_rate": 4.099194573972022e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8448670506477356, + "num_tokens": 36859144.0, + "step": 968 + }, + { + "epoch": 0.1232667599542043, + "ewc_loss": 0.010665600188076496, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.036205998796504e-05, + "grad_norm": 2.7975881099700928, + "learning_rate": 4.1034336583298007e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.833045244216919, + "num_tokens": 36900833.0, + "step": 969 + }, + { + "epoch": 0.1233939702327948, + "ewc_loss": 0.010706150904297829, + "ewc_loss_diag": 7.62939453125e-06, + "ewc_loss_parallel": 3.076756183872931e-05, + "grad_norm": 2.762454032897949, + "learning_rate": 4.1076727426875797e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8354547619819641, + "num_tokens": 36940935.0, + "step": 970 + }, + { + "epoch": 0.12352118051138532, + "ewc_loss": 0.010754420422017574, + "ewc_loss_diag": 7.68899917602539e-06, + "ewc_loss_parallel": 3.063990516238846e-05, + "grad_norm": 2.8417012691497803, + "learning_rate": 4.1119118270453577e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.8155689835548401, + "num_tokens": 36979874.0, + "step": 971 + }, + { + "epoch": 0.12364839078997583, + "ewc_loss": 0.010777155868709087, + "ewc_loss_diag": 7.68899917602539e-06, + "ewc_loss_parallel": 3.0867260647937655e-05, + "grad_norm": 2.7527735233306885, + "learning_rate": 4.1161509114031366e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8544485569000244, + "num_tokens": 37017917.0, + "step": 972 + }, + { + "epoch": 0.12377560106856635, + "ewc_loss": 0.010748395696282387, + "ewc_loss_diag": 7.68899917602539e-06, + "ewc_loss_parallel": 3.057966387132183e-05, + "grad_norm": 2.8564271926879883, + "learning_rate": 4.1203899957609156e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.8341368436813354, + "num_tokens": 37054914.0, + "step": 973 + }, + { + "epoch": 0.12390281134715685, + "ewc_loss": 0.010785985738039017, + "ewc_loss_diag": 7.68899917602539e-06, + "ewc_loss_parallel": 3.0955561669543386e-05, + "grad_norm": 2.8784961700439453, + "learning_rate": 4.124629080118694e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8451269268989563, + "num_tokens": 37088277.0, + "step": 974 + }, + { + "epoch": 0.12403002162574736, + "ewc_loss": 0.010847313329577446, + "ewc_loss_diag": 7.748603820800781e-06, + "ewc_loss_parallel": 3.0958490242483094e-05, + "grad_norm": 2.8149538040161133, + "learning_rate": 4.1288681644764726e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8337434530258179, + "num_tokens": 37126963.0, + "step": 975 + }, + { + "epoch": 0.12415723190433788, + "ewc_loss": 0.010822782292962074, + "ewc_loss_diag": 7.748603820800781e-06, + "ewc_loss_parallel": 3.071317041758448e-05, + "grad_norm": 2.8477301597595215, + "learning_rate": 4.1331072488342515e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.8324334621429443, + "num_tokens": 37161948.0, + "step": 976 + }, + { + "epoch": 0.12428444218292838, + "ewc_loss": 0.010915289632976055, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.10278992401436e-05, + "grad_norm": 2.7660791873931885, + "learning_rate": 4.1373463331920305e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8569782972335815, + "num_tokens": 37198552.0, + "step": 977 + }, + { + "epoch": 0.12441165246151889, + "ewc_loss": 0.010874181054532528, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.061681127292104e-05, + "grad_norm": 2.769031524658203, + "learning_rate": 4.141585417549809e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.834090530872345, + "num_tokens": 37235990.0, + "step": 978 + }, + { + "epoch": 0.1245388627401094, + "ewc_loss": 0.010892128571867943, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.079628004343249e-05, + "grad_norm": 2.873014450073242, + "learning_rate": 4.1458245019075875e-07, + "loss": 0.475, + "mean_token_accuracy": 0.845730185508728, + "num_tokens": 37269664.0, + "step": 979 + }, + { + "epoch": 0.1246660730186999, + "ewc_loss": 0.010937453247606754, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.1249535823008046e-05, + "grad_norm": 2.76765775680542, + "learning_rate": 4.1500635862653664e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8396363258361816, + "num_tokens": 37308647.0, + "step": 980 + }, + { + "epoch": 0.12479328329729042, + "ewc_loss": 0.010896043851971626, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.083544288529083e-05, + "grad_norm": 2.77051043510437, + "learning_rate": 4.1543026706231454e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8543649315834045, + "num_tokens": 37345953.0, + "step": 981 + }, + { + "epoch": 0.12492049357588093, + "ewc_loss": 0.010919286869466305, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.1067869713297114e-05, + "grad_norm": 2.808708429336548, + "learning_rate": 4.158541754980924e-07, + "loss": 0.4514, + "mean_token_accuracy": 0.8574049472808838, + "num_tokens": 37382735.0, + "step": 982 + }, + { + "epoch": 0.12504770385447145, + "ewc_loss": 0.010938254185020924, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.1257543014362454e-05, + "grad_norm": 2.847977876663208, + "learning_rate": 4.1627808393387024e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8316348791122437, + "num_tokens": 37419605.0, + "step": 983 + }, + { + "epoch": 0.12517491413306195, + "ewc_loss": 0.010947251692414284, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.1347521144198254e-05, + "grad_norm": 2.8894777297973633, + "learning_rate": 4.1670199236964813e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8328343629837036, + "num_tokens": 37455395.0, + "step": 984 + }, + { + "epoch": 0.12530212441165245, + "ewc_loss": 0.010956796817481518, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.144297079415992e-05, + "grad_norm": 2.7647552490234375, + "learning_rate": 4.1712590080542603e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8508462905883789, + "num_tokens": 37492208.0, + "step": 985 + }, + { + "epoch": 0.12542933469024298, + "ewc_loss": 0.010924775153398514, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.1122755899559706e-05, + "grad_norm": 2.741765022277832, + "learning_rate": 4.175498092412039e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.840101957321167, + "num_tokens": 37533675.0, + "step": 986 + }, + { + "epoch": 0.12555654496883348, + "ewc_loss": 0.011004570871591568, + "ewc_loss_diag": 7.867813110351562e-06, + "ewc_loss_parallel": 3.1310355552705005e-05, + "grad_norm": 2.8394694328308105, + "learning_rate": 4.179737176769817e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8486657738685608, + "num_tokens": 37572792.0, + "step": 987 + }, + { + "epoch": 0.12568375524742398, + "ewc_loss": 0.01104531716555357, + "ewc_loss_diag": 7.867813110351562e-06, + "ewc_loss_parallel": 3.17178200930357e-05, + "grad_norm": 2.7451465129852295, + "learning_rate": 4.183976261127596e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8475079536437988, + "num_tokens": 37611481.0, + "step": 988 + }, + { + "epoch": 0.1258109655260145, + "ewc_loss": 0.011004828847944736, + "ewc_loss_diag": 7.867813110351562e-06, + "ewc_loss_parallel": 3.131293851765804e-05, + "grad_norm": 2.978118658065796, + "learning_rate": 4.1882153454853747e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8248315453529358, + "num_tokens": 37648253.0, + "step": 989 + }, + { + "epoch": 0.125938175804605, + "ewc_loss": 0.011024987325072289, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.212487354176119e-05, + "grad_norm": 2.7787773609161377, + "learning_rate": 4.1924544298431537e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8461323380470276, + "num_tokens": 37687086.0, + "step": 990 + }, + { + "epoch": 0.12606538608319554, + "ewc_loss": 0.010935217142105103, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.1227169529302046e-05, + "grad_norm": 2.726773262023926, + "learning_rate": 4.196693514200932e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8474743366241455, + "num_tokens": 37730928.0, + "step": 991 + }, + { + "epoch": 0.12619259636178604, + "ewc_loss": 0.01101587526500225, + "ewc_loss_diag": 7.867813110351562e-06, + "ewc_loss_parallel": 3.142340574413538e-05, + "grad_norm": 2.7930641174316406, + "learning_rate": 4.200932598558711e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8374271392822266, + "num_tokens": 37771857.0, + "step": 992 + }, + { + "epoch": 0.12631980664037654, + "ewc_loss": 0.010991349816322327, + "ewc_loss_diag": 7.808208465576172e-06, + "ewc_loss_parallel": 3.1788498745299876e-05, + "grad_norm": 2.730234384536743, + "learning_rate": 4.2051716829164896e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8425613641738892, + "num_tokens": 37817517.0, + "step": 993 + }, + { + "epoch": 0.12644701691896706, + "ewc_loss": 0.011019449681043625, + "ewc_loss_diag": 7.867813110351562e-06, + "ewc_loss_parallel": 3.1459148885915056e-05, + "grad_norm": 2.7889046669006348, + "learning_rate": 4.2094107672742686e-07, + "loss": 0.5464, + "mean_token_accuracy": 0.831447422504425, + "num_tokens": 37855891.0, + "step": 994 + }, + { + "epoch": 0.12657422719755757, + "ewc_loss": 0.011058708652853966, + "ewc_loss_diag": 7.867813110351562e-06, + "ewc_loss_parallel": 3.185173773090355e-05, + "grad_norm": 2.8989689350128174, + "learning_rate": 4.2136498516320476e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.853508710861206, + "num_tokens": 37888719.0, + "step": 995 + }, + { + "epoch": 0.12670143747614807, + "ewc_loss": 0.011084173806011677, + "ewc_loss_diag": 7.867813110351562e-06, + "ewc_loss_parallel": 3.210638533346355e-05, + "grad_norm": 2.739919900894165, + "learning_rate": 4.217888935989826e-07, + "loss": 0.5185, + "mean_token_accuracy": 0.8335475921630859, + "num_tokens": 37932968.0, + "step": 996 + }, + { + "epoch": 0.1268286477547386, + "ewc_loss": 0.011149236932396889, + "ewc_loss_diag": 7.987022399902344e-06, + "ewc_loss_parallel": 3.1536317692371085e-05, + "grad_norm": 2.8343610763549805, + "learning_rate": 4.2221280203476045e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8365164995193481, + "num_tokens": 37974297.0, + "step": 997 + }, + { + "epoch": 0.1269558580333291, + "ewc_loss": 0.011258991435170174, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.2023504900280386e-05, + "grad_norm": 2.7677836418151855, + "learning_rate": 4.2263671047053835e-07, + "loss": 0.5261, + "mean_token_accuracy": 0.8324819803237915, + "num_tokens": 38020294.0, + "step": 998 + }, + { + "epoch": 0.1270830683119196, + "ewc_loss": 0.011244199238717556, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.1875588319962844e-05, + "grad_norm": 2.722485065460205, + "learning_rate": 4.2306061890631625e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8398873209953308, + "num_tokens": 38063322.0, + "step": 999 + }, + { + "epoch": 0.12721027859051012, + "ewc_loss": 0.011238102801144123, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.1814619433134794e-05, + "grad_norm": 2.8788113594055176, + "learning_rate": 4.234845273420941e-07, + "loss": 0.5934, + "mean_token_accuracy": 0.8196272850036621, + "num_tokens": 38101052.0, + "step": 1000 + }, + { + "epoch": 0.12733748886910062, + "ewc_loss": 0.011297974735498428, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.2413336157333106e-05, + "grad_norm": 2.7719757556915283, + "learning_rate": 4.2390843577787194e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8344743251800537, + "num_tokens": 38145472.0, + "step": 1001 + }, + { + "epoch": 0.12746469914769112, + "ewc_loss": 0.011246779933571815, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.1901392503641546e-05, + "grad_norm": 2.807725191116333, + "learning_rate": 4.2433234421364984e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8433903455734253, + "num_tokens": 38182662.0, + "step": 1002 + }, + { + "epoch": 0.12759190942628165, + "ewc_loss": 0.011274298653006554, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.217657649656758e-05, + "grad_norm": 2.841097831726074, + "learning_rate": 4.2475625264942774e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8484123945236206, + "num_tokens": 38224735.0, + "step": 1003 + }, + { + "epoch": 0.12771911970487215, + "ewc_loss": 0.011288086883723736, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.231445953133516e-05, + "grad_norm": 3.2023518085479736, + "learning_rate": 4.251801610852056e-07, + "loss": 0.5213, + "mean_token_accuracy": 0.8344979286193848, + "num_tokens": 38255847.0, + "step": 1004 + }, + { + "epoch": 0.12784632998346265, + "ewc_loss": 0.011394120752811432, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.3374803024344146e-05, + "grad_norm": 3.0625782012939453, + "learning_rate": 4.2560406952098343e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.8322797417640686, + "num_tokens": 38296921.0, + "step": 1005 + }, + { + "epoch": 0.12797354026205318, + "ewc_loss": 0.011354515329003334, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.236839984310791e-05, + "grad_norm": 2.7698233127593994, + "learning_rate": 4.2602797795676133e-07, + "loss": 0.5453, + "mean_token_accuracy": 0.8291614055633545, + "num_tokens": 38336408.0, + "step": 1006 + }, + { + "epoch": 0.12810075054064368, + "ewc_loss": 0.01121921930462122, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.162578650517389e-05, + "grad_norm": 2.734067678451538, + "learning_rate": 4.2645188639253923e-07, + "loss": 0.4562, + "mean_token_accuracy": 0.8525686860084534, + "num_tokens": 38372809.0, + "step": 1007 + }, + { + "epoch": 0.12822796081923418, + "ewc_loss": 0.011266735382378101, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.2100950193125755e-05, + "grad_norm": 2.8504817485809326, + "learning_rate": 4.26875794828317e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8360227346420288, + "num_tokens": 38413312.0, + "step": 1008 + }, + { + "epoch": 0.1283551710978247, + "ewc_loss": 0.011356279253959656, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.238603312638588e-05, + "grad_norm": 2.755361795425415, + "learning_rate": 4.272997032640949e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8306625485420227, + "num_tokens": 38457710.0, + "step": 1009 + }, + { + "epoch": 0.1284823813764152, + "ewc_loss": 0.011315597221255302, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.197921614628285e-05, + "grad_norm": 2.788358688354492, + "learning_rate": 4.277236116998728e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.843766450881958, + "num_tokens": 38499630.0, + "step": 1010 + }, + { + "epoch": 0.1286095916550057, + "ewc_loss": 0.011272676289081573, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.2160354749066755e-05, + "grad_norm": 2.814493179321289, + "learning_rate": 4.281475201356507e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.8413702249526978, + "num_tokens": 38538133.0, + "step": 1011 + }, + { + "epoch": 0.12873680193359624, + "ewc_loss": 0.011294826865196228, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.2381864002672955e-05, + "grad_norm": 2.851494550704956, + "learning_rate": 4.285714285714285e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8309985399246216, + "num_tokens": 38576816.0, + "step": 1012 + }, + { + "epoch": 0.12886401221218674, + "ewc_loss": 0.01130777969956398, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.251138696214184e-05, + "grad_norm": 2.7896053791046143, + "learning_rate": 4.289953370072064e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8471498489379883, + "num_tokens": 38615270.0, + "step": 1013 + }, + { + "epoch": 0.12899122249077727, + "ewc_loss": 0.011285149492323399, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.228509012842551e-05, + "grad_norm": 2.8728389739990234, + "learning_rate": 4.294192454429843e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.832205593585968, + "num_tokens": 38654173.0, + "step": 1014 + }, + { + "epoch": 0.12911843276936777, + "ewc_loss": 0.011326390318572521, + "ewc_loss_diag": 8.046627044677734e-06, + "ewc_loss_parallel": 3.269749868195504e-05, + "grad_norm": 2.7189176082611084, + "learning_rate": 4.298431538787622e-07, + "loss": 0.4108, + "mean_token_accuracy": 0.8692671060562134, + "num_tokens": 38695758.0, + "step": 1015 + }, + { + "epoch": 0.12924564304795827, + "ewc_loss": 0.011341549456119537, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.2238735002465546e-05, + "grad_norm": 2.930663585662842, + "learning_rate": 4.3026706231454e-07, + "loss": 0.5545, + "mean_token_accuracy": 0.8241203427314758, + "num_tokens": 38729524.0, + "step": 1016 + }, + { + "epoch": 0.1293728533265488, + "ewc_loss": 0.011503766290843487, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.325055513414554e-05, + "grad_norm": 2.8658201694488525, + "learning_rate": 4.306909707503179e-07, + "loss": 0.5028, + "mean_token_accuracy": 0.8425127863883972, + "num_tokens": 38767500.0, + "step": 1017 + }, + { + "epoch": 0.1295000636051393, + "ewc_loss": 0.011447885073721409, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.269173976150341e-05, + "grad_norm": 2.913886308670044, + "learning_rate": 4.311148791860958e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8413205742835999, + "num_tokens": 38798683.0, + "step": 1018 + }, + { + "epoch": 0.1296272738837298, + "ewc_loss": 0.011472254991531372, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.2935437047854066e-05, + "grad_norm": 2.8638386726379395, + "learning_rate": 4.315387876218737e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.831073522567749, + "num_tokens": 38833653.0, + "step": 1019 + }, + { + "epoch": 0.12975448416232033, + "ewc_loss": 0.011458258144557476, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.2795473089208826e-05, + "grad_norm": 3.3203399181365967, + "learning_rate": 4.319626960576515e-07, + "loss": 0.5378, + "mean_token_accuracy": 0.8268699049949646, + "num_tokens": 38869428.0, + "step": 1020 + }, + { + "epoch": 0.12988169444091083, + "ewc_loss": 0.011544039472937584, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.426364128245041e-05, + "grad_norm": 2.8434224128723145, + "learning_rate": 4.323866044934294e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8447591066360474, + "num_tokens": 38909502.0, + "step": 1021 + }, + { + "epoch": 0.13000890471950133, + "ewc_loss": 0.011336879804730415, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.2192041544476524e-05, + "grad_norm": 2.7168262004852295, + "learning_rate": 4.328105129292073e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.8433389663696289, + "num_tokens": 38951730.0, + "step": 1022 + }, + { + "epoch": 0.13013611499809186, + "ewc_loss": 0.011354954913258553, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.237278724554926e-05, + "grad_norm": 2.806391716003418, + "learning_rate": 4.332344213649852e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8451218605041504, + "num_tokens": 38988981.0, + "step": 1023 + }, + { + "epoch": 0.13026332527668236, + "ewc_loss": 0.011422272771596909, + "ewc_loss_diag": 8.106231689453125e-06, + "ewc_loss_parallel": 3.3045966119971126e-05, + "grad_norm": 2.829575538635254, + "learning_rate": 4.33658329800763e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8453352451324463, + "num_tokens": 39026289.0, + "step": 1024 + }, + { + "epoch": 0.13039053555527286, + "ewc_loss": 0.01147373579442501, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.295024362159893e-05, + "grad_norm": 2.744405508041382, + "learning_rate": 4.340822382365409e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8459823131561279, + "num_tokens": 39069113.0, + "step": 1025 + }, + { + "epoch": 0.13051774583386339, + "ewc_loss": 0.011451252736151218, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.272541653132066e-05, + "grad_norm": 2.7854514122009277, + "learning_rate": 4.345061466723188e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8533105850219727, + "num_tokens": 39112991.0, + "step": 1026 + }, + { + "epoch": 0.13064495611245389, + "ewc_loss": 0.0114911999553442, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.312488479423337e-05, + "grad_norm": 2.9274868965148926, + "learning_rate": 4.3493005510809663e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8355236649513245, + "num_tokens": 39149593.0, + "step": 1027 + }, + { + "epoch": 0.1307721663910444, + "ewc_loss": 0.01152871921658516, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.3500084100523964e-05, + "grad_norm": 2.8675506114959717, + "learning_rate": 4.353539635438745e-07, + "loss": 0.487, + "mean_token_accuracy": 0.843154788017273, + "num_tokens": 39186570.0, + "step": 1028 + }, + { + "epoch": 0.13089937666963491, + "ewc_loss": 0.011495772749185562, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.3170621463796124e-05, + "grad_norm": 2.945286989212036, + "learning_rate": 4.357778719796524e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8231098055839539, + "num_tokens": 39220666.0, + "step": 1029 + }, + { + "epoch": 0.13102658694822542, + "ewc_loss": 0.011524440720677376, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.345729783177376e-05, + "grad_norm": 2.815885543823242, + "learning_rate": 4.362017804154303e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.852227509021759, + "num_tokens": 39258372.0, + "step": 1030 + }, + { + "epoch": 0.13115379722681592, + "ewc_loss": 0.0114884153008461, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.309703970444389e-05, + "grad_norm": 2.810774803161621, + "learning_rate": 4.366256888512081e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8527758717536926, + "num_tokens": 39293647.0, + "step": 1031 + }, + { + "epoch": 0.13128100750540644, + "ewc_loss": 0.011500699445605278, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.321989061078057e-05, + "grad_norm": 2.7820327281951904, + "learning_rate": 4.3704959728698597e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8412474393844604, + "num_tokens": 39332811.0, + "step": 1032 + }, + { + "epoch": 0.13140821778399694, + "ewc_loss": 0.011505300179123878, + "ewc_loss_diag": 8.165836334228516e-06, + "ewc_loss_parallel": 3.326589285279624e-05, + "grad_norm": 3.015678644180298, + "learning_rate": 4.3747350572276386e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8409116864204407, + "num_tokens": 39365971.0, + "step": 1033 + }, + { + "epoch": 0.13153542806258745, + "ewc_loss": 0.011706934310495853, + "ewc_loss_diag": 8.285045623779297e-06, + "ewc_loss_parallel": 3.406153336982243e-05, + "grad_norm": 2.8690688610076904, + "learning_rate": 4.3789741415854176e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8401679396629333, + "num_tokens": 39403178.0, + "step": 1034 + }, + { + "epoch": 0.13166263834117797, + "ewc_loss": 0.011615362018346786, + "ewc_loss_diag": 8.285045623779297e-06, + "ewc_loss_parallel": 3.314581044833176e-05, + "grad_norm": 2.812755584716797, + "learning_rate": 4.383213225943196e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8391636610031128, + "num_tokens": 39441303.0, + "step": 1035 + }, + { + "epoch": 0.13178984861976847, + "ewc_loss": 0.011626916006207466, + "ewc_loss_diag": 8.285045623779297e-06, + "ewc_loss_parallel": 3.326134174130857e-05, + "grad_norm": 2.8156867027282715, + "learning_rate": 4.3874523103009746e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8586999177932739, + "num_tokens": 39477359.0, + "step": 1036 + }, + { + "epoch": 0.13191705889835897, + "ewc_loss": 0.011769749224185944, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.346897938172333e-05, + "grad_norm": 2.8457272052764893, + "learning_rate": 4.3916913946587536e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8511335849761963, + "num_tokens": 39511687.0, + "step": 1037 + }, + { + "epoch": 0.1320442691769495, + "ewc_loss": 0.011781573295593262, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.358721369295381e-05, + "grad_norm": 2.8682260513305664, + "learning_rate": 4.3959304790165325e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8474280834197998, + "num_tokens": 39547453.0, + "step": 1038 + }, + { + "epoch": 0.13217147945554, + "ewc_loss": 0.011789286509156227, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.366434611962177e-05, + "grad_norm": 2.776348829269409, + "learning_rate": 4.400169563374311e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8436096906661987, + "num_tokens": 39588934.0, + "step": 1039 + }, + { + "epoch": 0.13229868973413053, + "ewc_loss": 0.011767450720071793, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.344599099364132e-05, + "grad_norm": 2.9006662368774414, + "learning_rate": 4.4044086477320895e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.821850061416626, + "num_tokens": 39627638.0, + "step": 1040 + }, + { + "epoch": 0.13242590001272103, + "ewc_loss": 0.011824335902929306, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.401484718779102e-05, + "grad_norm": 2.870089530944824, + "learning_rate": 4.4086477320898685e-07, + "loss": 0.4833, + "mean_token_accuracy": 0.8423088788986206, + "num_tokens": 39661696.0, + "step": 1041 + }, + { + "epoch": 0.13255311029131153, + "ewc_loss": 0.011806963942945004, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.384112278581597e-05, + "grad_norm": 2.797490358352661, + "learning_rate": 4.4128868164476474e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8468668460845947, + "num_tokens": 39702357.0, + "step": 1042 + }, + { + "epoch": 0.13268032056990206, + "ewc_loss": 0.011789410375058651, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.366558667039499e-05, + "grad_norm": 2.820514678955078, + "learning_rate": 4.417125900805426e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8485636711120605, + "num_tokens": 39744960.0, + "step": 1043 + }, + { + "epoch": 0.13280753084849256, + "ewc_loss": 0.011809340678155422, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.38648897013627e-05, + "grad_norm": 2.89947772026062, + "learning_rate": 4.4213649851632044e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.834320068359375, + "num_tokens": 39785394.0, + "step": 1044 + }, + { + "epoch": 0.13293474112708306, + "ewc_loss": 0.011838803999125957, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.415952232899144e-05, + "grad_norm": 2.8909130096435547, + "learning_rate": 4.4256040695209834e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8388818502426147, + "num_tokens": 39822698.0, + "step": 1045 + }, + { + "epoch": 0.1330619514056736, + "ewc_loss": 0.01182418130338192, + "ewc_loss_diag": 8.404254913330078e-06, + "ewc_loss_parallel": 3.4013301046798006e-05, + "grad_norm": 2.9145469665527344, + "learning_rate": 4.429843153878762e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8353432416915894, + "num_tokens": 39856874.0, + "step": 1046 + }, + { + "epoch": 0.1331891616842641, + "ewc_loss": 0.011957170441746712, + "ewc_loss_diag": 8.52346420288086e-06, + "ewc_loss_parallel": 3.4122480428777635e-05, + "grad_norm": 2.838176965713501, + "learning_rate": 4.434082238236541e-07, + "loss": 0.5437, + "mean_token_accuracy": 0.8267366886138916, + "num_tokens": 39899683.0, + "step": 1047 + }, + { + "epoch": 0.1333163719628546, + "ewc_loss": 0.0119257103651762, + "ewc_loss_diag": 8.52346420288086e-06, + "ewc_loss_parallel": 3.3807889849413186e-05, + "grad_norm": 2.851710081100464, + "learning_rate": 4.4383213225943193e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.835999608039856, + "num_tokens": 39940565.0, + "step": 1048 + }, + { + "epoch": 0.13344358224144512, + "ewc_loss": 0.01195357833057642, + "ewc_loss_diag": 8.52346420288086e-06, + "ewc_loss_parallel": 3.408656266401522e-05, + "grad_norm": 2.9051215648651123, + "learning_rate": 4.442560406952098e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.832003116607666, + "num_tokens": 39979010.0, + "step": 1049 + }, + { + "epoch": 0.13357079252003562, + "ewc_loss": 0.011973180808126926, + "ewc_loss_diag": 8.52346420288086e-06, + "ewc_loss_parallel": 3.428258787607774e-05, + "grad_norm": 2.784254312515259, + "learning_rate": 4.4467994913098767e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8490025997161865, + "num_tokens": 40021151.0, + "step": 1050 + }, + { + "epoch": 0.13369800279862612, + "ewc_loss": 0.011928226798772812, + "ewc_loss_diag": 8.52346420288086e-06, + "ewc_loss_parallel": 3.3833046472864226e-05, + "grad_norm": 3.4367916584014893, + "learning_rate": 4.4510385756676557e-07, + "loss": 0.5308, + "mean_token_accuracy": 0.8288907408714294, + "num_tokens": 40059324.0, + "step": 1051 + }, + { + "epoch": 0.13382521307721665, + "ewc_loss": 0.012311051599681377, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.644059324869886e-05, + "grad_norm": 2.808821201324463, + "learning_rate": 4.455277660025434e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.842704176902771, + "num_tokens": 40104577.0, + "step": 1052 + }, + { + "epoch": 0.13395242335580715, + "ewc_loss": 0.012036697939038277, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.369705882505514e-05, + "grad_norm": 2.882875442504883, + "learning_rate": 4.459516744383213e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8342069983482361, + "num_tokens": 40139734.0, + "step": 1053 + }, + { + "epoch": 0.13407963363439765, + "ewc_loss": 0.01213185302913189, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.464861219981685e-05, + "grad_norm": 2.9195923805236816, + "learning_rate": 4.4637558287409916e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8377981185913086, + "num_tokens": 40180724.0, + "step": 1054 + }, + { + "epoch": 0.13420684391298818, + "ewc_loss": 0.012204977683722973, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.476950587355532e-05, + "grad_norm": 2.8419337272644043, + "learning_rate": 4.4679949130987706e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8424463868141174, + "num_tokens": 40220475.0, + "step": 1055 + }, + { + "epoch": 0.13433405419157868, + "ewc_loss": 0.012108873575925827, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.4418808354530483e-05, + "grad_norm": 2.92915415763855, + "learning_rate": 4.472233997456549e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8367197513580322, + "num_tokens": 40258208.0, + "step": 1056 + }, + { + "epoch": 0.13446126447016918, + "ewc_loss": 0.012151539325714111, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.484547414700501e-05, + "grad_norm": 2.8104615211486816, + "learning_rate": 4.476473081814328e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8386313319206238, + "num_tokens": 40299127.0, + "step": 1057 + }, + { + "epoch": 0.1345884747487597, + "ewc_loss": 0.012164841406047344, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.436814222368412e-05, + "grad_norm": 2.8458380699157715, + "learning_rate": 4.4807121661721065e-07, + "loss": 0.463, + "mean_token_accuracy": 0.849635660648346, + "num_tokens": 40339331.0, + "step": 1058 + }, + { + "epoch": 0.1347156850273502, + "ewc_loss": 0.012144295498728752, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.4773034712998196e-05, + "grad_norm": 2.897962808609009, + "learning_rate": 4.4849512505298855e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.83896803855896, + "num_tokens": 40379386.0, + "step": 1059 + }, + { + "epoch": 0.1348428953059407, + "ewc_loss": 0.01216073613613844, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.4937442251248285e-05, + "grad_norm": 2.852630615234375, + "learning_rate": 4.489190334887664e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8343989849090576, + "num_tokens": 40419868.0, + "step": 1060 + }, + { + "epoch": 0.13497010558453124, + "ewc_loss": 0.012131091207265854, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.464098699623719e-05, + "grad_norm": 2.9011712074279785, + "learning_rate": 4.493429419245443e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8398473262786865, + "num_tokens": 40455346.0, + "step": 1061 + }, + { + "epoch": 0.13509731586312174, + "ewc_loss": 0.012167165987193584, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.500173625070602e-05, + "grad_norm": 2.926145076751709, + "learning_rate": 4.4976685036032214e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8482915163040161, + "num_tokens": 40490087.0, + "step": 1062 + }, + { + "epoch": 0.13522452614171224, + "ewc_loss": 0.012173601426184177, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.5066095733782277e-05, + "grad_norm": 2.9669618606567383, + "learning_rate": 4.5019075879610004e-07, + "loss": 0.526, + "mean_token_accuracy": 0.8296080827713013, + "num_tokens": 40524799.0, + "step": 1063 + }, + { + "epoch": 0.13535173642030277, + "ewc_loss": 0.012180833145976067, + "ewc_loss_diag": 8.64267349243164e-06, + "ewc_loss_parallel": 3.513840783853084e-05, + "grad_norm": 2.8696963787078857, + "learning_rate": 4.506146672318779e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8401326537132263, + "num_tokens": 40563740.0, + "step": 1064 + }, + { + "epoch": 0.13547894669889327, + "ewc_loss": 0.012208528816699982, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.480501618469134e-05, + "grad_norm": 2.8589632511138916, + "learning_rate": 4.5103857566765573e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8289345502853394, + "num_tokens": 40604743.0, + "step": 1065 + }, + { + "epoch": 0.1356061569774838, + "ewc_loss": 0.01223520003259182, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.507173096295446e-05, + "grad_norm": 2.9336793422698975, + "learning_rate": 4.5146248410343363e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8534177541732788, + "num_tokens": 40640909.0, + "step": 1066 + }, + { + "epoch": 0.1357333672560743, + "ewc_loss": 0.012253321707248688, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.5252945963293314e-05, + "grad_norm": 2.8365917205810547, + "learning_rate": 4.5188639253921153e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8505374193191528, + "num_tokens": 40678930.0, + "step": 1067 + }, + { + "epoch": 0.1358605775346648, + "ewc_loss": 0.012211279943585396, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.483252658043057e-05, + "grad_norm": 2.8626294136047363, + "learning_rate": 4.523103009749894e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8511638641357422, + "num_tokens": 40715609.0, + "step": 1068 + }, + { + "epoch": 0.13598778781325532, + "ewc_loss": 0.012295086868107319, + "ewc_loss_diag": 8.761882781982422e-06, + "ewc_loss_parallel": 3.5060242225881666e-05, + "grad_norm": 2.7973172664642334, + "learning_rate": 4.527342094107672e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8435888886451721, + "num_tokens": 40760533.0, + "step": 1069 + }, + { + "epoch": 0.13611499809184582, + "ewc_loss": 0.012211460620164871, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.48343382938765e-05, + "grad_norm": 2.8350212574005127, + "learning_rate": 4.531581178465451e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8372443914413452, + "num_tokens": 40800039.0, + "step": 1070 + }, + { + "epoch": 0.13624220837043632, + "ewc_loss": 0.012240307405591011, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.512279727146961e-05, + "grad_norm": 2.8610360622406006, + "learning_rate": 4.53582026282323e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8476521372795105, + "num_tokens": 40840207.0, + "step": 1071 + }, + { + "epoch": 0.13636941864902685, + "ewc_loss": 0.012247399426996708, + "ewc_loss_diag": 8.702278137207031e-06, + "ewc_loss_parallel": 3.519372330629267e-05, + "grad_norm": 2.9682130813598633, + "learning_rate": 4.5400593471810087e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.822540283203125, + "num_tokens": 40879559.0, + "step": 1072 + }, + { + "epoch": 0.13649662892761735, + "ewc_loss": 0.012400235049426556, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.5501372622093186e-05, + "grad_norm": 2.9110116958618164, + "learning_rate": 4.544298431538787e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.844050407409668, + "num_tokens": 40919608.0, + "step": 1073 + }, + { + "epoch": 0.13662383920620785, + "ewc_loss": 0.01230444386601448, + "ewc_loss_diag": 8.761882781982422e-06, + "ewc_loss_parallel": 3.515381831675768e-05, + "grad_norm": 2.918717861175537, + "learning_rate": 4.548537515896566e-07, + "loss": 0.5449, + "mean_token_accuracy": 0.8261623382568359, + "num_tokens": 40953648.0, + "step": 1074 + }, + { + "epoch": 0.13675104948479838, + "ewc_loss": 0.012321974150836468, + "ewc_loss_diag": 8.761882781982422e-06, + "ewc_loss_parallel": 3.53291179635562e-05, + "grad_norm": 2.936293125152588, + "learning_rate": 4.552776600254345e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8361971378326416, + "num_tokens": 40989675.0, + "step": 1075 + }, + { + "epoch": 0.13687825976338888, + "ewc_loss": 0.012393896467983723, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.543798811733723e-05, + "grad_norm": 2.87280535697937, + "learning_rate": 4.5570156846121236e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8497835397720337, + "num_tokens": 41027992.0, + "step": 1076 + }, + { + "epoch": 0.13700547004197938, + "ewc_loss": 0.012369602918624878, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.519505844451487e-05, + "grad_norm": 2.90187931060791, + "learning_rate": 4.561254768969902e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8472614288330078, + "num_tokens": 41066925.0, + "step": 1077 + }, + { + "epoch": 0.1371326803205699, + "ewc_loss": 0.012396426871418953, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.546329389791936e-05, + "grad_norm": 2.917257070541382, + "learning_rate": 4.565493853327681e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8472891449928284, + "num_tokens": 41105090.0, + "step": 1078 + }, + { + "epoch": 0.1372598905991604, + "ewc_loss": 0.012393258512020111, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.5431610740488395e-05, + "grad_norm": 2.9515504837036133, + "learning_rate": 4.56973293768546e-07, + "loss": 0.5198, + "mean_token_accuracy": 0.8379940986633301, + "num_tokens": 41141924.0, + "step": 1079 + }, + { + "epoch": 0.1373871008777509, + "ewc_loss": 0.012402164749801159, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.5520668461686e-05, + "grad_norm": 2.925201416015625, + "learning_rate": 4.573972022043238e-07, + "loss": 0.5798, + "mean_token_accuracy": 0.8163557052612305, + "num_tokens": 41181512.0, + "step": 1080 + }, + { + "epoch": 0.13751431115634144, + "ewc_loss": 0.012392004951834679, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.5419077903497964e-05, + "grad_norm": 2.8346521854400635, + "learning_rate": 4.578211106401017e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.8318880796432495, + "num_tokens": 41224015.0, + "step": 1081 + }, + { + "epoch": 0.13764152143493194, + "ewc_loss": 0.012438269332051277, + "ewc_loss_diag": 8.881092071533203e-06, + "ewc_loss_parallel": 3.5271368687972426e-05, + "grad_norm": 2.8882813453674316, + "learning_rate": 4.582450190758796e-07, + "loss": 0.5308, + "mean_token_accuracy": 0.833314061164856, + "num_tokens": 41265568.0, + "step": 1082 + }, + { + "epoch": 0.13776873171352244, + "ewc_loss": 0.012475023046135902, + "ewc_loss_diag": 8.881092071533203e-06, + "ewc_loss_parallel": 3.5638899134937674e-05, + "grad_norm": 2.8770110607147217, + "learning_rate": 4.586689275116575e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8570095896720886, + "num_tokens": 41303595.0, + "step": 1083 + }, + { + "epoch": 0.13789594199211297, + "ewc_loss": 0.012455105781555176, + "ewc_loss_diag": 8.881092071533203e-06, + "ewc_loss_parallel": 3.543973434716463e-05, + "grad_norm": 2.9053778648376465, + "learning_rate": 4.590928359474353e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8276870250701904, + "num_tokens": 41340119.0, + "step": 1084 + }, + { + "epoch": 0.13802315227070347, + "ewc_loss": 0.012421231716871262, + "ewc_loss_diag": 8.821487426757812e-06, + "ewc_loss_parallel": 3.571133493096568e-05, + "grad_norm": 2.880086898803711, + "learning_rate": 4.595167443832132e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8444281816482544, + "num_tokens": 41380184.0, + "step": 1085 + }, + { + "epoch": 0.13815036254929397, + "ewc_loss": 0.012481634505093098, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.5705019399756566e-05, + "grad_norm": 2.8521294593811035, + "learning_rate": 4.599406528189911e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.857043981552124, + "num_tokens": 41419038.0, + "step": 1086 + }, + { + "epoch": 0.1382775728278845, + "ewc_loss": 0.012471719644963741, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.560586628736928e-05, + "grad_norm": 2.8809351921081543, + "learning_rate": 4.60364561254769e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8409915566444397, + "num_tokens": 41455767.0, + "step": 1087 + }, + { + "epoch": 0.138404783106475, + "ewc_loss": 0.012494001537561417, + "ewc_loss_diag": 8.881092071533203e-06, + "ewc_loss_parallel": 3.582869248930365e-05, + "grad_norm": 2.8450536727905273, + "learning_rate": 4.607884696905468e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8447431921958923, + "num_tokens": 41497493.0, + "step": 1088 + }, + { + "epoch": 0.1385319933850655, + "ewc_loss": 0.012482251971960068, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.57111930497922e-05, + "grad_norm": 2.841566324234009, + "learning_rate": 4.612123781263247e-07, + "loss": 0.5155, + "mean_token_accuracy": 0.8344197869300842, + "num_tokens": 41542940.0, + "step": 1089 + }, + { + "epoch": 0.13865920366365603, + "ewc_loss": 0.012491609901189804, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.5804765502689406e-05, + "grad_norm": 2.851323366165161, + "learning_rate": 4.616362865621026e-07, + "loss": 0.4156, + "mean_token_accuracy": 0.8640072345733643, + "num_tokens": 41579302.0, + "step": 1090 + }, + { + "epoch": 0.13878641394224653, + "ewc_loss": 0.012515682727098465, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.604549419833347e-05, + "grad_norm": 2.9365074634552, + "learning_rate": 4.620601949978805e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8384180068969727, + "num_tokens": 41617555.0, + "step": 1091 + }, + { + "epoch": 0.13891362422083706, + "ewc_loss": 0.012536609545350075, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.625476892921142e-05, + "grad_norm": 2.9970273971557617, + "learning_rate": 4.6248410343365827e-07, + "loss": 0.566, + "mean_token_accuracy": 0.8201227188110352, + "num_tokens": 41654660.0, + "step": 1092 + }, + { + "epoch": 0.13904083449942756, + "ewc_loss": 0.012543671764433384, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.632538937381469e-05, + "grad_norm": 2.9088134765625, + "learning_rate": 4.6290801186943617e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8474972248077393, + "num_tokens": 41693773.0, + "step": 1093 + }, + { + "epoch": 0.13916804477801806, + "ewc_loss": 0.012510853819549084, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.5997207305626944e-05, + "grad_norm": 2.9710800647735596, + "learning_rate": 4.6333192030521407e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8461726307868958, + "num_tokens": 41728864.0, + "step": 1094 + }, + { + "epoch": 0.13929525505660859, + "ewc_loss": 0.012547394260764122, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.636261681094766e-05, + "grad_norm": 2.8758625984191895, + "learning_rate": 4.6375582874099196e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.8521634340286255, + "num_tokens": 41770596.0, + "step": 1095 + }, + { + "epoch": 0.1394224653351991, + "ewc_loss": 0.012510080821812153, + "ewc_loss_diag": 8.940696716308594e-06, + "ewc_loss_parallel": 3.598948023864068e-05, + "grad_norm": 2.923633575439453, + "learning_rate": 4.6417973717676976e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8494009375572205, + "num_tokens": 41806585.0, + "step": 1096 + }, + { + "epoch": 0.1395496756137896, + "ewc_loss": 0.012603375129401684, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.631207437138073e-05, + "grad_norm": 2.8962016105651855, + "learning_rate": 4.6460364561254766e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8375219106674194, + "num_tokens": 41845488.0, + "step": 1097 + }, + { + "epoch": 0.13967688589238011, + "ewc_loss": 0.012597799301147461, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.625631507020444e-05, + "grad_norm": 2.9839165210723877, + "learning_rate": 4.6502755404832556e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8468302488327026, + "num_tokens": 41880674.0, + "step": 1098 + }, + { + "epoch": 0.13980409617097062, + "ewc_loss": 0.012615972198545933, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.643803938757628e-05, + "grad_norm": 2.9317800998687744, + "learning_rate": 4.654514624841034e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8430470824241638, + "num_tokens": 41913020.0, + "step": 1099 + }, + { + "epoch": 0.13993130644956112, + "ewc_loss": 0.01260553952306509, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.633371306932531e-05, + "grad_norm": 2.9351272583007812, + "learning_rate": 4.6587537091988125e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8431698083877563, + "num_tokens": 41948483.0, + "step": 1100 + }, + { + "epoch": 0.14005851672815164, + "ewc_loss": 0.012612979859113693, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.640811701188795e-05, + "grad_norm": 2.91890549659729, + "learning_rate": 4.6629927935565915e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8510213494300842, + "num_tokens": 41985659.0, + "step": 1101 + }, + { + "epoch": 0.14018572700674214, + "ewc_loss": 0.012611901387572289, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.6397330404724926e-05, + "grad_norm": 2.909557580947876, + "learning_rate": 4.6672318779143705e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8218635320663452, + "num_tokens": 42027746.0, + "step": 1102 + }, + { + "epoch": 0.14031293728533265, + "ewc_loss": 0.012621239759027958, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.649072095868178e-05, + "grad_norm": 3.01708722114563, + "learning_rate": 4.671470962272149e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.835383951663971, + "num_tokens": 42060903.0, + "step": 1103 + }, + { + "epoch": 0.14044014756392317, + "ewc_loss": 0.012657713145017624, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.6855450161965564e-05, + "grad_norm": 2.857243299484253, + "learning_rate": 4.6757100466299274e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8406769037246704, + "num_tokens": 42102722.0, + "step": 1104 + }, + { + "epoch": 0.14056735784251367, + "ewc_loss": 0.012605083175003529, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.632915104390122e-05, + "grad_norm": 2.995357036590576, + "learning_rate": 4.6799491309877064e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.8316022157669067, + "num_tokens": 42143418.0, + "step": 1105 + }, + { + "epoch": 0.14069456812110417, + "ewc_loss": 0.012677067890763283, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.704899427248165e-05, + "grad_norm": 2.9798831939697266, + "learning_rate": 4.6841882153454854e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8493599891662598, + "num_tokens": 42177313.0, + "step": 1106 + }, + { + "epoch": 0.1408217783996947, + "ewc_loss": 0.012648837640881538, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.676669439300895e-05, + "grad_norm": 2.925285816192627, + "learning_rate": 4.688427299703264e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8545752763748169, + "num_tokens": 42214158.0, + "step": 1107 + }, + { + "epoch": 0.1409489886782852, + "ewc_loss": 0.012632525525987148, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.660357469925657e-05, + "grad_norm": 2.970201015472412, + "learning_rate": 4.6926663840610423e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8417235612869263, + "num_tokens": 42254189.0, + "step": 1108 + }, + { + "epoch": 0.1410761989568757, + "ewc_loss": 0.012673171237111092, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.7010035157436505e-05, + "grad_norm": 2.8882057666778564, + "learning_rate": 4.6969054684188213e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8313999772071838, + "num_tokens": 42296853.0, + "step": 1109 + }, + { + "epoch": 0.14120340923546623, + "ewc_loss": 0.012629610486328602, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.657442721305415e-05, + "grad_norm": 2.949873685836792, + "learning_rate": 4.7011445527766003e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.8397560119628906, + "num_tokens": 42331983.0, + "step": 1110 + }, + { + "epoch": 0.14133061951405673, + "ewc_loss": 0.012728875502943993, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.695672785397619e-05, + "grad_norm": 2.9605329036712646, + "learning_rate": 4.7053836371343787e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8410794138908386, + "num_tokens": 42370496.0, + "step": 1111 + }, + { + "epoch": 0.14145782979264723, + "ewc_loss": 0.012670505791902542, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.698338332469575e-05, + "grad_norm": 2.9114396572113037, + "learning_rate": 4.709622721492157e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.847618579864502, + "num_tokens": 42406489.0, + "step": 1112 + }, + { + "epoch": 0.14158504007123776, + "ewc_loss": 0.01271942537277937, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.686222044052556e-05, + "grad_norm": 3.012643814086914, + "learning_rate": 4.713861805849936e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8444755673408508, + "num_tokens": 42440679.0, + "step": 1113 + }, + { + "epoch": 0.14171225034982826, + "ewc_loss": 0.01270347647368908, + "ewc_loss_diag": 9.000301361083984e-06, + "ewc_loss_parallel": 3.731308606802486e-05, + "grad_norm": 2.9935293197631836, + "learning_rate": 4.718100890207715e-07, + "loss": 0.5518, + "mean_token_accuracy": 0.8248707056045532, + "num_tokens": 42476699.0, + "step": 1114 + }, + { + "epoch": 0.1418394606284188, + "ewc_loss": 0.012737227603793144, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.7040244933450595e-05, + "grad_norm": 3.0305685997009277, + "learning_rate": 4.7223399745654936e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8379735946655273, + "num_tokens": 42513348.0, + "step": 1115 + }, + { + "epoch": 0.1419666709070093, + "ewc_loss": 0.012761352583765984, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.7281493860064074e-05, + "grad_norm": 2.9582090377807617, + "learning_rate": 4.726579058923272e-07, + "loss": 0.5016, + "mean_token_accuracy": 0.841859757900238, + "num_tokens": 42550633.0, + "step": 1116 + }, + { + "epoch": 0.1420938811855998, + "ewc_loss": 0.0127280093729496, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.69480621884577e-05, + "grad_norm": 2.9606142044067383, + "learning_rate": 4.730818143281051e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8342571258544922, + "num_tokens": 42593649.0, + "step": 1117 + }, + { + "epoch": 0.14222109146419032, + "ewc_loss": 0.012747049331665039, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.713845944730565e-05, + "grad_norm": 2.9749584197998047, + "learning_rate": 4.7350572276388295e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8483882546424866, + "num_tokens": 42632379.0, + "step": 1118 + }, + { + "epoch": 0.14234830174278082, + "ewc_loss": 0.012754308059811592, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.721104440046474e-05, + "grad_norm": 2.9822463989257812, + "learning_rate": 4.7392963119966085e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.8397818207740784, + "num_tokens": 42670319.0, + "step": 1119 + }, + { + "epoch": 0.14247551202137132, + "ewc_loss": 0.012753641232848167, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.7204375985311344e-05, + "grad_norm": 2.8512604236602783, + "learning_rate": 4.7435353963543875e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8458279967308044, + "num_tokens": 42712680.0, + "step": 1120 + }, + { + "epoch": 0.14260272229996185, + "ewc_loss": 0.012722408398985863, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.6892051866743714e-05, + "grad_norm": 2.9486725330352783, + "learning_rate": 4.747774480712166e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.840133786201477, + "num_tokens": 42748241.0, + "step": 1121 + }, + { + "epoch": 0.14272993257855235, + "ewc_loss": 0.01278620958328247, + "ewc_loss_diag": 9.059906005859375e-06, + "ewc_loss_parallel": 3.753006240003742e-05, + "grad_norm": 2.9904041290283203, + "learning_rate": 4.7520135650699444e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.8318547010421753, + "num_tokens": 42783208.0, + "step": 1122 + }, + { + "epoch": 0.14285714285714285, + "ewc_loss": 0.01285607647150755, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.7618385249515995e-05, + "grad_norm": 3.0354764461517334, + "learning_rate": 4.7562526494277234e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8409876823425293, + "num_tokens": 42818570.0, + "step": 1123 + }, + { + "epoch": 0.14298435313573338, + "ewc_loss": 0.012867909856140614, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.773671778617427e-05, + "grad_norm": 3.028038501739502, + "learning_rate": 4.7604917337855024e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8557970523834229, + "num_tokens": 42852891.0, + "step": 1124 + }, + { + "epoch": 0.14311156341432388, + "ewc_loss": 0.012863604351878166, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.7693665944971144e-05, + "grad_norm": 2.928833246231079, + "learning_rate": 4.764730818143281e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8365460634231567, + "num_tokens": 42887359.0, + "step": 1125 + }, + { + "epoch": 0.14323877369291438, + "ewc_loss": 0.012844021432101727, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.74978335457854e-05, + "grad_norm": 2.925165891647339, + "learning_rate": 4.768969902501059e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8268707394599915, + "num_tokens": 42929308.0, + "step": 1126 + }, + { + "epoch": 0.1433659839715049, + "ewc_loss": 0.012863228097558022, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.7689900636905804e-05, + "grad_norm": 2.8633909225463867, + "learning_rate": 4.773208986858838e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.839148759841919, + "num_tokens": 42969373.0, + "step": 1127 + }, + { + "epoch": 0.1434931942500954, + "ewc_loss": 0.012844952754676342, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.750714677153155e-05, + "grad_norm": 2.9402880668640137, + "learning_rate": 4.777448071216617e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8420966863632202, + "num_tokens": 43010133.0, + "step": 1128 + }, + { + "epoch": 0.1436204045286859, + "ewc_loss": 0.012884044088423252, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.789805850828998e-05, + "grad_norm": 2.888331174850464, + "learning_rate": 4.781687155574396e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8319876194000244, + "num_tokens": 43053743.0, + "step": 1129 + }, + { + "epoch": 0.14374761480727644, + "ewc_loss": 0.012859775684773922, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.7655376218026504e-05, + "grad_norm": 2.878948926925659, + "learning_rate": 4.785926239932175e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8502311110496521, + "num_tokens": 43094125.0, + "step": 1130 + }, + { + "epoch": 0.14387482508586694, + "ewc_loss": 0.012868255376815796, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.77401702280622e-05, + "grad_norm": 2.9528610706329346, + "learning_rate": 4.790165324289953e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8418142795562744, + "num_tokens": 43132399.0, + "step": 1131 + }, + { + "epoch": 0.14400203536445744, + "ewc_loss": 0.012909827753901482, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.81559002562426e-05, + "grad_norm": 2.9979515075683594, + "learning_rate": 4.794404408647732e-07, + "loss": 0.5054, + "mean_token_accuracy": 0.8408887386322021, + "num_tokens": 43169036.0, + "step": 1132 + }, + { + "epoch": 0.14412924564304797, + "ewc_loss": 0.012907000258564949, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.812762224697508e-05, + "grad_norm": 2.91919207572937, + "learning_rate": 4.798643493005511e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8573082685470581, + "num_tokens": 43209397.0, + "step": 1133 + }, + { + "epoch": 0.14425645592163847, + "ewc_loss": 0.012872863560914993, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.778625250561163e-05, + "grad_norm": 2.950090169906616, + "learning_rate": 4.80288257736329e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.8317341208457947, + "num_tokens": 43246788.0, + "step": 1134 + }, + { + "epoch": 0.14438366620022897, + "ewc_loss": 0.012905953451991081, + "ewc_loss_diag": 9.119510650634766e-06, + "ewc_loss_parallel": 3.8117155781947076e-05, + "grad_norm": 2.9183709621429443, + "learning_rate": 4.807121661721068e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8555904030799866, + "num_tokens": 43282774.0, + "step": 1135 + }, + { + "epoch": 0.1445108764788195, + "ewc_loss": 0.012940675020217896, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.7854017136851326e-05, + "grad_norm": 2.9224040508270264, + "learning_rate": 4.811360746078847e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8437745571136475, + "num_tokens": 43322303.0, + "step": 1136 + }, + { + "epoch": 0.14463808675741, + "ewc_loss": 0.012953589670360088, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.7983161746524274e-05, + "grad_norm": 3.080857038497925, + "learning_rate": 4.815599830436625e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.8415042757987976, + "num_tokens": 43354611.0, + "step": 1137 + }, + { + "epoch": 0.1447652970360005, + "ewc_loss": 0.013015477918088436, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.86020474252291e-05, + "grad_norm": 2.9431140422821045, + "learning_rate": 4.819838914794405e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.84736168384552, + "num_tokens": 43390640.0, + "step": 1138 + }, + { + "epoch": 0.14489250731459102, + "ewc_loss": 0.012931350618600845, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.776076846406795e-05, + "grad_norm": 2.9469408988952637, + "learning_rate": 4.824077999152183e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8374496102333069, + "num_tokens": 43424513.0, + "step": 1139 + }, + { + "epoch": 0.14501971759318152, + "ewc_loss": 0.01297883503139019, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.82356156478636e-05, + "grad_norm": 2.9642112255096436, + "learning_rate": 4.828317083509962e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8382766842842102, + "num_tokens": 43461699.0, + "step": 1140 + }, + { + "epoch": 0.14514692787177205, + "ewc_loss": 0.012994088232517242, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.8388152461266145e-05, + "grad_norm": 3.050250768661499, + "learning_rate": 4.83255616786774e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8519399166107178, + "num_tokens": 43494681.0, + "step": 1141 + }, + { + "epoch": 0.14527413815036255, + "ewc_loss": 0.013017243705689907, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.861970253637992e-05, + "grad_norm": 2.9564225673675537, + "learning_rate": 4.83679525222552e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8538399934768677, + "num_tokens": 43531318.0, + "step": 1142 + }, + { + "epoch": 0.14540134842895305, + "ewc_loss": 0.01297064870595932, + "ewc_loss_diag": 9.179115295410156e-06, + "ewc_loss_parallel": 3.815375384874642e-05, + "grad_norm": 2.892622232437134, + "learning_rate": 4.841034336583298e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8539247512817383, + "num_tokens": 43569991.0, + "step": 1143 + }, + { + "epoch": 0.14552855870754358, + "ewc_loss": 0.01303664967417717, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.820341225946322e-05, + "grad_norm": 2.9343628883361816, + "learning_rate": 4.845273420941076e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8549147844314575, + "num_tokens": 43609978.0, + "step": 1144 + }, + { + "epoch": 0.14565576898613408, + "ewc_loss": 0.013066756539046764, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.850448047160171e-05, + "grad_norm": 3.0526702404022217, + "learning_rate": 4.849512505298855e-07, + "loss": 0.506, + "mean_token_accuracy": 0.8324475288391113, + "num_tokens": 43640419.0, + "step": 1145 + }, + { + "epoch": 0.14578297926472458, + "ewc_loss": 0.0131138376891613, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.897528586094268e-05, + "grad_norm": 2.9001762866973877, + "learning_rate": 4.853751589656634e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.849697470664978, + "num_tokens": 43682408.0, + "step": 1146 + }, + { + "epoch": 0.1459101895433151, + "ewc_loss": 0.013034619390964508, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.818311233771965e-05, + "grad_norm": 2.8854875564575195, + "learning_rate": 4.857990674014413e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8382247686386108, + "num_tokens": 43726158.0, + "step": 1147 + }, + { + "epoch": 0.1460373998219056, + "ewc_loss": 0.01307263970375061, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.856331386487e-05, + "grad_norm": 3.096233606338501, + "learning_rate": 4.862229758372191e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8272955417633057, + "num_tokens": 43766466.0, + "step": 1148 + }, + { + "epoch": 0.1461646101004961, + "ewc_loss": 0.013212902471423149, + "ewc_loss_diag": 9.298324584960938e-06, + "ewc_loss_parallel": 3.935558925149962e-05, + "grad_norm": 2.932908535003662, + "learning_rate": 4.86646884272997e-07, + "loss": 0.5232, + "mean_token_accuracy": 0.8314030170440674, + "num_tokens": 43807255.0, + "step": 1149 + }, + { + "epoch": 0.14629182037908664, + "ewc_loss": 0.013049265369772911, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.832956645055674e-05, + "grad_norm": 2.9153892993927, + "learning_rate": 4.870707927087749e-07, + "loss": 0.502, + "mean_token_accuracy": 0.8397765159606934, + "num_tokens": 43845419.0, + "step": 1150 + }, + { + "epoch": 0.14641903065767714, + "ewc_loss": 0.01309683546423912, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.880526855937205e-05, + "grad_norm": 2.948608636856079, + "learning_rate": 4.874947011445528e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8399785757064819, + "num_tokens": 43882942.0, + "step": 1151 + }, + { + "epoch": 0.14654624093626764, + "ewc_loss": 0.013091868720948696, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.8755602872697636e-05, + "grad_norm": 2.8485758304595947, + "learning_rate": 4.879186095803306e-07, + "loss": 0.4323, + "mean_token_accuracy": 0.8615263104438782, + "num_tokens": 43923109.0, + "step": 1152 + }, + { + "epoch": 0.14667345121485817, + "ewc_loss": 0.013065780512988567, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.8494716136483476e-05, + "grad_norm": 3.1128954887390137, + "learning_rate": 4.883425180161085e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8487865924835205, + "num_tokens": 43959752.0, + "step": 1153 + }, + { + "epoch": 0.14680066149344867, + "ewc_loss": 0.013252941891551018, + "ewc_loss_diag": 9.298324584960938e-06, + "ewc_loss_parallel": 3.975598519900814e-05, + "grad_norm": 3.0208449363708496, + "learning_rate": 4.887664264518864e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8401970267295837, + "num_tokens": 43996247.0, + "step": 1154 + }, + { + "epoch": 0.14692787177203917, + "ewc_loss": 0.013154711574316025, + "ewc_loss_diag": 9.298324584960938e-06, + "ewc_loss_parallel": 3.877367271343246e-05, + "grad_norm": 2.905241012573242, + "learning_rate": 4.891903348876643e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8435070514678955, + "num_tokens": 44035751.0, + "step": 1155 + }, + { + "epoch": 0.1470550820506297, + "ewc_loss": 0.013077415525913239, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.86110732506495e-05, + "grad_norm": 2.9764363765716553, + "learning_rate": 4.896142433234421e-07, + "loss": 0.5327, + "mean_token_accuracy": 0.8362622261047363, + "num_tokens": 44073225.0, + "step": 1156 + }, + { + "epoch": 0.1471822923292202, + "ewc_loss": 0.01319240964949131, + "ewc_loss_diag": 9.298324584960938e-06, + "ewc_loss_parallel": 3.915065826731734e-05, + "grad_norm": 3.0610976219177246, + "learning_rate": 4.9003815175922e-07, + "loss": 0.5362, + "mean_token_accuracy": 0.8351141214370728, + "num_tokens": 44111761.0, + "step": 1157 + }, + { + "epoch": 0.1473095026078107, + "ewc_loss": 0.013145428150892258, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.9291189750656486e-05, + "grad_norm": 2.9215762615203857, + "learning_rate": 4.904620601949979e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.844518780708313, + "num_tokens": 44151311.0, + "step": 1158 + }, + { + "epoch": 0.14743671288640123, + "ewc_loss": 0.013210478238761425, + "ewc_loss_diag": 9.357929229736328e-06, + "ewc_loss_parallel": 3.872099478030577e-05, + "grad_norm": 3.0916383266448975, + "learning_rate": 4.908859686307758e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.8346213698387146, + "num_tokens": 44184623.0, + "step": 1159 + }, + { + "epoch": 0.14756392316499173, + "ewc_loss": 0.013179809786379337, + "ewc_loss_diag": 9.238719940185547e-06, + "ewc_loss_parallel": 3.9635011489735916e-05, + "grad_norm": 2.9840004444122314, + "learning_rate": 4.913098770665536e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8394254446029663, + "num_tokens": 44221753.0, + "step": 1160 + }, + { + "epoch": 0.14769113344358223, + "ewc_loss": 0.013167154043912888, + "ewc_loss_diag": 9.298324584960938e-06, + "ewc_loss_parallel": 3.8898106140550226e-05, + "grad_norm": 2.9300503730773926, + "learning_rate": 4.917337855023314e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8456773161888123, + "num_tokens": 44262542.0, + "step": 1161 + }, + { + "epoch": 0.14781834372217276, + "ewc_loss": 0.0131763257086277, + "ewc_loss_diag": 9.298324584960938e-06, + "ewc_loss_parallel": 3.898981958627701e-05, + "grad_norm": 2.9963529109954834, + "learning_rate": 4.921576939381094e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8472555875778198, + "num_tokens": 44300294.0, + "step": 1162 + }, + { + "epoch": 0.14794555400076326, + "ewc_loss": 0.013221393339335918, + "ewc_loss_diag": 9.298324584960938e-06, + "ewc_loss_parallel": 3.9440496038878337e-05, + "grad_norm": 2.9846818447113037, + "learning_rate": 4.925816023738872e-07, + "loss": 0.5379, + "mean_token_accuracy": 0.8291310667991638, + "num_tokens": 44338476.0, + "step": 1163 + }, + { + "epoch": 0.14807276427935376, + "ewc_loss": 0.01327175460755825, + "ewc_loss_diag": 9.357929229736328e-06, + "ewc_loss_parallel": 3.9333757740678266e-05, + "grad_norm": 3.2401769161224365, + "learning_rate": 4.930055108096651e-07, + "loss": 0.4383, + "mean_token_accuracy": 0.8597240447998047, + "num_tokens": 44367437.0, + "step": 1164 + }, + { + "epoch": 0.1481999745579443, + "ewc_loss": 0.013368622399866581, + "ewc_loss_diag": 9.357929229736328e-06, + "ewc_loss_parallel": 4.030243508168496e-05, + "grad_norm": 3.0376219749450684, + "learning_rate": 4.934294192454429e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.8251424431800842, + "num_tokens": 44402636.0, + "step": 1165 + }, + { + "epoch": 0.1483271848365348, + "ewc_loss": 0.013250717893242836, + "ewc_loss_diag": 9.357929229736328e-06, + "ewc_loss_parallel": 3.912338797817938e-05, + "grad_norm": 2.9574458599090576, + "learning_rate": 4.938533276812209e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8441916108131409, + "num_tokens": 44438187.0, + "step": 1166 + }, + { + "epoch": 0.14845439511512531, + "ewc_loss": 0.013321436941623688, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.922023461200297e-05, + "grad_norm": 2.906991720199585, + "learning_rate": 4.942772361169987e-07, + "loss": 0.4163, + "mean_token_accuracy": 0.8618009090423584, + "num_tokens": 44478345.0, + "step": 1167 + }, + { + "epoch": 0.14858160539371582, + "ewc_loss": 0.013336176984012127, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.93676309613511e-05, + "grad_norm": 2.9200448989868164, + "learning_rate": 4.947011445527766e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8428823947906494, + "num_tokens": 44520272.0, + "step": 1168 + }, + { + "epoch": 0.14870881567230632, + "ewc_loss": 0.013346008956432343, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.9465954614570364e-05, + "grad_norm": 2.907485008239746, + "learning_rate": 4.951250529885544e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.85854172706604, + "num_tokens": 44561058.0, + "step": 1169 + }, + { + "epoch": 0.14883602595089684, + "ewc_loss": 0.013351424597203732, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.9520105929113925e-05, + "grad_norm": 3.067887783050537, + "learning_rate": 4.955489614243324e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8144752383232117, + "num_tokens": 44599371.0, + "step": 1170 + }, + { + "epoch": 0.14896323622948734, + "ewc_loss": 0.013414808548986912, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 4.01539437007159e-05, + "grad_norm": 2.9218332767486572, + "learning_rate": 4.959728698601102e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.835229754447937, + "num_tokens": 44640499.0, + "step": 1171 + }, + { + "epoch": 0.14909044650807785, + "ewc_loss": 0.013341482728719711, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.942069088225253e-05, + "grad_norm": 2.9138681888580322, + "learning_rate": 4.963967782958881e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8477144837379456, + "num_tokens": 44683164.0, + "step": 1172 + }, + { + "epoch": 0.14921765678666837, + "ewc_loss": 0.013385195285081863, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.985781586379744e-05, + "grad_norm": 2.9767181873321533, + "learning_rate": 4.968206867316659e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8426386713981628, + "num_tokens": 44721938.0, + "step": 1173 + }, + { + "epoch": 0.14934486706525887, + "ewc_loss": 0.013401911593973637, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 4.00249773520045e-05, + "grad_norm": 3.1086196899414062, + "learning_rate": 4.972445951674439e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8504800796508789, + "num_tokens": 44753506.0, + "step": 1174 + }, + { + "epoch": 0.14947207734384937, + "ewc_loss": 0.013440627604722977, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 4.04121310566552e-05, + "grad_norm": 3.03998064994812, + "learning_rate": 4.976685036032216e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8243738412857056, + "num_tokens": 44792170.0, + "step": 1175 + }, + { + "epoch": 0.1495992876224399, + "ewc_loss": 0.0133905578404665, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.991144330939278e-05, + "grad_norm": 2.9071266651153564, + "learning_rate": 4.980924120389996e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.8628201484680176, + "num_tokens": 44830733.0, + "step": 1176 + }, + { + "epoch": 0.1497264979010304, + "ewc_loss": 0.01337197795510292, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.972563717979938e-05, + "grad_norm": 2.9651570320129395, + "learning_rate": 4.985163204747774e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8582863807678223, + "num_tokens": 44869743.0, + "step": 1177 + }, + { + "epoch": 0.1498537081796209, + "ewc_loss": 0.013409709557890892, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 4.0102950151776895e-05, + "grad_norm": 2.9557042121887207, + "learning_rate": 4.989402289105554e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8497079610824585, + "num_tokens": 44913358.0, + "step": 1178 + }, + { + "epoch": 0.14998091845821143, + "ewc_loss": 0.013414867222309113, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 4.015453305328265e-05, + "grad_norm": 3.09120512008667, + "learning_rate": 4.993641373463331e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8256444334983826, + "num_tokens": 44949963.0, + "step": 1179 + }, + { + "epoch": 0.15010812873680193, + "ewc_loss": 0.013464193791151047, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 4.0647792047820985e-05, + "grad_norm": 3.0327699184417725, + "learning_rate": 4.997880457821111e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.838061511516571, + "num_tokens": 44988569.0, + "step": 1180 + }, + { + "epoch": 0.15023533901539243, + "ewc_loss": 0.013483495451509953, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.023046130896546e-05, + "grad_norm": 2.9232370853424072, + "learning_rate": 5.002119542178889e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.8456502556800842, + "num_tokens": 45032634.0, + "step": 1181 + }, + { + "epoch": 0.15036254929398296, + "ewc_loss": 0.013395337387919426, + "ewc_loss_diag": 9.417533874511719e-06, + "ewc_loss_parallel": 3.995922816102393e-05, + "grad_norm": 2.9625632762908936, + "learning_rate": 5.006358626536667e-07, + "loss": 0.4505, + "mean_token_accuracy": 0.8563975691795349, + "num_tokens": 45069685.0, + "step": 1182 + }, + { + "epoch": 0.15048975957257346, + "ewc_loss": 0.01351041067391634, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.049961353302933e-05, + "grad_norm": 2.953718423843384, + "learning_rate": 5.010597710894446e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8496079444885254, + "num_tokens": 45107481.0, + "step": 1183 + }, + { + "epoch": 0.15061696985116396, + "ewc_loss": 0.013488752767443657, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.028303374070674e-05, + "grad_norm": 2.9964210987091064, + "learning_rate": 5.014836795252225e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8523973226547241, + "num_tokens": 45145747.0, + "step": 1184 + }, + { + "epoch": 0.1507441801297545, + "ewc_loss": 0.013515487313270569, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.0550374251324683e-05, + "grad_norm": 3.026446580886841, + "learning_rate": 5.019075879610004e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8544151782989502, + "num_tokens": 45183140.0, + "step": 1185 + }, + { + "epoch": 0.150871390408345, + "ewc_loss": 0.013521090149879456, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.060640640091151e-05, + "grad_norm": 2.984473943710327, + "learning_rate": 5.023314963967783e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.8400620222091675, + "num_tokens": 45224911.0, + "step": 1186 + }, + { + "epoch": 0.1509986006869355, + "ewc_loss": 0.013498473912477493, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.038024417241104e-05, + "grad_norm": 3.099670648574829, + "learning_rate": 5.027554048325562e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.825386106967926, + "num_tokens": 45257388.0, + "step": 1187 + }, + { + "epoch": 0.15112581096552602, + "ewc_loss": 0.0135490158572793, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.088566856808029e-05, + "grad_norm": 3.005143880844116, + "learning_rate": 5.03179313268334e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8303568363189697, + "num_tokens": 45297876.0, + "step": 1188 + }, + { + "epoch": 0.15125302124411652, + "ewc_loss": 0.013502823188900948, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.042373984702863e-05, + "grad_norm": 3.0742828845977783, + "learning_rate": 5.036032217041119e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8341737985610962, + "num_tokens": 45334040.0, + "step": 1189 + }, + { + "epoch": 0.15138023152270705, + "ewc_loss": 0.013538932427763939, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.078483470948413e-05, + "grad_norm": 2.918309211730957, + "learning_rate": 5.040271301398897e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8386120200157166, + "num_tokens": 45375985.0, + "step": 1190 + }, + { + "epoch": 0.15150744180129755, + "ewc_loss": 0.013480346649885178, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.0198970964411274e-05, + "grad_norm": 2.9336493015289307, + "learning_rate": 5.044510385756676e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8480010032653809, + "num_tokens": 45419951.0, + "step": 1191 + }, + { + "epoch": 0.15163465207988805, + "ewc_loss": 0.013530844822525978, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.0703958802623674e-05, + "grad_norm": 2.9828293323516846, + "learning_rate": 5.048749470114455e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8474774360656738, + "num_tokens": 45456988.0, + "step": 1192 + }, + { + "epoch": 0.15176186235847858, + "ewc_loss": 0.013542240485548973, + "ewc_loss_diag": 9.47713851928711e-06, + "ewc_loss_parallel": 4.081791121279821e-05, + "grad_norm": 3.159374952316284, + "learning_rate": 5.052988554472234e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8397430181503296, + "num_tokens": 45488226.0, + "step": 1193 + }, + { + "epoch": 0.15188907263706908, + "ewc_loss": 0.013666527345776558, + "ewc_loss_diag": 9.5367431640625e-06, + "ewc_loss_parallel": 4.1450432036072016e-05, + "grad_norm": 3.0331194400787354, + "learning_rate": 5.057227638830013e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8420451283454895, + "num_tokens": 45524661.0, + "step": 1194 + }, + { + "epoch": 0.15201628291565958, + "ewc_loss": 0.013576903380453587, + "ewc_loss_diag": 9.5367431640625e-06, + "ewc_loss_parallel": 4.055419049109332e-05, + "grad_norm": 3.064640760421753, + "learning_rate": 5.061466723187792e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8424040675163269, + "num_tokens": 45563491.0, + "step": 1195 + }, + { + "epoch": 0.1521434931942501, + "ewc_loss": 0.013627206906676292, + "ewc_loss_diag": 9.5367431640625e-06, + "ewc_loss_parallel": 4.105723201064393e-05, + "grad_norm": 3.081514358520508, + "learning_rate": 5.065705807545569e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.851495087146759, + "num_tokens": 45596771.0, + "step": 1196 + }, + { + "epoch": 0.1522707034728406, + "ewc_loss": 0.013694248162209988, + "ewc_loss_diag": 9.59634780883789e-06, + "ewc_loss_parallel": 4.111728412681259e-05, + "grad_norm": 2.933875799179077, + "learning_rate": 5.069944891903349e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8444463014602661, + "num_tokens": 45640049.0, + "step": 1197 + }, + { + "epoch": 0.1523979137514311, + "ewc_loss": 0.01363433338701725, + "ewc_loss_diag": 9.59634780883789e-06, + "ewc_loss_parallel": 4.0518138121115044e-05, + "grad_norm": 3.0383434295654297, + "learning_rate": 5.074183976261127e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8418469429016113, + "num_tokens": 45677375.0, + "step": 1198 + }, + { + "epoch": 0.15252512403002164, + "ewc_loss": 0.013718357309699059, + "ewc_loss_diag": 9.59634780883789e-06, + "ewc_loss_parallel": 4.1358380258316174e-05, + "grad_norm": 3.0453591346740723, + "learning_rate": 5.078423060618906e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8450286388397217, + "num_tokens": 45719253.0, + "step": 1199 + }, + { + "epoch": 0.15265233430861214, + "ewc_loss": 0.013707660138607025, + "ewc_loss_diag": 9.59634780883789e-06, + "ewc_loss_parallel": 4.1251412767451257e-05, + "grad_norm": 3.019160032272339, + "learning_rate": 5.082662144976685e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8418957591056824, + "num_tokens": 45756065.0, + "step": 1200 + }, + { + "epoch": 0.15277954458720264, + "ewc_loss": 0.013760793954133987, + "ewc_loss_diag": 9.655952453613281e-06, + "ewc_loss_parallel": 4.117239950574003e-05, + "grad_norm": 3.0076804161071777, + "learning_rate": 5.086901229334464e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8494595289230347, + "num_tokens": 45791707.0, + "step": 1201 + }, + { + "epoch": 0.15290675486579317, + "ewc_loss": 0.013843802735209465, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.139212251175195e-05, + "grad_norm": 3.052239418029785, + "learning_rate": 5.091140313692243e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.841920018196106, + "num_tokens": 45828464.0, + "step": 1202 + }, + { + "epoch": 0.15303396514438367, + "ewc_loss": 0.013848381116986275, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.143791375099681e-05, + "grad_norm": 2.992593288421631, + "learning_rate": 5.095379398050022e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8470788598060608, + "num_tokens": 45867046.0, + "step": 1203 + }, + { + "epoch": 0.15316117542297417, + "ewc_loss": 0.013826625421643257, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.122035898035392e-05, + "grad_norm": 3.097036838531494, + "learning_rate": 5.099618482407799e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8382923007011414, + "num_tokens": 45900530.0, + "step": 1204 + }, + { + "epoch": 0.1532883857015647, + "ewc_loss": 0.013878143392503262, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.1735536797204986e-05, + "grad_norm": 3.016400098800659, + "learning_rate": 5.103857566765578e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8471083641052246, + "num_tokens": 45940002.0, + "step": 1205 + }, + { + "epoch": 0.1534155959801552, + "ewc_loss": 0.013841181062161922, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.1365910874446854e-05, + "grad_norm": 3.0581281185150146, + "learning_rate": 5.108096651123357e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8580360412597656, + "num_tokens": 45975299.0, + "step": 1206 + }, + { + "epoch": 0.1535428062587457, + "ewc_loss": 0.013863164000213146, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.158573938184418e-05, + "grad_norm": 3.060227155685425, + "learning_rate": 5.112335735481135e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8442502021789551, + "num_tokens": 46012693.0, + "step": 1207 + }, + { + "epoch": 0.15367001653733622, + "ewc_loss": 0.013863202184438705, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.158612864557654e-05, + "grad_norm": 2.9790380001068115, + "learning_rate": 5.116574819838915e-07, + "loss": 0.3911, + "mean_token_accuracy": 0.8743869066238403, + "num_tokens": 46051558.0, + "step": 1208 + }, + { + "epoch": 0.15379722681592672, + "ewc_loss": 0.013836164027452469, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.1315746784675866e-05, + "grad_norm": 3.062863349914551, + "learning_rate": 5.120813904196693e-07, + "loss": 0.473, + "mean_token_accuracy": 0.850189208984375, + "num_tokens": 46087173.0, + "step": 1209 + }, + { + "epoch": 0.15392443709451722, + "ewc_loss": 0.013900171965360641, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.195581641397439e-05, + "grad_norm": 3.126868486404419, + "learning_rate": 5.125052988554473e-07, + "loss": 0.5289, + "mean_token_accuracy": 0.8346551060676575, + "num_tokens": 46120482.0, + "step": 1210 + }, + { + "epoch": 0.15405164737310775, + "ewc_loss": 0.013903144747018814, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.198554597678594e-05, + "grad_norm": 3.0452897548675537, + "learning_rate": 5.12929207291225e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8472634553909302, + "num_tokens": 46154478.0, + "step": 1211 + }, + { + "epoch": 0.15417885765169825, + "ewc_loss": 0.013851447030901909, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.146856736042537e-05, + "grad_norm": 2.9350173473358154, + "learning_rate": 5.133531157270029e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.85617995262146, + "num_tokens": 46195012.0, + "step": 1212 + }, + { + "epoch": 0.15430606793028875, + "ewc_loss": 0.013858361169695854, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.153771806159057e-05, + "grad_norm": 2.9975321292877197, + "learning_rate": 5.137770241627808e-07, + "loss": 0.418, + "mean_token_accuracy": 0.8653432726860046, + "num_tokens": 46230641.0, + "step": 1213 + }, + { + "epoch": 0.15443327820887928, + "ewc_loss": 0.013903165236115456, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.198574970359914e-05, + "grad_norm": 3.0946598052978516, + "learning_rate": 5.142009325985587e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8555865287780762, + "num_tokens": 46268993.0, + "step": 1214 + }, + { + "epoch": 0.15456048848746978, + "ewc_loss": 0.013932310044765472, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.227719546179287e-05, + "grad_norm": 2.979652166366577, + "learning_rate": 5.146248410343365e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8438371419906616, + "num_tokens": 46309310.0, + "step": 1215 + }, + { + "epoch": 0.1546876987660603, + "ewc_loss": 0.013867808505892754, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.163218545727432e-05, + "grad_norm": 2.9783804416656494, + "learning_rate": 5.150487494701145e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8609529137611389, + "num_tokens": 46352415.0, + "step": 1216 + }, + { + "epoch": 0.1548149090446508, + "ewc_loss": 0.0139611242339015, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.195499423076399e-05, + "grad_norm": 3.052187442779541, + "learning_rate": 5.154726579058923e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8439699411392212, + "num_tokens": 46387838.0, + "step": 1217 + }, + { + "epoch": 0.1549421193232413, + "ewc_loss": 0.013985859230160713, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.2202347685815766e-05, + "grad_norm": 2.945296049118042, + "learning_rate": 5.158965663416703e-07, + "loss": 0.4823, + "mean_token_accuracy": 0.8459124565124512, + "num_tokens": 46432669.0, + "step": 1218 + }, + { + "epoch": 0.15506932960183184, + "ewc_loss": 0.013928020372986794, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.1623956349212676e-05, + "grad_norm": 2.9817943572998047, + "learning_rate": 5.16320474777448e-07, + "loss": 0.4391, + "mean_token_accuracy": 0.8586649894714355, + "num_tokens": 46474383.0, + "step": 1219 + }, + { + "epoch": 0.15519653988042234, + "ewc_loss": 0.013989749364554882, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.224124495522119e-05, + "grad_norm": 3.0409843921661377, + "learning_rate": 5.167443832132259e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8432411551475525, + "num_tokens": 46514263.0, + "step": 1220 + }, + { + "epoch": 0.15532375015901284, + "ewc_loss": 0.013997431844472885, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.2318071791669354e-05, + "grad_norm": 3.0283734798431396, + "learning_rate": 5.171682916490038e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8525173664093018, + "num_tokens": 46549835.0, + "step": 1221 + }, + { + "epoch": 0.15545096043760337, + "ewc_loss": 0.01392214372754097, + "ewc_loss_diag": 9.715557098388672e-06, + "ewc_loss_parallel": 4.217553941998631e-05, + "grad_norm": 3.065885543823242, + "learning_rate": 5.175922000847816e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8569810390472412, + "num_tokens": 46584811.0, + "step": 1222 + }, + { + "epoch": 0.15557817071619387, + "ewc_loss": 0.013991884887218475, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.2262592614861205e-05, + "grad_norm": 3.0411641597747803, + "learning_rate": 5.180161085205595e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8515372276306152, + "num_tokens": 46621441.0, + "step": 1223 + }, + { + "epoch": 0.15570538099478437, + "ewc_loss": 0.01398525107651949, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.21962613472715e-05, + "grad_norm": 3.0305519104003906, + "learning_rate": 5.184400169563374e-07, + "loss": 0.517, + "mean_token_accuracy": 0.8358856439590454, + "num_tokens": 46661572.0, + "step": 1224 + }, + { + "epoch": 0.1558325912733749, + "ewc_loss": 0.013983283191919327, + "ewc_loss_diag": 9.775161743164062e-06, + "ewc_loss_parallel": 4.2176579881925136e-05, + "grad_norm": 3.06355619430542, + "learning_rate": 5.188639253921153e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8419638872146606, + "num_tokens": 46702157.0, + "step": 1225 + }, + { + "epoch": 0.1559598015519654, + "ewc_loss": 0.014125609770417213, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.2379138903925195e-05, + "grad_norm": 3.0681779384613037, + "learning_rate": 5.192878338278932e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8502082824707031, + "num_tokens": 46739119.0, + "step": 1226 + }, + { + "epoch": 0.1560870118305559, + "ewc_loss": 0.0141162583604455, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.228563193464652e-05, + "grad_norm": 3.0263679027557373, + "learning_rate": 5.19711742263671e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.8334335088729858, + "num_tokens": 46783990.0, + "step": 1227 + }, + { + "epoch": 0.15621422210914643, + "ewc_loss": 0.014094042591750622, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.206347148283385e-05, + "grad_norm": 3.087764024734497, + "learning_rate": 5.201356506994488e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8452800512313843, + "num_tokens": 46816131.0, + "step": 1228 + }, + { + "epoch": 0.15634143238773693, + "ewc_loss": 0.014142017811536789, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.254322266206145e-05, + "grad_norm": 3.0583691596984863, + "learning_rate": 5.205595591352268e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.8263067007064819, + "num_tokens": 46856453.0, + "step": 1229 + }, + { + "epoch": 0.15646864266632743, + "ewc_loss": 0.014115124940872192, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.227429599268362e-05, + "grad_norm": 3.0107085704803467, + "learning_rate": 5.209834675710046e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8447265625, + "num_tokens": 46897961.0, + "step": 1230 + }, + { + "epoch": 0.15659585294491796, + "ewc_loss": 0.014105715788900852, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.2180203308817e-05, + "grad_norm": 3.103532314300537, + "learning_rate": 5.214073760067825e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8245775103569031, + "num_tokens": 46931961.0, + "step": 1231 + }, + { + "epoch": 0.15672306322350846, + "ewc_loss": 0.014160048216581345, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.2723524529719725e-05, + "grad_norm": 3.1362271308898926, + "learning_rate": 5.218312844425604e-07, + "loss": 0.482, + "mean_token_accuracy": 0.8485333919525146, + "num_tokens": 46967822.0, + "step": 1232 + }, + { + "epoch": 0.15685027350209896, + "ewc_loss": 0.01416259165853262, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.274896491551772e-05, + "grad_norm": 3.0943713188171387, + "learning_rate": 5.222551928783383e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8509388566017151, + "num_tokens": 47004511.0, + "step": 1233 + }, + { + "epoch": 0.1569774837806895, + "ewc_loss": 0.014144914224743843, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.257218461134471e-05, + "grad_norm": 3.070060968399048, + "learning_rate": 5.226791013141161e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.837826132774353, + "num_tokens": 47047244.0, + "step": 1234 + }, + { + "epoch": 0.15710469405928, + "ewc_loss": 0.014140034094452858, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.252338840160519e-05, + "grad_norm": 3.0008761882781982, + "learning_rate": 5.23103009749894e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8497737050056458, + "num_tokens": 47088206.0, + "step": 1235 + }, + { + "epoch": 0.1572319043378705, + "ewc_loss": 0.014126444235444069, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.2387484427308664e-05, + "grad_norm": 2.9660279750823975, + "learning_rate": 5.235269181856718e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8453614711761475, + "num_tokens": 47132733.0, + "step": 1236 + }, + { + "epoch": 0.15735911461646102, + "ewc_loss": 0.014149200171232224, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.261504363967106e-05, + "grad_norm": 3.130244255065918, + "learning_rate": 5.239508266214498e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.82679682970047, + "num_tokens": 47166840.0, + "step": 1237 + }, + { + "epoch": 0.15748632489505152, + "ewc_loss": 0.014224743470549583, + "ewc_loss_diag": 9.894371032714844e-06, + "ewc_loss_parallel": 4.337048812885769e-05, + "grad_norm": 3.0875041484832764, + "learning_rate": 5.243747350572276e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.842132031917572, + "num_tokens": 47203444.0, + "step": 1238 + }, + { + "epoch": 0.15761353517364202, + "ewc_loss": 0.014246162958443165, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.297432678868063e-05, + "grad_norm": 3.067068099975586, + "learning_rate": 5.247986434930056e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8462816476821899, + "num_tokens": 47240380.0, + "step": 1239 + }, + { + "epoch": 0.15774074545223254, + "ewc_loss": 0.014249470084905624, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.30073996540159e-05, + "grad_norm": 3.1146717071533203, + "learning_rate": 5.252225519287834e-07, + "loss": 0.5523, + "mean_token_accuracy": 0.8247624635696411, + "num_tokens": 47277886.0, + "step": 1240 + }, + { + "epoch": 0.15786795573082305, + "ewc_loss": 0.014283625409007072, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.3348951294319704e-05, + "grad_norm": 3.088921546936035, + "learning_rate": 5.256464603645613e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8498149514198303, + "num_tokens": 47315261.0, + "step": 1241 + }, + { + "epoch": 0.15799516600941357, + "ewc_loss": 0.014266446232795715, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.317715865909122e-05, + "grad_norm": 3.0004308223724365, + "learning_rate": 5.260703688003391e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8453501462936401, + "num_tokens": 47357489.0, + "step": 1242 + }, + { + "epoch": 0.15812237628800407, + "ewc_loss": 0.014233043417334557, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.284312672098167e-05, + "grad_norm": 3.1618354320526123, + "learning_rate": 5.26494277236117e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8426435589790344, + "num_tokens": 47389851.0, + "step": 1243 + }, + { + "epoch": 0.15824958656659457, + "ewc_loss": 0.014334820210933685, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.3860894948011264e-05, + "grad_norm": 3.156010389328003, + "learning_rate": 5.269181856718948e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.8321676254272461, + "num_tokens": 47427491.0, + "step": 1244 + }, + { + "epoch": 0.1583767968451851, + "ewc_loss": 0.014287039637565613, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.338309736340307e-05, + "grad_norm": 3.0511679649353027, + "learning_rate": 5.273420941076727e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8523003458976746, + "num_tokens": 47462790.0, + "step": 1245 + }, + { + "epoch": 0.1585040071237756, + "ewc_loss": 0.014255726709961891, + "ewc_loss_diag": 9.953975677490234e-06, + "ewc_loss_parallel": 4.3069961975561455e-05, + "grad_norm": 2.992326259613037, + "learning_rate": 5.277660025434506e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.8616368770599365, + "num_tokens": 47502240.0, + "step": 1246 + }, + { + "epoch": 0.1586312174023661, + "ewc_loss": 0.014393084682524204, + "ewc_loss_diag": 1.0073184967041016e-05, + "ewc_loss_parallel": 4.322283712099306e-05, + "grad_norm": 3.009202718734741, + "learning_rate": 5.281899109792285e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8541214466094971, + "num_tokens": 47542493.0, + "step": 1247 + }, + { + "epoch": 0.15875842768095663, + "ewc_loss": 0.014403634704649448, + "ewc_loss_diag": 1.0073184967041016e-05, + "ewc_loss_parallel": 4.332833850639872e-05, + "grad_norm": 3.055593729019165, + "learning_rate": 5.286138194150064e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8454444408416748, + "num_tokens": 47583378.0, + "step": 1248 + }, + { + "epoch": 0.15888563795954713, + "ewc_loss": 0.014431254006922245, + "ewc_loss_diag": 1.0073184967041016e-05, + "ewc_loss_parallel": 4.360453021945432e-05, + "grad_norm": 3.075422525405884, + "learning_rate": 5.290377278507841e-07, + "loss": 0.5155, + "mean_token_accuracy": 0.8417023420333862, + "num_tokens": 47622671.0, + "step": 1249 + }, + { + "epoch": 0.15901284823813763, + "ewc_loss": 0.014425747096538544, + "ewc_loss_diag": 1.0073184967041016e-05, + "ewc_loss_parallel": 4.354946213425137e-05, + "grad_norm": 3.037907361984253, + "learning_rate": 5.294616362865621e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8521316647529602, + "num_tokens": 47663774.0, + "step": 1250 + }, + { + "epoch": 0.15914005851672816, + "ewc_loss": 0.01441700104624033, + "ewc_loss_diag": 1.0073184967041016e-05, + "ewc_loss_parallel": 4.346200148575008e-05, + "grad_norm": 3.1151058673858643, + "learning_rate": 5.298855447223399e-07, + "loss": 0.5441, + "mean_token_accuracy": 0.836605966091156, + "num_tokens": 47706143.0, + "step": 1251 + }, + { + "epoch": 0.15926726879531866, + "ewc_loss": 0.014516270719468594, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.384434942039661e-05, + "grad_norm": 3.1127631664276123, + "learning_rate": 5.303094531581178e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.8397320508956909, + "num_tokens": 47741197.0, + "step": 1252 + }, + { + "epoch": 0.15939447907390916, + "ewc_loss": 0.014508010819554329, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.37617527495604e-05, + "grad_norm": 3.021815538406372, + "learning_rate": 5.307333615938957e-07, + "loss": 0.429, + "mean_token_accuracy": 0.861477792263031, + "num_tokens": 47779306.0, + "step": 1253 + }, + { + "epoch": 0.1595216893524997, + "ewc_loss": 0.014474453404545784, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.342617103247903e-05, + "grad_norm": 3.172459125518799, + "learning_rate": 5.311572700296736e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.8360962867736816, + "num_tokens": 47811056.0, + "step": 1254 + }, + { + "epoch": 0.1596488996310902, + "ewc_loss": 0.014568019658327103, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.4361840991768986e-05, + "grad_norm": 3.122861623764038, + "learning_rate": 5.315811784654515e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8596723079681396, + "num_tokens": 47847075.0, + "step": 1255 + }, + { + "epoch": 0.1597761099096807, + "ewc_loss": 0.014509819447994232, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.377983350423165e-05, + "grad_norm": 2.9945805072784424, + "learning_rate": 5.320050869012294e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8473615646362305, + "num_tokens": 47892639.0, + "step": 1256 + }, + { + "epoch": 0.15990332018827122, + "ewc_loss": 0.014478340744972229, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.346504283603281e-05, + "grad_norm": 3.0523645877838135, + "learning_rate": 5.324289953370071e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8321732878684998, + "num_tokens": 47933522.0, + "step": 1257 + }, + { + "epoch": 0.16003053046686172, + "ewc_loss": 0.014549965970218182, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.418130265548825e-05, + "grad_norm": 3.0763821601867676, + "learning_rate": 5.328529037727851e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.847344696521759, + "num_tokens": 47977307.0, + "step": 1258 + }, + { + "epoch": 0.16015774074545222, + "ewc_loss": 0.01452934741973877, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.397512020659633e-05, + "grad_norm": 3.106685161590576, + "learning_rate": 5.332768122085629e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.8485512137413025, + "num_tokens": 48015129.0, + "step": 1259 + }, + { + "epoch": 0.16028495102404275, + "ewc_loss": 0.014547237195074558, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.415401053847745e-05, + "grad_norm": 3.147883653640747, + "learning_rate": 5.337007206443408e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8367145657539368, + "num_tokens": 48051904.0, + "step": 1260 + }, + { + "epoch": 0.16041216130263325, + "ewc_loss": 0.014549973420798779, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.4181375415064394e-05, + "grad_norm": 3.4278414249420166, + "learning_rate": 5.341246290801187e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8542183041572571, + "num_tokens": 48094001.0, + "step": 1261 + }, + { + "epoch": 0.16053937158122375, + "ewc_loss": 0.014629533514380455, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.49769759143237e-05, + "grad_norm": 3.044440984725952, + "learning_rate": 5.345485375158966e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8501101732254028, + "num_tokens": 48134124.0, + "step": 1262 + }, + { + "epoch": 0.16066658185981428, + "ewc_loss": 0.014449920505285263, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.31808439316228e-05, + "grad_norm": 3.0675175189971924, + "learning_rate": 5.349724459516745e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8548320531845093, + "num_tokens": 48170730.0, + "step": 1263 + }, + { + "epoch": 0.16079379213840478, + "ewc_loss": 0.014525926671922207, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.394090865389444e-05, + "grad_norm": 3.104102849960327, + "learning_rate": 5.353963543874522e-07, + "loss": 0.521, + "mean_token_accuracy": 0.836312472820282, + "num_tokens": 48212640.0, + "step": 1264 + }, + { + "epoch": 0.1609210024169953, + "ewc_loss": 0.014525575563311577, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.393739436636679e-05, + "grad_norm": 3.1949026584625244, + "learning_rate": 5.358202628232301e-07, + "loss": 0.4972, + "mean_token_accuracy": 0.8427140712738037, + "num_tokens": 48243136.0, + "step": 1265 + }, + { + "epoch": 0.1610482126955858, + "ewc_loss": 0.014562238939106464, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.430402987054549e-05, + "grad_norm": 3.0888798236846924, + "learning_rate": 5.36244171259008e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8476820588111877, + "num_tokens": 48282905.0, + "step": 1266 + }, + { + "epoch": 0.1611754229741763, + "ewc_loss": 0.014510675333440304, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.3788393668364733e-05, + "grad_norm": 3.093096971511841, + "learning_rate": 5.366680796947859e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.8382808566093445, + "num_tokens": 48323325.0, + "step": 1267 + }, + { + "epoch": 0.16130263325276684, + "ewc_loss": 0.01454512495547533, + "ewc_loss_diag": 1.0132789611816406e-05, + "ewc_loss_parallel": 4.413289207150228e-05, + "grad_norm": 3.179184675216675, + "learning_rate": 5.370919881305637e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8460854291915894, + "num_tokens": 48358400.0, + "step": 1268 + }, + { + "epoch": 0.16142984353135734, + "ewc_loss": 0.0146348737180233, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.442002682480961e-05, + "grad_norm": 3.0937743186950684, + "learning_rate": 5.375158965663417e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8442862033843994, + "num_tokens": 48393018.0, + "step": 1269 + }, + { + "epoch": 0.16155705380994784, + "ewc_loss": 0.01458660326898098, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.3937325244769454e-05, + "grad_norm": 3.386221170425415, + "learning_rate": 5.379398050021195e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8466933369636536, + "num_tokens": 48432952.0, + "step": 1270 + }, + { + "epoch": 0.16168426408853837, + "ewc_loss": 0.01471896842122078, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.5260978367878124e-05, + "grad_norm": 3.1507198810577393, + "learning_rate": 5.383637134378975e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8476614952087402, + "num_tokens": 48466913.0, + "step": 1271 + }, + { + "epoch": 0.16181147436712887, + "ewc_loss": 0.014553504064679146, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.3606334656942636e-05, + "grad_norm": 3.02518892288208, + "learning_rate": 5.387876218736752e-07, + "loss": 0.5505, + "mean_token_accuracy": 0.8222644329071045, + "num_tokens": 48512658.0, + "step": 1272 + }, + { + "epoch": 0.16193868464571937, + "ewc_loss": 0.014576872810721397, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.384001294965856e-05, + "grad_norm": 3.016028881072998, + "learning_rate": 5.392115303094531e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8557744026184082, + "num_tokens": 48552283.0, + "step": 1273 + }, + { + "epoch": 0.1620658949243099, + "ewc_loss": 0.014607297256588936, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.414426803123206e-05, + "grad_norm": 3.038607120513916, + "learning_rate": 5.39635438745231e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8629034757614136, + "num_tokens": 48590504.0, + "step": 1274 + }, + { + "epoch": 0.1621931052029004, + "ewc_loss": 0.01461716741323471, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.4242970034247264e-05, + "grad_norm": 3.111536741256714, + "learning_rate": 5.400593471810089e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8353626132011414, + "num_tokens": 48629993.0, + "step": 1275 + }, + { + "epoch": 0.1623203154814909, + "ewc_loss": 0.014712650328874588, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.458743569557555e-05, + "grad_norm": 3.2297468185424805, + "learning_rate": 5.404832556167867e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8359630107879639, + "num_tokens": 48665074.0, + "step": 1276 + }, + { + "epoch": 0.16244752576008142, + "ewc_loss": 0.014750957489013672, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.497051486396231e-05, + "grad_norm": 3.0596444606781006, + "learning_rate": 5.409071640525647e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8505739569664001, + "num_tokens": 48705524.0, + "step": 1277 + }, + { + "epoch": 0.16257473603867192, + "ewc_loss": 0.014674377627670765, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.420471304911189e-05, + "grad_norm": 3.1359667778015137, + "learning_rate": 5.413310724883425e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8270634412765503, + "num_tokens": 48744030.0, + "step": 1278 + }, + { + "epoch": 0.16270194631726242, + "ewc_loss": 0.014693608507514, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.5007371227256954e-05, + "grad_norm": 3.047914505004883, + "learning_rate": 5.417549809241205e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8399524688720703, + "num_tokens": 48788401.0, + "step": 1279 + }, + { + "epoch": 0.16282915659585295, + "ewc_loss": 0.014637259766459465, + "ewc_loss_diag": 1.0192394256591797e-05, + "ewc_loss_parallel": 4.444388105184771e-05, + "grad_norm": 3.179575204849243, + "learning_rate": 5.421788893598982e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8357712030410767, + "num_tokens": 48823046.0, + "step": 1280 + }, + { + "epoch": 0.16295636687444345, + "ewc_loss": 0.014774061739444733, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.5201559260021895e-05, + "grad_norm": 3.040778636932373, + "learning_rate": 5.42602797795676e-07, + "loss": 0.5249, + "mean_token_accuracy": 0.831935703754425, + "num_tokens": 48868293.0, + "step": 1281 + }, + { + "epoch": 0.16308357715303395, + "ewc_loss": 0.014698782935738564, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.444876321940683e-05, + "grad_norm": 3.11337947845459, + "learning_rate": 5.43026706231454e-07, + "loss": 0.5523, + "mean_token_accuracy": 0.8247815370559692, + "num_tokens": 48908775.0, + "step": 1282 + }, + { + "epoch": 0.16321078743162448, + "ewc_loss": 0.014763331972062588, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.5094257075106725e-05, + "grad_norm": 3.090606212615967, + "learning_rate": 5.434506146672319e-07, + "loss": 0.5303, + "mean_token_accuracy": 0.8313065767288208, + "num_tokens": 48957640.0, + "step": 1283 + }, + { + "epoch": 0.16333799771021498, + "ewc_loss": 0.014735081233084202, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.481174983084202e-05, + "grad_norm": 3.1583027839660645, + "learning_rate": 5.438745231030097e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8312674164772034, + "num_tokens": 48993601.0, + "step": 1284 + }, + { + "epoch": 0.16346520798880548, + "ewc_loss": 0.01478432398289442, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.530417572823353e-05, + "grad_norm": 3.144049882888794, + "learning_rate": 5.442984315387876e-07, + "loss": 0.4268, + "mean_token_accuracy": 0.861539363861084, + "num_tokens": 49028411.0, + "step": 1285 + }, + { + "epoch": 0.163592418267396, + "ewc_loss": 0.014761561527848244, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.5076558308210224e-05, + "grad_norm": 3.0644798278808594, + "learning_rate": 5.447223399745655e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8474611043930054, + "num_tokens": 49067449.0, + "step": 1286 + }, + { + "epoch": 0.1637196285459865, + "ewc_loss": 0.0147396856918931, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.485779209062457e-05, + "grad_norm": 3.154088020324707, + "learning_rate": 5.451462484103433e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8515702486038208, + "num_tokens": 49102650.0, + "step": 1287 + }, + { + "epoch": 0.163846838824577, + "ewc_loss": 0.014793544076383114, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.539638030109927e-05, + "grad_norm": 3.1923158168792725, + "learning_rate": 5.455701568461212e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8434791564941406, + "num_tokens": 49137110.0, + "step": 1288 + }, + { + "epoch": 0.16397404910316754, + "ewc_loss": 0.014785269275307655, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.531363083515316e-05, + "grad_norm": 3.1942367553710938, + "learning_rate": 5.45994065281899e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.8197985887527466, + "num_tokens": 49177765.0, + "step": 1289 + }, + { + "epoch": 0.16410125938175804, + "ewc_loss": 0.014784792438149452, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.530885780695826e-05, + "grad_norm": 3.1904218196868896, + "learning_rate": 5.46417973717677e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.8383246660232544, + "num_tokens": 49207860.0, + "step": 1290 + }, + { + "epoch": 0.16422846966034857, + "ewc_loss": 0.014783362857997417, + "ewc_loss_diag": 1.0251998901367188e-05, + "ewc_loss_parallel": 4.5294564188225195e-05, + "grad_norm": 3.206233024597168, + "learning_rate": 5.468418821534548e-07, + "loss": 0.4359, + "mean_token_accuracy": 0.8599711656570435, + "num_tokens": 49239994.0, + "step": 1291 + }, + { + "epoch": 0.16435567993893907, + "ewc_loss": 0.014914902858436108, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.538926077657379e-05, + "grad_norm": 3.165271282196045, + "learning_rate": 5.472657905892327e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8565306067466736, + "num_tokens": 49273464.0, + "step": 1292 + }, + { + "epoch": 0.16448289021752957, + "ewc_loss": 0.014904236420989037, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.528259887592867e-05, + "grad_norm": 3.1233654022216797, + "learning_rate": 5.476896990250106e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8219500780105591, + "num_tokens": 49313144.0, + "step": 1293 + }, + { + "epoch": 0.1646101004961201, + "ewc_loss": 0.01490301638841629, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.527039345703088e-05, + "grad_norm": 3.046067237854004, + "learning_rate": 5.481136074607885e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8524261713027954, + "num_tokens": 49351392.0, + "step": 1294 + }, + { + "epoch": 0.1647373107747106, + "ewc_loss": 0.014882706105709076, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.5067299652146176e-05, + "grad_norm": 3.0457122325897217, + "learning_rate": 5.485375158965663e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8551501035690308, + "num_tokens": 49396726.0, + "step": 1295 + }, + { + "epoch": 0.1648645210533011, + "ewc_loss": 0.014922302216291428, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.5463260903488845e-05, + "grad_norm": 3.094633102416992, + "learning_rate": 5.489614243323442e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.843716561794281, + "num_tokens": 49435856.0, + "step": 1296 + }, + { + "epoch": 0.16499173133189163, + "ewc_loss": 0.014948302879929543, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.572326361085288e-05, + "grad_norm": 3.054690361022949, + "learning_rate": 5.49385332768122e-07, + "loss": 0.4272, + "mean_token_accuracy": 0.8601807951927185, + "num_tokens": 49473695.0, + "step": 1297 + }, + { + "epoch": 0.16511894161048213, + "ewc_loss": 0.01492651179432869, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.55053559562657e-05, + "grad_norm": 3.1641063690185547, + "learning_rate": 5.498092412039e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.8301904797554016, + "num_tokens": 49509165.0, + "step": 1298 + }, + { + "epoch": 0.16524615188907263, + "ewc_loss": 0.015002837404608727, + "ewc_loss_diag": 1.0371208190917969e-05, + "ewc_loss_parallel": 4.626861118595116e-05, + "grad_norm": 3.0117104053497314, + "learning_rate": 5.502331496396778e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8520980477333069, + "num_tokens": 49553790.0, + "step": 1299 + }, + { + "epoch": 0.16537336216766316, + "ewc_loss": 0.01499713584780693, + "ewc_loss_diag": 1.043081283569336e-05, + "ewc_loss_parallel": 4.5601245801663026e-05, + "grad_norm": 3.1072299480438232, + "learning_rate": 5.506570580754557e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8454785943031311, + "num_tokens": 49593124.0, + "step": 1300 + }, + { + "epoch": 0.16550057244625366, + "ewc_loss": 0.015071731992065907, + "ewc_loss_diag": 1.043081283569336e-05, + "ewc_loss_parallel": 4.634720244212076e-05, + "grad_norm": 3.2424445152282715, + "learning_rate": 5.510809665112336e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8530804514884949, + "num_tokens": 49632016.0, + "step": 1301 + }, + { + "epoch": 0.16562778272484416, + "ewc_loss": 0.015100246295332909, + "ewc_loss_diag": 1.043081283569336e-05, + "ewc_loss_parallel": 4.663234722102061e-05, + "grad_norm": 3.083167791366577, + "learning_rate": 5.515048749470113e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8470368981361389, + "num_tokens": 49670689.0, + "step": 1302 + }, + { + "epoch": 0.1657549930034347, + "ewc_loss": 0.015025789849460125, + "ewc_loss_diag": 1.043081283569336e-05, + "ewc_loss_parallel": 4.5887780288467184e-05, + "grad_norm": 3.124107599258423, + "learning_rate": 5.519287833827893e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8488781452178955, + "num_tokens": 49709230.0, + "step": 1303 + }, + { + "epoch": 0.1658822032820252, + "ewc_loss": 0.015081724151968956, + "ewc_loss_diag": 1.043081283569336e-05, + "ewc_loss_parallel": 4.6447119530057535e-05, + "grad_norm": 3.2725167274475098, + "learning_rate": 5.523526918185671e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8323001861572266, + "num_tokens": 49741648.0, + "step": 1304 + }, + { + "epoch": 0.1660094135606157, + "ewc_loss": 0.015134179964661598, + "ewc_loss_diag": 1.043081283569336e-05, + "ewc_loss_parallel": 4.697167605627328e-05, + "grad_norm": 3.2229223251342773, + "learning_rate": 5.52776600254345e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8533427715301514, + "num_tokens": 49773906.0, + "step": 1305 + }, + { + "epoch": 0.16613662383920622, + "ewc_loss": 0.015132243745028973, + "ewc_loss_diag": 1.049041748046875e-05, + "ewc_loss_parallel": 4.6341967390617356e-05, + "grad_norm": 3.255305767059326, + "learning_rate": 5.532005086901229e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8441376686096191, + "num_tokens": 49812645.0, + "step": 1306 + }, + { + "epoch": 0.16626383411779672, + "ewc_loss": 0.01514936052262783, + "ewc_loss_diag": 1.049041748046875e-05, + "ewc_loss_parallel": 4.6513130655512214e-05, + "grad_norm": 3.0618278980255127, + "learning_rate": 5.536244171259008e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8504605293273926, + "num_tokens": 49854500.0, + "step": 1307 + }, + { + "epoch": 0.16639104439638722, + "ewc_loss": 0.015063541010022163, + "ewc_loss_diag": 1.049041748046875e-05, + "ewc_loss_parallel": 4.5654935092898086e-05, + "grad_norm": 3.3340296745300293, + "learning_rate": 5.540483255616786e-07, + "loss": 0.5656, + "mean_token_accuracy": 0.8227949142456055, + "num_tokens": 49889408.0, + "step": 1308 + }, + { + "epoch": 0.16651825467497774, + "ewc_loss": 0.015220575034618378, + "ewc_loss_diag": 1.049041748046875e-05, + "ewc_loss_parallel": 4.722527592093684e-05, + "grad_norm": 3.196850299835205, + "learning_rate": 5.544722339974566e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8451218605041504, + "num_tokens": 49923581.0, + "step": 1309 + }, + { + "epoch": 0.16664546495356825, + "ewc_loss": 0.015095239505171776, + "ewc_loss_diag": 1.049041748046875e-05, + "ewc_loss_parallel": 4.5971926738275215e-05, + "grad_norm": 3.102173328399658, + "learning_rate": 5.548961424332343e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8458511829376221, + "num_tokens": 49959433.0, + "step": 1310 + }, + { + "epoch": 0.16677267523215875, + "ewc_loss": 0.015149164944887161, + "ewc_loss_diag": 1.055002212524414e-05, + "ewc_loss_parallel": 4.590083335642703e-05, + "grad_norm": 3.149948835372925, + "learning_rate": 5.553200508690123e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8593633770942688, + "num_tokens": 50000307.0, + "step": 1311 + }, + { + "epoch": 0.16689988551074927, + "ewc_loss": 0.01514548808336258, + "ewc_loss_diag": 1.049041748046875e-05, + "ewc_loss_parallel": 4.647440800908953e-05, + "grad_norm": 3.214815139770508, + "learning_rate": 5.557439593047901e-07, + "loss": 0.5375, + "mean_token_accuracy": 0.8282520771026611, + "num_tokens": 50035587.0, + "step": 1312 + }, + { + "epoch": 0.16702709578933977, + "ewc_loss": 0.015200534835457802, + "ewc_loss_diag": 1.055002212524414e-05, + "ewc_loss_parallel": 4.641452323994599e-05, + "grad_norm": 3.0648818016052246, + "learning_rate": 5.56167867740568e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8479503393173218, + "num_tokens": 50079425.0, + "step": 1313 + }, + { + "epoch": 0.16715430606793028, + "ewc_loss": 0.01515001431107521, + "ewc_loss_diag": 1.055002212524414e-05, + "ewc_loss_parallel": 4.590931712300517e-05, + "grad_norm": 3.2031025886535645, + "learning_rate": 5.565917761763459e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.853099524974823, + "num_tokens": 50110674.0, + "step": 1314 + }, + { + "epoch": 0.1672815163465208, + "ewc_loss": 0.015305910259485245, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.685793101089075e-05, + "grad_norm": 3.1287343502044678, + "learning_rate": 5.570156846121238e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8412359952926636, + "num_tokens": 50149033.0, + "step": 1315 + }, + { + "epoch": 0.1674087266251113, + "ewc_loss": 0.015243466943502426, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.6233501052483916e-05, + "grad_norm": 3.172598361968994, + "learning_rate": 5.574395930479016e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8362191319465637, + "num_tokens": 50185728.0, + "step": 1316 + }, + { + "epoch": 0.16753593690370183, + "ewc_loss": 0.01528768241405487, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.667565372074023e-05, + "grad_norm": 3.189234495162964, + "learning_rate": 5.578635014836796e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8512492179870605, + "num_tokens": 50219671.0, + "step": 1317 + }, + { + "epoch": 0.16766314718229233, + "ewc_loss": 0.015280707739293575, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.6605906391050667e-05, + "grad_norm": 3.328474521636963, + "learning_rate": 5.582874099194573e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8359026908874512, + "num_tokens": 50251989.0, + "step": 1318 + }, + { + "epoch": 0.16779035746088283, + "ewc_loss": 0.015338234603404999, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.7181172703858465e-05, + "grad_norm": 3.2356104850769043, + "learning_rate": 5.587113183552353e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8372553586959839, + "num_tokens": 50291079.0, + "step": 1319 + }, + { + "epoch": 0.16791756773947336, + "ewc_loss": 0.01526360772550106, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.6434899559244514e-05, + "grad_norm": 3.0817439556121826, + "learning_rate": 5.591352267910131e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8499664664268494, + "num_tokens": 50332262.0, + "step": 1320 + }, + { + "epoch": 0.16804477801806386, + "ewc_loss": 0.015246582217514515, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.6264649427030236e-05, + "grad_norm": 3.1717441082000732, + "learning_rate": 5.59559135226791e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8437778949737549, + "num_tokens": 50371278.0, + "step": 1321 + }, + { + "epoch": 0.16817198829665436, + "ewc_loss": 0.015330799855291843, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.710682696895674e-05, + "grad_norm": 3.213482618331909, + "learning_rate": 5.599830436625689e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8493512868881226, + "num_tokens": 50409463.0, + "step": 1322 + }, + { + "epoch": 0.1682991985752449, + "ewc_loss": 0.01530240848660469, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.682291182689369e-05, + "grad_norm": 3.121927261352539, + "learning_rate": 5.604069520983468e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8519389033317566, + "num_tokens": 50446745.0, + "step": 1323 + }, + { + "epoch": 0.1684264088538354, + "ewc_loss": 0.015283772721886635, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.66365490865428e-05, + "grad_norm": 3.2184197902679443, + "learning_rate": 5.608308605341246e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8503401875495911, + "num_tokens": 50479725.0, + "step": 1324 + }, + { + "epoch": 0.1685536191324259, + "ewc_loss": 0.015339275822043419, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.7191591875161976e-05, + "grad_norm": 3.3498098850250244, + "learning_rate": 5.612547689699024e-07, + "loss": 0.5249, + "mean_token_accuracy": 0.8331277370452881, + "num_tokens": 50514734.0, + "step": 1325 + }, + { + "epoch": 0.16868082941101642, + "ewc_loss": 0.015371188521385193, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.751071537612006e-05, + "grad_norm": 3.2750422954559326, + "learning_rate": 5.616786774056803e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8423302173614502, + "num_tokens": 50555165.0, + "step": 1326 + }, + { + "epoch": 0.16880803968960692, + "ewc_loss": 0.015310753136873245, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.690635614679195e-05, + "grad_norm": 3.0935308933258057, + "learning_rate": 5.621025858414582e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.8391126990318298, + "num_tokens": 50598251.0, + "step": 1327 + }, + { + "epoch": 0.16893524996819742, + "ewc_loss": 0.015263045206665993, + "ewc_loss_diag": 1.0609626770019531e-05, + "ewc_loss_parallel": 4.6429282519966364e-05, + "grad_norm": 3.0799217224121094, + "learning_rate": 5.625264942772361e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.8436193466186523, + "num_tokens": 50638423.0, + "step": 1328 + }, + { + "epoch": 0.16906246024678795, + "ewc_loss": 0.015368455089628696, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.687302862294018e-05, + "grad_norm": 3.1746034622192383, + "learning_rate": 5.629504027130139e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8337963223457336, + "num_tokens": 50678226.0, + "step": 1329 + }, + { + "epoch": 0.16918967052537845, + "ewc_loss": 0.015415048226714134, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.733895548270084e-05, + "grad_norm": 3.13959002494812, + "learning_rate": 5.633743111487919e-07, + "loss": 0.5541, + "mean_token_accuracy": 0.8230895400047302, + "num_tokens": 50717542.0, + "step": 1330 + }, + { + "epoch": 0.16931688080396895, + "ewc_loss": 0.01537902932614088, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.697877011494711e-05, + "grad_norm": 3.2039105892181396, + "learning_rate": 5.637982195845697e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.859794020652771, + "num_tokens": 50752914.0, + "step": 1331 + }, + { + "epoch": 0.16944409108255948, + "ewc_loss": 0.015424035489559174, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.742882811115123e-05, + "grad_norm": 3.1058459281921387, + "learning_rate": 5.642221280203476e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8628172278404236, + "num_tokens": 50789276.0, + "step": 1332 + }, + { + "epoch": 0.16957130136114998, + "ewc_loss": 0.015370731242001057, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.689578781835735e-05, + "grad_norm": 3.26366925239563, + "learning_rate": 5.646460364561254e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8530330657958984, + "num_tokens": 50820759.0, + "step": 1333 + }, + { + "epoch": 0.16969851163974048, + "ewc_loss": 0.015474919229745865, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.793767220689915e-05, + "grad_norm": 3.137742042541504, + "learning_rate": 5.650699448919033e-07, + "loss": 0.5453, + "mean_token_accuracy": 0.8275371193885803, + "num_tokens": 50863241.0, + "step": 1334 + }, + { + "epoch": 0.169825721918331, + "ewc_loss": 0.015396174043416977, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.715022078016773e-05, + "grad_norm": 3.2016842365264893, + "learning_rate": 5.654938533276812e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8443161249160767, + "num_tokens": 50904296.0, + "step": 1335 + }, + { + "epoch": 0.1699529321969215, + "ewc_loss": 0.015451423823833466, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.770272062160075e-05, + "grad_norm": 3.16192364692688, + "learning_rate": 5.659177617634591e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.850556492805481, + "num_tokens": 50942180.0, + "step": 1336 + }, + { + "epoch": 0.170080142475512, + "ewc_loss": 0.015422802418470383, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.7416502638952807e-05, + "grad_norm": 3.1431729793548584, + "learning_rate": 5.663416701992369e-07, + "loss": 0.4963, + "mean_token_accuracy": 0.8431616425514221, + "num_tokens": 50980287.0, + "step": 1337 + }, + { + "epoch": 0.17020735275410254, + "ewc_loss": 0.01541934534907341, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.73819236503914e-05, + "grad_norm": 3.220381736755371, + "learning_rate": 5.667655786350149e-07, + "loss": 0.5576, + "mean_token_accuracy": 0.8177272081375122, + "num_tokens": 51014218.0, + "step": 1338 + }, + { + "epoch": 0.17033456303269304, + "ewc_loss": 0.015469105914235115, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.787953002960421e-05, + "grad_norm": 3.084930419921875, + "learning_rate": 5.671894870707927e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.8508941531181335, + "num_tokens": 51055915.0, + "step": 1339 + }, + { + "epoch": 0.17046177331128357, + "ewc_loss": 0.015401950106024742, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.720797369373031e-05, + "grad_norm": 3.129735231399536, + "learning_rate": 5.676133955065705e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8634214401245117, + "num_tokens": 51090684.0, + "step": 1340 + }, + { + "epoch": 0.17058898358987407, + "ewc_loss": 0.015470998361706734, + "ewc_loss_diag": 1.0669231414794922e-05, + "ewc_loss_parallel": 4.789845843333751e-05, + "grad_norm": 3.129868507385254, + "learning_rate": 5.680373039423484e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8510708212852478, + "num_tokens": 51130156.0, + "step": 1341 + }, + { + "epoch": 0.17071619386846457, + "ewc_loss": 0.01551876775920391, + "ewc_loss_diag": 1.0728836059570312e-05, + "ewc_loss_parallel": 4.7765806812094525e-05, + "grad_norm": 3.2082021236419678, + "learning_rate": 5.684612123781263e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8375019431114197, + "num_tokens": 51171282.0, + "step": 1342 + }, + { + "epoch": 0.1708434041470551, + "ewc_loss": 0.015559443272650242, + "ewc_loss_diag": 1.0728836059570312e-05, + "ewc_loss_parallel": 4.817255830857903e-05, + "grad_norm": 3.194326877593994, + "learning_rate": 5.688851208139042e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8321266174316406, + "num_tokens": 51210616.0, + "step": 1343 + }, + { + "epoch": 0.1709706144256456, + "ewc_loss": 0.015588047914206982, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.784825068782084e-05, + "grad_norm": 3.158040761947632, + "learning_rate": 5.69309029249682e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8438786864280701, + "num_tokens": 51253357.0, + "step": 1344 + }, + { + "epoch": 0.1710978247042361, + "ewc_loss": 0.015588469803333282, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.785247438121587e-05, + "grad_norm": 3.1542255878448486, + "learning_rate": 5.697329376854599e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8443677425384521, + "num_tokens": 51292671.0, + "step": 1345 + }, + { + "epoch": 0.17122503498282662, + "ewc_loss": 0.015593483112752438, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.7902605729177594e-05, + "grad_norm": 3.176145076751709, + "learning_rate": 5.701568461212378e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8628808259963989, + "num_tokens": 51329145.0, + "step": 1346 + }, + { + "epoch": 0.17135224526141712, + "ewc_loss": 0.015615446493029594, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.812224142369814e-05, + "grad_norm": 3.1144373416900635, + "learning_rate": 5.705807545570157e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.8582344055175781, + "num_tokens": 51371265.0, + "step": 1347 + }, + { + "epoch": 0.17147945554000762, + "ewc_loss": 0.015580566599965096, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.7773435653652996e-05, + "grad_norm": 3.1574196815490723, + "learning_rate": 5.710046629927934e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8469988107681274, + "num_tokens": 51410262.0, + "step": 1348 + }, + { + "epoch": 0.17160666581859815, + "ewc_loss": 0.015631787478923798, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.8285655793733895e-05, + "grad_norm": 3.1855673789978027, + "learning_rate": 5.714285714285714e-07, + "loss": 0.447, + "mean_token_accuracy": 0.8569323420524597, + "num_tokens": 51447796.0, + "step": 1349 + }, + { + "epoch": 0.17173387609718865, + "ewc_loss": 0.015626484528183937, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.823262133868411e-05, + "grad_norm": 3.3336246013641357, + "learning_rate": 5.718524798643492e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8385246992111206, + "num_tokens": 51482677.0, + "step": 1350 + }, + { + "epoch": 0.17186108637577915, + "ewc_loss": 0.01568034291267395, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.8771205911180004e-05, + "grad_norm": 3.166761636734009, + "learning_rate": 5.722763883001272e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8389499187469482, + "num_tokens": 51524111.0, + "step": 1351 + }, + { + "epoch": 0.17198829665436968, + "ewc_loss": 0.01558070257306099, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.777479261974804e-05, + "grad_norm": 3.1886534690856934, + "learning_rate": 5.72700296735905e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8447879552841187, + "num_tokens": 51561008.0, + "step": 1352 + }, + { + "epoch": 0.17211550693296018, + "ewc_loss": 0.015698743984103203, + "ewc_loss_diag": 1.0848045349121094e-05, + "ewc_loss_parallel": 4.8344860260840505e-05, + "grad_norm": 3.1514134407043457, + "learning_rate": 5.731242051716829e-07, + "loss": 0.5385, + "mean_token_accuracy": 0.8265039324760437, + "num_tokens": 51603091.0, + "step": 1353 + }, + { + "epoch": 0.17224271721155068, + "ewc_loss": 0.01567883789539337, + "ewc_loss_diag": 1.0848045349121094e-05, + "ewc_loss_parallel": 4.814579733647406e-05, + "grad_norm": 3.198005437850952, + "learning_rate": 5.735481136074608e-07, + "loss": 0.4537, + "mean_token_accuracy": 0.8530038595199585, + "num_tokens": 51637789.0, + "step": 1354 + }, + { + "epoch": 0.1723699274901412, + "ewc_loss": 0.01570092886686325, + "ewc_loss_diag": 1.0848045349121094e-05, + "ewc_loss_parallel": 4.8366713599534705e-05, + "grad_norm": 3.2291672229766846, + "learning_rate": 5.739720220432386e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8503297567367554, + "num_tokens": 51674207.0, + "step": 1355 + }, + { + "epoch": 0.1724971377687317, + "ewc_loss": 0.015644272789359093, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.8410496674478054e-05, + "grad_norm": 3.1720950603485107, + "learning_rate": 5.743959304790164e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.8276189565658569, + "num_tokens": 51715109.0, + "step": 1356 + }, + { + "epoch": 0.1726243480473222, + "ewc_loss": 0.015633728355169296, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.8305068048648536e-05, + "grad_norm": 3.2163033485412598, + "learning_rate": 5.748198389147944e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8434287309646606, + "num_tokens": 51753373.0, + "step": 1357 + }, + { + "epoch": 0.17275155832591274, + "ewc_loss": 0.01565936952829361, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.856146188103594e-05, + "grad_norm": 3.1791837215423584, + "learning_rate": 5.752437473505722e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8338770270347595, + "num_tokens": 51795100.0, + "step": 1358 + }, + { + "epoch": 0.17287876860450324, + "ewc_loss": 0.01564452424645424, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.8413010517833754e-05, + "grad_norm": 3.2048017978668213, + "learning_rate": 5.756676557863502e-07, + "loss": 0.5111, + "mean_token_accuracy": 0.8369350433349609, + "num_tokens": 51831840.0, + "step": 1359 + }, + { + "epoch": 0.17300597888309374, + "ewc_loss": 0.015667444095015526, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.864221773459576e-05, + "grad_norm": 3.125608205795288, + "learning_rate": 5.76091564222128e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8525323867797852, + "num_tokens": 51870227.0, + "step": 1360 + }, + { + "epoch": 0.17313318916168427, + "ewc_loss": 0.01565595343708992, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.8527315811952576e-05, + "grad_norm": 3.2705934047698975, + "learning_rate": 5.765154726579059e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8451974987983704, + "num_tokens": 51908251.0, + "step": 1361 + }, + { + "epoch": 0.17326039944027477, + "ewc_loss": 0.015723848715424538, + "ewc_loss_diag": 1.0788440704345703e-05, + "ewc_loss_parallel": 4.9206257244804874e-05, + "grad_norm": 3.2143795490264893, + "learning_rate": 5.769393810936838e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8469268083572388, + "num_tokens": 51944531.0, + "step": 1362 + }, + { + "epoch": 0.17338760971886527, + "ewc_loss": 0.015790825709700584, + "ewc_loss_diag": 1.0907649993896484e-05, + "ewc_loss_parallel": 4.8655321734258905e-05, + "grad_norm": 3.1732864379882812, + "learning_rate": 5.773632895294616e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.8308275938034058, + "num_tokens": 51985742.0, + "step": 1363 + }, + { + "epoch": 0.1735148199974558, + "ewc_loss": 0.015795759856700897, + "ewc_loss_diag": 1.0907649993896484e-05, + "ewc_loss_parallel": 4.870467819273472e-05, + "grad_norm": 3.1843268871307373, + "learning_rate": 5.777871979652394e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8501267433166504, + "num_tokens": 52022036.0, + "step": 1364 + }, + { + "epoch": 0.1736420302760463, + "ewc_loss": 0.01582525297999382, + "ewc_loss_diag": 1.0907649993896484e-05, + "ewc_loss_parallel": 4.8999594582710415e-05, + "grad_norm": 3.1317012310028076, + "learning_rate": 5.782111064010173e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.842107892036438, + "num_tokens": 52066077.0, + "step": 1365 + }, + { + "epoch": 0.17376924055463683, + "ewc_loss": 0.01580127142369747, + "ewc_loss_diag": 1.0907649993896484e-05, + "ewc_loss_parallel": 4.8759782657725736e-05, + "grad_norm": 3.158396005630493, + "learning_rate": 5.786350148367952e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8463597297668457, + "num_tokens": 52111280.0, + "step": 1366 + }, + { + "epoch": 0.17389645083322733, + "ewc_loss": 0.01589713618159294, + "ewc_loss_diag": 1.0967254638671875e-05, + "ewc_loss_parallel": 4.910807547275908e-05, + "grad_norm": 3.174262523651123, + "learning_rate": 5.790589232725731e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8445292711257935, + "num_tokens": 52153148.0, + "step": 1367 + }, + { + "epoch": 0.17402366111181783, + "ewc_loss": 0.01587684452533722, + "ewc_loss_diag": 1.0967254638671875e-05, + "ewc_loss_parallel": 4.890516720479354e-05, + "grad_norm": 3.2444262504577637, + "learning_rate": 5.79482831708351e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8498974442481995, + "num_tokens": 52186722.0, + "step": 1368 + }, + { + "epoch": 0.17415087139040836, + "ewc_loss": 0.015925157815217972, + "ewc_loss_diag": 1.0967254638671875e-05, + "ewc_loss_parallel": 4.938829442835413e-05, + "grad_norm": 3.3164103031158447, + "learning_rate": 5.799067401441288e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8454914093017578, + "num_tokens": 52220837.0, + "step": 1369 + }, + { + "epoch": 0.17427808166899886, + "ewc_loss": 0.015938911586999893, + "ewc_loss_diag": 1.0967254638671875e-05, + "ewc_loss_parallel": 4.952582821715623e-05, + "grad_norm": 3.20504093170166, + "learning_rate": 5.803306485799068e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8536348342895508, + "num_tokens": 52257049.0, + "step": 1370 + }, + { + "epoch": 0.17440529194758936, + "ewc_loss": 0.015869265422225, + "ewc_loss_diag": 1.0967254638671875e-05, + "ewc_loss_parallel": 4.88293771923054e-05, + "grad_norm": 3.1516146659851074, + "learning_rate": 5.807545570156845e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.85779869556427, + "num_tokens": 52297632.0, + "step": 1371 + }, + { + "epoch": 0.1745325022261799, + "ewc_loss": 0.015881488099694252, + "ewc_loss_diag": 1.0967254638671875e-05, + "ewc_loss_parallel": 4.8951598728308454e-05, + "grad_norm": 3.335508108139038, + "learning_rate": 5.811784654514624e-07, + "loss": 0.5369, + "mean_token_accuracy": 0.8298963308334351, + "num_tokens": 52331008.0, + "step": 1372 + }, + { + "epoch": 0.1746597125047704, + "ewc_loss": 0.015972500666975975, + "ewc_loss_diag": 1.0967254638671875e-05, + "ewc_loss_parallel": 4.98617191624362e-05, + "grad_norm": 3.150634527206421, + "learning_rate": 5.816023738872403e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.8513772487640381, + "num_tokens": 52369420.0, + "step": 1373 + }, + { + "epoch": 0.1747869227833609, + "ewc_loss": 0.015915971249341965, + "ewc_loss_diag": 1.1026859283447266e-05, + "ewc_loss_parallel": 4.868608448305167e-05, + "grad_norm": 3.197770833969116, + "learning_rate": 5.820262823230182e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8416388034820557, + "num_tokens": 52411759.0, + "step": 1374 + }, + { + "epoch": 0.17491413306195142, + "ewc_loss": 0.01599240116775036, + "ewc_loss_diag": 1.1026859283447266e-05, + "ewc_loss_parallel": 4.9450383812654763e-05, + "grad_norm": 3.3023204803466797, + "learning_rate": 5.824501907587961e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.852459192276001, + "num_tokens": 52446459.0, + "step": 1375 + }, + { + "epoch": 0.17504134334054192, + "ewc_loss": 0.01601666957139969, + "ewc_loss_diag": 1.1026859283447266e-05, + "ewc_loss_parallel": 4.969306610291824e-05, + "grad_norm": 3.1914455890655518, + "learning_rate": 5.82874099194574e-07, + "loss": 0.544, + "mean_token_accuracy": 0.827163815498352, + "num_tokens": 52484615.0, + "step": 1376 + }, + { + "epoch": 0.17516855361913242, + "ewc_loss": 0.015948647633194923, + "ewc_loss_diag": 1.1026859283447266e-05, + "ewc_loss_parallel": 4.901284410152584e-05, + "grad_norm": 3.134716510772705, + "learning_rate": 5.832980076303518e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.8236509561538696, + "num_tokens": 52525195.0, + "step": 1377 + }, + { + "epoch": 0.17529576389772294, + "ewc_loss": 0.01597229391336441, + "ewc_loss_diag": 1.1026859283447266e-05, + "ewc_loss_parallel": 4.924930181005038e-05, + "grad_norm": 3.258354663848877, + "learning_rate": 5.837219160661297e-07, + "loss": 0.44, + "mean_token_accuracy": 0.857669472694397, + "num_tokens": 52558783.0, + "step": 1378 + }, + { + "epoch": 0.17542297417631345, + "ewc_loss": 0.016098909080028534, + "ewc_loss_diag": 1.1086463928222656e-05, + "ewc_loss_parallel": 4.9905098421731964e-05, + "grad_norm": 3.1506412029266357, + "learning_rate": 5.841458245019075e-07, + "loss": 0.4343, + "mean_token_accuracy": 0.8585200309753418, + "num_tokens": 52599176.0, + "step": 1379 + }, + { + "epoch": 0.17555018445490395, + "ewc_loss": 0.016025464981794357, + "ewc_loss_diag": 1.1086463928222656e-05, + "ewc_loss_parallel": 4.917066326015629e-05, + "grad_norm": 3.306321382522583, + "learning_rate": 5.845697329376855e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8470500111579895, + "num_tokens": 52630865.0, + "step": 1380 + }, + { + "epoch": 0.17567739473349447, + "ewc_loss": 0.016125531867146492, + "ewc_loss_diag": 1.1086463928222656e-05, + "ewc_loss_parallel": 5.017133298679255e-05, + "grad_norm": 3.2222347259521484, + "learning_rate": 5.849936413734633e-07, + "loss": 0.5425, + "mean_token_accuracy": 0.8277870416641235, + "num_tokens": 52671702.0, + "step": 1381 + }, + { + "epoch": 0.17580460501208497, + "ewc_loss": 0.01611793041229248, + "ewc_loss_diag": 1.1146068572998047e-05, + "ewc_loss_parallel": 4.948497371515259e-05, + "grad_norm": 3.149789571762085, + "learning_rate": 5.854175498092412e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8571757674217224, + "num_tokens": 52712288.0, + "step": 1382 + }, + { + "epoch": 0.17593181529067548, + "ewc_loss": 0.016194023191928864, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 4.9635553295956925e-05, + "grad_norm": 3.1651511192321777, + "learning_rate": 5.858414582450191e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8454356789588928, + "num_tokens": 52756569.0, + "step": 1383 + }, + { + "epoch": 0.176059025569266, + "ewc_loss": 0.01622079685330391, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 4.990327579434961e-05, + "grad_norm": 3.199697494506836, + "learning_rate": 5.86265366680797e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8383734822273254, + "num_tokens": 52796195.0, + "step": 1384 + }, + { + "epoch": 0.1761862358478565, + "ewc_loss": 0.016224725171923637, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 4.9942562327487394e-05, + "grad_norm": 3.148937702178955, + "learning_rate": 5.866892751165748e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.847622811794281, + "num_tokens": 52839687.0, + "step": 1385 + }, + { + "epoch": 0.176313446126447, + "ewc_loss": 0.01620566099882126, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 4.975192860001698e-05, + "grad_norm": 3.298469305038452, + "learning_rate": 5.871131835523526e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8295401334762573, + "num_tokens": 52875660.0, + "step": 1386 + }, + { + "epoch": 0.17644065640503753, + "ewc_loss": 0.016300512477755547, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 5.070043698651716e-05, + "grad_norm": 3.150785207748413, + "learning_rate": 5.875370919881305e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8419914245605469, + "num_tokens": 52921220.0, + "step": 1387 + }, + { + "epoch": 0.17656786668362803, + "ewc_loss": 0.01619746722280979, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 4.9669990403344855e-05, + "grad_norm": 3.2801198959350586, + "learning_rate": 5.879610004239084e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8309175968170166, + "num_tokens": 52960042.0, + "step": 1388 + }, + { + "epoch": 0.17669507696221853, + "ewc_loss": 0.016301961615681648, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 5.0714923418127e-05, + "grad_norm": 3.2739338874816895, + "learning_rate": 5.883849088596863e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.8338138461112976, + "num_tokens": 53002817.0, + "step": 1389 + }, + { + "epoch": 0.17682228724080906, + "ewc_loss": 0.016251930966973305, + "ewc_loss_diag": 1.1205673217773438e-05, + "ewc_loss_parallel": 5.021462493459694e-05, + "grad_norm": 3.265519857406616, + "learning_rate": 5.888088172954641e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8415738344192505, + "num_tokens": 53037751.0, + "step": 1390 + }, + { + "epoch": 0.17694949751939956, + "ewc_loss": 0.016322795301675797, + "ewc_loss_diag": 1.1265277862548828e-05, + "ewc_loss_parallel": 5.0312904932070524e-05, + "grad_norm": 3.2394158840179443, + "learning_rate": 5.892327257312421e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8669539093971252, + "num_tokens": 53074286.0, + "step": 1391 + }, + { + "epoch": 0.1770767077979901, + "ewc_loss": 0.01637166552245617, + "ewc_loss_diag": 1.1324882507324219e-05, + "ewc_loss_parallel": 5.01912618346978e-05, + "grad_norm": 3.2051241397857666, + "learning_rate": 5.896566341670199e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8546310663223267, + "num_tokens": 53113525.0, + "step": 1392 + }, + { + "epoch": 0.1772039180765806, + "ewc_loss": 0.016379252076148987, + "ewc_loss_diag": 1.1324882507324219e-05, + "ewc_loss_parallel": 5.026713915867731e-05, + "grad_norm": 3.256728410720825, + "learning_rate": 5.900805426027977e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.845782458782196, + "num_tokens": 53150137.0, + "step": 1393 + }, + { + "epoch": 0.1773311283551711, + "ewc_loss": 0.01639282889664173, + "ewc_loss_diag": 1.1324882507324219e-05, + "ewc_loss_parallel": 5.0402893975842744e-05, + "grad_norm": 3.168236017227173, + "learning_rate": 5.905044510385756e-07, + "loss": 0.4377, + "mean_token_accuracy": 0.8598669171333313, + "num_tokens": 53189519.0, + "step": 1394 + }, + { + "epoch": 0.17745833863376162, + "ewc_loss": 0.01627206802368164, + "ewc_loss_diag": 1.1265277862548828e-05, + "ewc_loss_parallel": 4.98056469950825e-05, + "grad_norm": 3.216839551925659, + "learning_rate": 5.909283594743535e-07, + "loss": 0.5458, + "mean_token_accuracy": 0.827987790107727, + "num_tokens": 53227452.0, + "step": 1395 + }, + { + "epoch": 0.17758554891235212, + "ewc_loss": 0.016344351693987846, + "ewc_loss_diag": 1.1265277862548828e-05, + "ewc_loss_parallel": 5.0528484280221164e-05, + "grad_norm": 3.250509738922119, + "learning_rate": 5.913522679101314e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8441353440284729, + "num_tokens": 53265875.0, + "step": 1396 + }, + { + "epoch": 0.17771275919094262, + "ewc_loss": 0.016396470367908478, + "ewc_loss_diag": 1.1324882507324219e-05, + "ewc_loss_parallel": 5.043931378168054e-05, + "grad_norm": 3.3186028003692627, + "learning_rate": 5.917761763459093e-07, + "loss": 0.538, + "mean_token_accuracy": 0.8295900225639343, + "num_tokens": 53305477.0, + "step": 1397 + }, + { + "epoch": 0.17783996946953315, + "ewc_loss": 0.016407696530222893, + "ewc_loss_diag": 1.1324882507324219e-05, + "ewc_loss_parallel": 5.055157816968858e-05, + "grad_norm": 3.1970698833465576, + "learning_rate": 5.922000847816871e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8545709252357483, + "num_tokens": 53342800.0, + "step": 1398 + }, + { + "epoch": 0.17796717974812365, + "ewc_loss": 0.016359688714146614, + "ewc_loss_diag": 1.1324882507324219e-05, + "ewc_loss_parallel": 5.0071492296410725e-05, + "grad_norm": 3.1905007362365723, + "learning_rate": 5.926239932174651e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.8354456424713135, + "num_tokens": 53387476.0, + "step": 1399 + }, + { + "epoch": 0.17809439002671415, + "ewc_loss": 0.016451425850391388, + "ewc_loss_diag": 1.138448715209961e-05, + "ewc_loss_parallel": 5.037851951783523e-05, + "grad_norm": 3.2133431434631348, + "learning_rate": 5.930479016532429e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.846436619758606, + "num_tokens": 53424628.0, + "step": 1400 + }, + { + "epoch": 0.17822160030530468, + "ewc_loss": 0.01645856723189354, + "ewc_loss_diag": 1.138448715209961e-05, + "ewc_loss_parallel": 5.044992212788202e-05, + "grad_norm": 3.2366209030151367, + "learning_rate": 5.934718100890207e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8505085110664368, + "num_tokens": 53464459.0, + "step": 1401 + }, + { + "epoch": 0.17834881058389518, + "ewc_loss": 0.016482029110193253, + "ewc_loss_diag": 1.138448715209961e-05, + "ewc_loss_parallel": 5.068454993306659e-05, + "grad_norm": 3.27036714553833, + "learning_rate": 5.938957185247986e-07, + "loss": 0.5103, + "mean_token_accuracy": 0.8349012136459351, + "num_tokens": 53501721.0, + "step": 1402 + }, + { + "epoch": 0.17847602086248568, + "ewc_loss": 0.01650407165288925, + "ewc_loss_diag": 1.138448715209961e-05, + "ewc_loss_parallel": 5.0904978706967086e-05, + "grad_norm": 3.330584764480591, + "learning_rate": 5.943196269605765e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8459129929542542, + "num_tokens": 53536607.0, + "step": 1403 + }, + { + "epoch": 0.1786032311410762, + "ewc_loss": 0.016503162682056427, + "ewc_loss_diag": 1.138448715209961e-05, + "ewc_loss_parallel": 5.089589467388578e-05, + "grad_norm": 3.3416757583618164, + "learning_rate": 5.947435353963544e-07, + "loss": 0.5216, + "mean_token_accuracy": 0.8390560150146484, + "num_tokens": 53569809.0, + "step": 1404 + }, + { + "epoch": 0.1787304414196667, + "ewc_loss": 0.016515854746103287, + "ewc_loss_diag": 1.138448715209961e-05, + "ewc_loss_parallel": 5.102281647850759e-05, + "grad_norm": 3.250972270965576, + "learning_rate": 5.951674438321323e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8419957160949707, + "num_tokens": 53604423.0, + "step": 1405 + }, + { + "epoch": 0.1788576516982572, + "ewc_loss": 0.016464080661535263, + "ewc_loss_diag": 1.138448715209961e-05, + "ewc_loss_parallel": 5.05050593346823e-05, + "grad_norm": 3.2589666843414307, + "learning_rate": 5.955913522679101e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8388532400131226, + "num_tokens": 53642145.0, + "step": 1406 + }, + { + "epoch": 0.17898486197684774, + "ewc_loss": 0.016576625406742096, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.102016802993603e-05, + "grad_norm": 3.239353656768799, + "learning_rate": 5.96015260703688e-07, + "loss": 0.569, + "mean_token_accuracy": 0.8185956478118896, + "num_tokens": 53682998.0, + "step": 1407 + }, + { + "epoch": 0.17911207225543824, + "ewc_loss": 0.016562415286898613, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.08780540258158e-05, + "grad_norm": 3.1582789421081543, + "learning_rate": 5.964391691394659e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8435359001159668, + "num_tokens": 53729764.0, + "step": 1408 + }, + { + "epoch": 0.17923928253402874, + "ewc_loss": 0.016536090523004532, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.0614809879334643e-05, + "grad_norm": 3.280923366546631, + "learning_rate": 5.968630775752436e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.8309985399246216, + "num_tokens": 53764522.0, + "step": 1409 + }, + { + "epoch": 0.17936649281261927, + "ewc_loss": 0.01663072593510151, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.156116822035983e-05, + "grad_norm": 3.3212947845458984, + "learning_rate": 5.972869860110216e-07, + "loss": 0.5178, + "mean_token_accuracy": 0.8321837186813354, + "num_tokens": 53799791.0, + "step": 1410 + }, + { + "epoch": 0.17949370309120977, + "ewc_loss": 0.01659945398569107, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.1248440286144614e-05, + "grad_norm": 3.2556443214416504, + "learning_rate": 5.977108944467994e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8285506963729858, + "num_tokens": 53840181.0, + "step": 1411 + }, + { + "epoch": 0.17962091336980027, + "ewc_loss": 0.016581367701292038, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.106759272166528e-05, + "grad_norm": 3.1729209423065186, + "learning_rate": 5.981348028825774e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8551050424575806, + "num_tokens": 53882813.0, + "step": 1412 + }, + { + "epoch": 0.1797481236483908, + "ewc_loss": 0.01657691039144993, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.102300929138437e-05, + "grad_norm": 3.2451345920562744, + "learning_rate": 5.985587113183552e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.839328408241272, + "num_tokens": 53922198.0, + "step": 1413 + }, + { + "epoch": 0.1798753339269813, + "ewc_loss": 0.01663890853524208, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.164300091564655e-05, + "grad_norm": 3.4776570796966553, + "learning_rate": 5.989826197541331e-07, + "loss": 0.5062, + "mean_token_accuracy": 0.8403400778770447, + "num_tokens": 53959390.0, + "step": 1414 + }, + { + "epoch": 0.18000254420557182, + "ewc_loss": 0.01669890433549881, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.224294000072405e-05, + "grad_norm": 3.1953163146972656, + "learning_rate": 5.99406528189911e-07, + "loss": 0.5326, + "mean_token_accuracy": 0.8369069695472717, + "num_tokens": 54005707.0, + "step": 1415 + }, + { + "epoch": 0.18012975448416232, + "ewc_loss": 0.016545061022043228, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.070452607469633e-05, + "grad_norm": 3.2682948112487793, + "learning_rate": 5.998304366256888e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8412243127822876, + "num_tokens": 54047808.0, + "step": 1416 + }, + { + "epoch": 0.18025696476275282, + "ewc_loss": 0.016640424728393555, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.165814218344167e-05, + "grad_norm": 3.2736315727233887, + "learning_rate": 6.002543450614666e-07, + "loss": 0.4229, + "mean_token_accuracy": 0.8675962686538696, + "num_tokens": 54082929.0, + "step": 1417 + }, + { + "epoch": 0.18038417504134335, + "ewc_loss": 0.016613353043794632, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.138743290444836e-05, + "grad_norm": 3.202256917953491, + "learning_rate": 6.006782534972446e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8463359475135803, + "num_tokens": 54120199.0, + "step": 1418 + }, + { + "epoch": 0.18051138531993385, + "ewc_loss": 0.016597211360931396, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.122602669871412e-05, + "grad_norm": 3.364208459854126, + "learning_rate": 6.011021619330224e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8350180387496948, + "num_tokens": 54158629.0, + "step": 1419 + }, + { + "epoch": 0.18063859559852435, + "ewc_loss": 0.016678933054208755, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.2043236792087555e-05, + "grad_norm": 3.303107976913452, + "learning_rate": 6.015260703688004e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8397423028945923, + "num_tokens": 54194772.0, + "step": 1420 + }, + { + "epoch": 0.18076580587711488, + "ewc_loss": 0.016620714217424393, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.1461043767631054e-05, + "grad_norm": 3.3014492988586426, + "learning_rate": 6.019499788045782e-07, + "loss": 0.506, + "mean_token_accuracy": 0.840990424156189, + "num_tokens": 54229825.0, + "step": 1421 + }, + { + "epoch": 0.18089301615570538, + "ewc_loss": 0.016637805849313736, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.163197420188226e-05, + "grad_norm": 3.3263356685638428, + "learning_rate": 6.023738872403561e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8518572449684143, + "num_tokens": 54260051.0, + "step": 1422 + }, + { + "epoch": 0.18102022643429588, + "ewc_loss": 0.016656197607517242, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.181587766855955e-05, + "grad_norm": 3.267437696456909, + "learning_rate": 6.02797795676134e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.848202109336853, + "num_tokens": 54297865.0, + "step": 1423 + }, + { + "epoch": 0.1811474367128864, + "ewc_loss": 0.016614381223917007, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.1397721108514816e-05, + "grad_norm": 3.3343873023986816, + "learning_rate": 6.032217041119118e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8352597951889038, + "num_tokens": 54334618.0, + "step": 1424 + }, + { + "epoch": 0.1812746469914769, + "ewc_loss": 0.016681130975484848, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.206521746004e-05, + "grad_norm": 3.300222873687744, + "learning_rate": 6.036456125476896e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8277899622917175, + "num_tokens": 54369761.0, + "step": 1425 + }, + { + "epoch": 0.1814018572700674, + "ewc_loss": 0.016639482229948044, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.164872345631011e-05, + "grad_norm": 3.343754529953003, + "learning_rate": 6.040695209834675e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.8565112352371216, + "num_tokens": 54411549.0, + "step": 1426 + }, + { + "epoch": 0.18152906754865794, + "ewc_loss": 0.01666647382080555, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.191864329390228e-05, + "grad_norm": 3.2382419109344482, + "learning_rate": 6.044934294192454e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.835458517074585, + "num_tokens": 54449137.0, + "step": 1427 + }, + { + "epoch": 0.18165627782724844, + "ewc_loss": 0.0166255421936512, + "ewc_loss_diag": 1.1444091796875e-05, + "ewc_loss_parallel": 5.1509337936295196e-05, + "grad_norm": 3.1621246337890625, + "learning_rate": 6.049173378550233e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8494847416877747, + "num_tokens": 54494604.0, + "step": 1428 + }, + { + "epoch": 0.18178348810583894, + "ewc_loss": 0.016631517559289932, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.156909173820168e-05, + "grad_norm": 3.215599536895752, + "learning_rate": 6.053412462908012e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8517562747001648, + "num_tokens": 54533533.0, + "step": 1429 + }, + { + "epoch": 0.18191069838442947, + "ewc_loss": 0.016675181686878204, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.200572923058644e-05, + "grad_norm": 3.246427297592163, + "learning_rate": 6.05765154726579e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8330182433128357, + "num_tokens": 54575211.0, + "step": 1430 + }, + { + "epoch": 0.18203790866301997, + "ewc_loss": 0.016678184270858765, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.2035757107660174e-05, + "grad_norm": 3.1743197441101074, + "learning_rate": 6.061890631623569e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8590633869171143, + "num_tokens": 54618308.0, + "step": 1431 + }, + { + "epoch": 0.18216511894161047, + "ewc_loss": 0.016654012724757195, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.179403524380177e-05, + "grad_norm": 3.21041202545166, + "learning_rate": 6.066129715981347e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8471961617469788, + "num_tokens": 54660260.0, + "step": 1432 + }, + { + "epoch": 0.182292329220201, + "ewc_loss": 0.016698520630598068, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.223912376095541e-05, + "grad_norm": 3.299199342727661, + "learning_rate": 6.070368800339126e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8557674884796143, + "num_tokens": 54692056.0, + "step": 1433 + }, + { + "epoch": 0.1824195394987915, + "ewc_loss": 0.01673012599349022, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.255517680780031e-05, + "grad_norm": 3.2953593730926514, + "learning_rate": 6.074607884696905e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8481701612472534, + "num_tokens": 54729252.0, + "step": 1434 + }, + { + "epoch": 0.182546749777382, + "ewc_loss": 0.01670614816248417, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.2315383072709665e-05, + "grad_norm": 3.322944402694702, + "learning_rate": 6.078846969054684e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8412883877754211, + "num_tokens": 54763422.0, + "step": 1435 + }, + { + "epoch": 0.18267396005597253, + "ewc_loss": 0.016731595620512962, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.256985605228692e-05, + "grad_norm": 3.255425453186035, + "learning_rate": 6.083086053412463e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8494096994400024, + "num_tokens": 54803494.0, + "step": 1436 + }, + { + "epoch": 0.18280117033456303, + "ewc_loss": 0.01668769307434559, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.213083932176232e-05, + "grad_norm": 3.3215622901916504, + "learning_rate": 6.087325137770242e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8532149195671082, + "num_tokens": 54839349.0, + "step": 1437 + }, + { + "epoch": 0.18292838061315353, + "ewc_loss": 0.01673838682472706, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.263776984065771e-05, + "grad_norm": 3.311016321182251, + "learning_rate": 6.09156422212802e-07, + "loss": 0.5306, + "mean_token_accuracy": 0.8361040353775024, + "num_tokens": 54881941.0, + "step": 1438 + }, + { + "epoch": 0.18305559089174406, + "ewc_loss": 0.016710694879293442, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.236085780779831e-05, + "grad_norm": 3.2488224506378174, + "learning_rate": 6.095803306485799e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8437752723693848, + "num_tokens": 54922927.0, + "step": 1439 + }, + { + "epoch": 0.18318280117033456, + "ewc_loss": 0.01670844294130802, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.2338338718982413e-05, + "grad_norm": 3.3252112865448, + "learning_rate": 6.100042390843577e-07, + "loss": 0.508, + "mean_token_accuracy": 0.837791919708252, + "num_tokens": 54963885.0, + "step": 1440 + }, + { + "epoch": 0.1833100114489251, + "ewc_loss": 0.01673794724047184, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.263337880023755e-05, + "grad_norm": 3.265869617462158, + "learning_rate": 6.104281475201356e-07, + "loss": 0.4597, + "mean_token_accuracy": 0.8517081141471863, + "num_tokens": 55001284.0, + "step": 1441 + }, + { + "epoch": 0.1834372217275156, + "ewc_loss": 0.016712907701730728, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.2382980356924236e-05, + "grad_norm": 3.2967257499694824, + "learning_rate": 6.108520559559135e-07, + "loss": 0.5187, + "mean_token_accuracy": 0.8360110521316528, + "num_tokens": 55034813.0, + "step": 1442 + }, + { + "epoch": 0.1835644320061061, + "ewc_loss": 0.016740791499614716, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.26618241565302e-05, + "grad_norm": 3.237273931503296, + "learning_rate": 6.112759643916914e-07, + "loss": 0.4251, + "mean_token_accuracy": 0.8628554940223694, + "num_tokens": 55070620.0, + "step": 1443 + }, + { + "epoch": 0.18369164228469662, + "ewc_loss": 0.016712825745344162, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.238215817371383e-05, + "grad_norm": 3.360623836517334, + "learning_rate": 6.116998728274693e-07, + "loss": 0.5062, + "mean_token_accuracy": 0.8438907861709595, + "num_tokens": 55108394.0, + "step": 1444 + }, + { + "epoch": 0.18381885256328712, + "ewc_loss": 0.01679704152047634, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.322432116372511e-05, + "grad_norm": 3.294522523880005, + "learning_rate": 6.121237812632472e-07, + "loss": 0.5092, + "mean_token_accuracy": 0.8367646932601929, + "num_tokens": 55148170.0, + "step": 1445 + }, + { + "epoch": 0.18394606284187762, + "ewc_loss": 0.016716470941901207, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.2418617997318506e-05, + "grad_norm": 3.2126309871673584, + "learning_rate": 6.125476896990249e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8572472333908081, + "num_tokens": 55188102.0, + "step": 1446 + }, + { + "epoch": 0.18407327312046814, + "ewc_loss": 0.01673908904194832, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.2644805691670626e-05, + "grad_norm": 3.310511350631714, + "learning_rate": 6.129715981348028e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8484984636306763, + "num_tokens": 55225893.0, + "step": 1447 + }, + { + "epoch": 0.18420048339905865, + "ewc_loss": 0.016796177253127098, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.321568460203707e-05, + "grad_norm": 3.3480539321899414, + "learning_rate": 6.133955065705807e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8353408575057983, + "num_tokens": 55265321.0, + "step": 1448 + }, + { + "epoch": 0.18432769367764915, + "ewc_loss": 0.016795657575130463, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.321048229234293e-05, + "grad_norm": 3.2649290561676025, + "learning_rate": 6.138194150063585e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8509968519210815, + "num_tokens": 55304179.0, + "step": 1449 + }, + { + "epoch": 0.18445490395623967, + "ewc_loss": 0.01675749197602272, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.282883284962736e-05, + "grad_norm": 3.262500524520874, + "learning_rate": 6.142433234421365e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8386369347572327, + "num_tokens": 55343266.0, + "step": 1450 + }, + { + "epoch": 0.18458211423483017, + "ewc_loss": 0.016787977889180183, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.313368819770403e-05, + "grad_norm": 3.303765296936035, + "learning_rate": 6.146672318779143e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8384686708450317, + "num_tokens": 55381147.0, + "step": 1451 + }, + { + "epoch": 0.18470932451342068, + "ewc_loss": 0.016933515667915344, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.336835602065548e-05, + "grad_norm": 3.3675336837768555, + "learning_rate": 6.150911403136923e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8410065174102783, + "num_tokens": 55414855.0, + "step": 1452 + }, + { + "epoch": 0.1848365347920112, + "ewc_loss": 0.01681930385529995, + "ewc_loss_diag": 1.150369644165039e-05, + "ewc_loss_parallel": 5.3446947276825085e-05, + "grad_norm": 3.2988202571868896, + "learning_rate": 6.155150487494701e-07, + "loss": 0.433, + "mean_token_accuracy": 0.861290693283081, + "num_tokens": 55449699.0, + "step": 1453 + }, + { + "epoch": 0.1849637450706017, + "ewc_loss": 0.01685037463903427, + "ewc_loss_diag": 1.1563301086425781e-05, + "ewc_loss_parallel": 5.3147294238442555e-05, + "grad_norm": 3.2343432903289795, + "learning_rate": 6.159389571852479e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8540113568305969, + "num_tokens": 55493202.0, + "step": 1454 + }, + { + "epoch": 0.1850909553491922, + "ewc_loss": 0.01689903438091278, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.302354475134052e-05, + "grad_norm": 3.2575109004974365, + "learning_rate": 6.163628656210258e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8491994738578796, + "num_tokens": 55539873.0, + "step": 1455 + }, + { + "epoch": 0.18521816562778273, + "ewc_loss": 0.016871245577931404, + "ewc_loss_diag": 1.1563301086425781e-05, + "ewc_loss_parallel": 5.335600508260541e-05, + "grad_norm": 3.3360254764556885, + "learning_rate": 6.167867740568037e-07, + "loss": 0.5413, + "mean_token_accuracy": 0.8307304978370667, + "num_tokens": 55576338.0, + "step": 1456 + }, + { + "epoch": 0.18534537590637323, + "ewc_loss": 0.016965368762612343, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.3686893807025626e-05, + "grad_norm": 3.236421585083008, + "learning_rate": 6.172106824925815e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8506685495376587, + "num_tokens": 55618330.0, + "step": 1457 + }, + { + "epoch": 0.18547258618496373, + "ewc_loss": 0.01691044121980667, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.31376208527945e-05, + "grad_norm": 3.2736799716949463, + "learning_rate": 6.176345909283595e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8460164070129395, + "num_tokens": 55657394.0, + "step": 1458 + }, + { + "epoch": 0.18559979646355426, + "ewc_loss": 0.016964010894298553, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.3673309594159946e-05, + "grad_norm": 3.3534600734710693, + "learning_rate": 6.180584993641373e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8517827987670898, + "num_tokens": 55695087.0, + "step": 1459 + }, + { + "epoch": 0.18572700674214476, + "ewc_loss": 0.01698516309261322, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.3884839871898293e-05, + "grad_norm": 3.3562397956848145, + "learning_rate": 6.184824077999153e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.855293869972229, + "num_tokens": 55730852.0, + "step": 1460 + }, + { + "epoch": 0.18585421702073526, + "ewc_loss": 0.0169534832239151, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.356804103939794e-05, + "grad_norm": 3.3722715377807617, + "learning_rate": 6.189063162356931e-07, + "loss": 0.545, + "mean_token_accuracy": 0.8298719525337219, + "num_tokens": 55765678.0, + "step": 1461 + }, + { + "epoch": 0.1859814272993258, + "ewc_loss": 0.01697785221040249, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.381172741181217e-05, + "grad_norm": 3.342862844467163, + "learning_rate": 6.193302246714709e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8524218797683716, + "num_tokens": 55804409.0, + "step": 1462 + }, + { + "epoch": 0.1861086375779163, + "ewc_loss": 0.016949418932199478, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.352739754016511e-05, + "grad_norm": 3.3119258880615234, + "learning_rate": 6.197541331072488e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8409723043441772, + "num_tokens": 55839520.0, + "step": 1463 + }, + { + "epoch": 0.1862358478565068, + "ewc_loss": 0.01695779711008072, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.361116927815601e-05, + "grad_norm": 3.275702953338623, + "learning_rate": 6.201780415430267e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8459527492523193, + "num_tokens": 55879089.0, + "step": 1464 + }, + { + "epoch": 0.18636305813509732, + "ewc_loss": 0.016957838088274002, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.3611591283697635e-05, + "grad_norm": 3.359004020690918, + "learning_rate": 6.206019499788045e-07, + "loss": 0.5222, + "mean_token_accuracy": 0.834191083908081, + "num_tokens": 55916655.0, + "step": 1465 + }, + { + "epoch": 0.18649026841368782, + "ewc_loss": 0.01699821837246418, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.4015381465433165e-05, + "grad_norm": 3.269416570663452, + "learning_rate": 6.210258584145825e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.8516116142272949, + "num_tokens": 55953828.0, + "step": 1466 + }, + { + "epoch": 0.18661747869227835, + "ewc_loss": 0.017015481367707253, + "ewc_loss_diag": 1.1682510375976562e-05, + "ewc_loss_parallel": 5.35776634933427e-05, + "grad_norm": 3.3053500652313232, + "learning_rate": 6.214497668503603e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8354433178901672, + "num_tokens": 55992037.0, + "step": 1467 + }, + { + "epoch": 0.18674468897086885, + "ewc_loss": 0.017005812376737595, + "ewc_loss_diag": 1.1622905731201172e-05, + "ewc_loss_parallel": 5.409131699707359e-05, + "grad_norm": 3.4074771404266357, + "learning_rate": 6.218736752861383e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8516159057617188, + "num_tokens": 56020424.0, + "step": 1468 + }, + { + "epoch": 0.18687189924945935, + "ewc_loss": 0.017166338860988617, + "ewc_loss_diag": 1.1742115020751953e-05, + "ewc_loss_parallel": 5.447589501272887e-05, + "grad_norm": 3.324795722961426, + "learning_rate": 6.22297583721916e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8449679613113403, + "num_tokens": 56060839.0, + "step": 1469 + }, + { + "epoch": 0.18699910952804988, + "ewc_loss": 0.017108706757426262, + "ewc_loss_diag": 1.1742115020751953e-05, + "ewc_loss_parallel": 5.3899573686067015e-05, + "grad_norm": 3.363744020462036, + "learning_rate": 6.227214921576938e-07, + "loss": 0.5304, + "mean_token_accuracy": 0.8357738852500916, + "num_tokens": 56099095.0, + "step": 1470 + }, + { + "epoch": 0.18712631980664038, + "ewc_loss": 0.01715943031013012, + "ewc_loss_diag": 1.1742115020751953e-05, + "ewc_loss_parallel": 5.440680615720339e-05, + "grad_norm": 3.2805819511413574, + "learning_rate": 6.231454005934718e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.8639136552810669, + "num_tokens": 56133805.0, + "step": 1471 + }, + { + "epoch": 0.18725353008523088, + "ewc_loss": 0.01717667281627655, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.396888809627853e-05, + "grad_norm": 3.3539071083068848, + "learning_rate": 6.235693090292496e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8305935859680176, + "num_tokens": 56173064.0, + "step": 1472 + }, + { + "epoch": 0.1873807403638214, + "ewc_loss": 0.017236053943634033, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.456268627312966e-05, + "grad_norm": 3.2147388458251953, + "learning_rate": 6.239932174650275e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8641571998596191, + "num_tokens": 56213285.0, + "step": 1473 + }, + { + "epoch": 0.1875079506424119, + "ewc_loss": 0.017159437760710716, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.3796524298377335e-05, + "grad_norm": 3.4556095600128174, + "learning_rate": 6.244171259008054e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8363885879516602, + "num_tokens": 56248218.0, + "step": 1474 + }, + { + "epoch": 0.1876351609210024, + "ewc_loss": 0.017321623861789703, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5418378906324506e-05, + "grad_norm": 3.2708475589752197, + "learning_rate": 6.248410343365833e-07, + "loss": 0.455, + "mean_token_accuracy": 0.8551276922225952, + "num_tokens": 56286098.0, + "step": 1475 + }, + { + "epoch": 0.18776237119959294, + "ewc_loss": 0.017166519537568092, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.386734483181499e-05, + "grad_norm": 3.230548858642578, + "learning_rate": 6.252649427723612e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8550453186035156, + "num_tokens": 56329732.0, + "step": 1476 + }, + { + "epoch": 0.18788958147818344, + "ewc_loss": 0.01721530593931675, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.435520870378241e-05, + "grad_norm": 3.3085577487945557, + "learning_rate": 6.25688851208139e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.8523505330085754, + "num_tokens": 56367838.0, + "step": 1477 + }, + { + "epoch": 0.18801679175677394, + "ewc_loss": 0.01725853607058525, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.478751700138673e-05, + "grad_norm": 3.3061769008636475, + "learning_rate": 6.261127596439168e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8333740234375, + "num_tokens": 56413059.0, + "step": 1478 + }, + { + "epoch": 0.18814400203536447, + "ewc_loss": 0.017228195443749428, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.448409865493886e-05, + "grad_norm": 3.2933998107910156, + "learning_rate": 6.265366680796948e-07, + "loss": 0.4237, + "mean_token_accuracy": 0.8601126074790955, + "num_tokens": 56453806.0, + "step": 1479 + }, + { + "epoch": 0.18827121231395497, + "ewc_loss": 0.017231576144695282, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.4517906391993165e-05, + "grad_norm": 3.2923545837402344, + "learning_rate": 6.269605765154726e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8486183881759644, + "num_tokens": 56497448.0, + "step": 1480 + }, + { + "epoch": 0.18839842259254547, + "ewc_loss": 0.017237573862075806, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.457788938656449e-05, + "grad_norm": 3.334549903869629, + "learning_rate": 6.273844849512505e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.848259449005127, + "num_tokens": 56537731.0, + "step": 1481 + }, + { + "epoch": 0.188525632871136, + "ewc_loss": 0.017260780557990074, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.480994877871126e-05, + "grad_norm": 3.239180088043213, + "learning_rate": 6.278083933870284e-07, + "loss": 0.4135, + "mean_token_accuracy": 0.8686017990112305, + "num_tokens": 56580944.0, + "step": 1482 + }, + { + "epoch": 0.1886528431497265, + "ewc_loss": 0.017199095338582993, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.419310764409602e-05, + "grad_norm": 3.3820345401763916, + "learning_rate": 6.282323018228063e-07, + "loss": 0.4452, + "mean_token_accuracy": 0.8591534495353699, + "num_tokens": 56616306.0, + "step": 1483 + }, + { + "epoch": 0.188780053428317, + "ewc_loss": 0.017310503870248795, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5307198636000976e-05, + "grad_norm": 3.2748475074768066, + "learning_rate": 6.286562102585841e-07, + "loss": 0.5237, + "mean_token_accuracy": 0.8375691175460815, + "num_tokens": 56661874.0, + "step": 1484 + }, + { + "epoch": 0.18890726370690752, + "ewc_loss": 0.017205707728862762, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.425922790891491e-05, + "grad_norm": 3.3176374435424805, + "learning_rate": 6.29080118694362e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.8646081686019897, + "num_tokens": 56696697.0, + "step": 1485 + }, + { + "epoch": 0.18903447398549802, + "ewc_loss": 0.017277024686336517, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.497238817042671e-05, + "grad_norm": 3.334580183029175, + "learning_rate": 6.295040271301398e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8637829422950745, + "num_tokens": 56732581.0, + "step": 1486 + }, + { + "epoch": 0.18916168426408853, + "ewc_loss": 0.017258575186133385, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.478790626511909e-05, + "grad_norm": 3.2718911170959473, + "learning_rate": 6.299279355659178e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.8516421914100647, + "num_tokens": 56777905.0, + "step": 1487 + }, + { + "epoch": 0.18928889454267905, + "ewc_loss": 0.01724213920533657, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.4623546020593494e-05, + "grad_norm": 3.3727962970733643, + "learning_rate": 6.303518440016956e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8444234132766724, + "num_tokens": 56812915.0, + "step": 1488 + }, + { + "epoch": 0.18941610482126955, + "ewc_loss": 0.017316093668341637, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.536308526643552e-05, + "grad_norm": 3.4201090335845947, + "learning_rate": 6.307757524374735e-07, + "loss": 0.4823, + "mean_token_accuracy": 0.8495873212814331, + "num_tokens": 56846907.0, + "step": 1489 + }, + { + "epoch": 0.18954331509986008, + "ewc_loss": 0.017284531146287918, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5047461501089856e-05, + "grad_norm": 3.2952637672424316, + "learning_rate": 6.311996608732514e-07, + "loss": 0.5113, + "mean_token_accuracy": 0.8425076603889465, + "num_tokens": 56884614.0, + "step": 1490 + }, + { + "epoch": 0.18967052537845058, + "ewc_loss": 0.017235253006219864, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.4554682719754055e-05, + "grad_norm": 3.3639373779296875, + "learning_rate": 6.316235693090292e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8389089107513428, + "num_tokens": 56920508.0, + "step": 1491 + }, + { + "epoch": 0.18979773565704108, + "ewc_loss": 0.017313070595264435, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.533286457648501e-05, + "grad_norm": 3.3185653686523438, + "learning_rate": 6.320474777448071e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8389542698860168, + "num_tokens": 56957629.0, + "step": 1492 + }, + { + "epoch": 0.1899249459356316, + "ewc_loss": 0.017276551574468613, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.4967655159998685e-05, + "grad_norm": 3.305436611175537, + "learning_rate": 6.324713861805849e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8502812385559082, + "num_tokens": 56997265.0, + "step": 1493 + }, + { + "epoch": 0.1900521562142221, + "ewc_loss": 0.017282376065850258, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.502590647665784e-05, + "grad_norm": 3.441462278366089, + "learning_rate": 6.328952946163628e-07, + "loss": 0.5639, + "mean_token_accuracy": 0.8250960111618042, + "num_tokens": 57031519.0, + "step": 1494 + }, + { + "epoch": 0.1901793664928126, + "ewc_loss": 0.01736075058579445, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5809661716921255e-05, + "grad_norm": 3.304936647415161, + "learning_rate": 6.333192030521407e-07, + "loss": 0.4368, + "mean_token_accuracy": 0.8566893339157104, + "num_tokens": 57067264.0, + "step": 1495 + }, + { + "epoch": 0.19030657677140314, + "ewc_loss": 0.017246074974536896, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.4662898037349805e-05, + "grad_norm": 3.3473803997039795, + "learning_rate": 6.337431114879186e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8451068997383118, + "num_tokens": 57104906.0, + "step": 1496 + }, + { + "epoch": 0.19043378704999364, + "ewc_loss": 0.017342202365398407, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5624164815526456e-05, + "grad_norm": 3.3413267135620117, + "learning_rate": 6.341670199236965e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8547389507293701, + "num_tokens": 57142534.0, + "step": 1497 + }, + { + "epoch": 0.19056099732858414, + "ewc_loss": 0.017313389107584953, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5336033256025985e-05, + "grad_norm": 3.2310495376586914, + "learning_rate": 6.345909283594744e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8591296076774597, + "num_tokens": 57188735.0, + "step": 1498 + }, + { + "epoch": 0.19068820760717467, + "ewc_loss": 0.017275631427764893, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.495846562553197e-05, + "grad_norm": 3.329665184020996, + "learning_rate": 6.350148367952522e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8383133411407471, + "num_tokens": 57224610.0, + "step": 1499 + }, + { + "epoch": 0.19081541788576517, + "ewc_loss": 0.017382020130753517, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.602234887192026e-05, + "grad_norm": 3.3889641761779785, + "learning_rate": 6.354387452310301e-07, + "loss": 0.447, + "mean_token_accuracy": 0.8536599278450012, + "num_tokens": 57259553.0, + "step": 1500 + }, + { + "epoch": 0.19094262816435567, + "ewc_loss": 0.017355524003505707, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.575738396146335e-05, + "grad_norm": 3.3648102283477783, + "learning_rate": 6.358626536668079e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8563869595527649, + "num_tokens": 57295819.0, + "step": 1501 + }, + { + "epoch": 0.1910698384429462, + "ewc_loss": 0.017357291653752327, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.577507181442343e-05, + "grad_norm": 3.274977207183838, + "learning_rate": 6.362865621025858e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8604402542114258, + "num_tokens": 57339110.0, + "step": 1502 + }, + { + "epoch": 0.1911970487215367, + "ewc_loss": 0.01731308549642563, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5333006457658485e-05, + "grad_norm": 3.3533356189727783, + "learning_rate": 6.367104705383637e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8506738543510437, + "num_tokens": 57377311.0, + "step": 1503 + }, + { + "epoch": 0.1913242590001272, + "ewc_loss": 0.017378568649291992, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.598783172899857e-05, + "grad_norm": 3.320223808288574, + "learning_rate": 6.371343789741416e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8432973623275757, + "num_tokens": 57419281.0, + "step": 1504 + }, + { + "epoch": 0.19145146927871773, + "ewc_loss": 0.017402974888682365, + "ewc_loss_diag": 1.1861324310302734e-05, + "ewc_loss_parallel": 5.562154183280654e-05, + "grad_norm": 3.681049346923828, + "learning_rate": 6.375582874099195e-07, + "loss": 0.468, + "mean_token_accuracy": 0.850002110004425, + "num_tokens": 57458134.0, + "step": 1505 + }, + { + "epoch": 0.19157867955730823, + "ewc_loss": 0.01750207506120205, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.722289643017575e-05, + "grad_norm": 3.442058563232422, + "learning_rate": 6.379821958456974e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8338040709495544, + "num_tokens": 57492931.0, + "step": 1506 + }, + { + "epoch": 0.19170588983589873, + "ewc_loss": 0.01728048175573349, + "ewc_loss_diag": 1.1801719665527344e-05, + "ewc_loss_parallel": 5.5006959883030504e-05, + "grad_norm": 3.369307518005371, + "learning_rate": 6.384061042814751e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.843451976776123, + "num_tokens": 57529581.0, + "step": 1507 + }, + { + "epoch": 0.19183310011448926, + "ewc_loss": 0.0174529068171978, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.551051799557172e-05, + "grad_norm": 3.3497695922851562, + "learning_rate": 6.38830012717253e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8470724821090698, + "num_tokens": 57570302.0, + "step": 1508 + }, + { + "epoch": 0.19196031039307976, + "ewc_loss": 0.017453929409384727, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.552073344006203e-05, + "grad_norm": 3.310232400894165, + "learning_rate": 6.392539211530309e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8423900008201599, + "num_tokens": 57615974.0, + "step": 1509 + }, + { + "epoch": 0.19208752067167026, + "ewc_loss": 0.01744568720459938, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.543832594412379e-05, + "grad_norm": 3.3289124965667725, + "learning_rate": 6.396778295888087e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8564152121543884, + "num_tokens": 57653737.0, + "step": 1510 + }, + { + "epoch": 0.1922147309502608, + "ewc_loss": 0.01746968924999237, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.567834887187928e-05, + "grad_norm": 3.326582193374634, + "learning_rate": 6.401017380245867e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8432922959327698, + "num_tokens": 57693431.0, + "step": 1511 + }, + { + "epoch": 0.1923419412288513, + "ewc_loss": 0.017471488565206528, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.569632776314393e-05, + "grad_norm": 3.2766199111938477, + "learning_rate": 6.405256464603645e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8481905460357666, + "num_tokens": 57736413.0, + "step": 1512 + }, + { + "epoch": 0.1924691515074418, + "ewc_loss": 0.017452973872423172, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.5511172831756994e-05, + "grad_norm": 3.3455169200897217, + "learning_rate": 6.409495548961425e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8558534383773804, + "num_tokens": 57773293.0, + "step": 1513 + }, + { + "epoch": 0.19259636178603232, + "ewc_loss": 0.017513921484351158, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.612066524918191e-05, + "grad_norm": 3.397819995880127, + "learning_rate": 6.413734633319203e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8503740429878235, + "num_tokens": 57809294.0, + "step": 1514 + }, + { + "epoch": 0.19272357206462282, + "ewc_loss": 0.01752491481602192, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.623059769277461e-05, + "grad_norm": 3.401013135910034, + "learning_rate": 6.417973717676981e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8559684157371521, + "num_tokens": 57843610.0, + "step": 1515 + }, + { + "epoch": 0.19285078234321335, + "ewc_loss": 0.01752312295138836, + "ewc_loss_diag": 1.1920928955078125e-05, + "ewc_loss_parallel": 5.621266973321326e-05, + "grad_norm": 3.2825510501861572, + "learning_rate": 6.42221280203476e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8373810648918152, + "num_tokens": 57893098.0, + "step": 1516 + }, + { + "epoch": 0.19297799262180385, + "ewc_loss": 0.017535662278532982, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.5727716244291514e-05, + "grad_norm": 3.356306552886963, + "learning_rate": 6.426451886392539e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8367447853088379, + "num_tokens": 57933098.0, + "step": 1517 + }, + { + "epoch": 0.19310520290039435, + "ewc_loss": 0.017617346718907356, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.6544555263826624e-05, + "grad_norm": 3.392927408218384, + "learning_rate": 6.430690970750317e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8525430560112, + "num_tokens": 57971502.0, + "step": 1518 + }, + { + "epoch": 0.19323241317898487, + "ewc_loss": 0.017606552690267563, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.643662007059902e-05, + "grad_norm": 3.495029926300049, + "learning_rate": 6.434930055108097e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8400391936302185, + "num_tokens": 58001557.0, + "step": 1519 + }, + { + "epoch": 0.19335962345757537, + "ewc_loss": 0.0176534466445446, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.6905562814790756e-05, + "grad_norm": 3.4434335231781006, + "learning_rate": 6.439169139465875e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8345636129379272, + "num_tokens": 58038058.0, + "step": 1520 + }, + { + "epoch": 0.19348683373616588, + "ewc_loss": 0.017608296126127243, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.645404598908499e-05, + "grad_norm": 3.364332675933838, + "learning_rate": 6.443408223823655e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8432286977767944, + "num_tokens": 58073138.0, + "step": 1521 + }, + { + "epoch": 0.1936140440147564, + "ewc_loss": 0.017609519883990288, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.6466298701707274e-05, + "grad_norm": 3.3538596630096436, + "learning_rate": 6.447647308181432e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8475463390350342, + "num_tokens": 58111793.0, + "step": 1522 + }, + { + "epoch": 0.1937412542933469, + "ewc_loss": 0.017628155648708344, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.665266144205816e-05, + "grad_norm": 3.3325798511505127, + "learning_rate": 6.451886392539211e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8503341674804688, + "num_tokens": 58152911.0, + "step": 1523 + }, + { + "epoch": 0.1938684645719374, + "ewc_loss": 0.017621038481593132, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.658147347276099e-05, + "grad_norm": 3.3491411209106445, + "learning_rate": 6.45612547689699e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8464593887329102, + "num_tokens": 58192095.0, + "step": 1524 + }, + { + "epoch": 0.19399567485052793, + "ewc_loss": 0.017649171873927116, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.686281656380743e-05, + "grad_norm": 3.339947462081909, + "learning_rate": 6.460364561254769e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8589904308319092, + "num_tokens": 58232651.0, + "step": 1525 + }, + { + "epoch": 0.19412288512911843, + "ewc_loss": 0.01765645667910576, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.693565253750421e-05, + "grad_norm": 3.3616514205932617, + "learning_rate": 6.464603645612547e-07, + "loss": 0.4377, + "mean_token_accuracy": 0.859582781791687, + "num_tokens": 58270802.0, + "step": 1526 + }, + { + "epoch": 0.19425009540770893, + "ewc_loss": 0.01766880601644516, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.7059151004068553e-05, + "grad_norm": 3.3301095962524414, + "learning_rate": 6.468842729970327e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8520521521568298, + "num_tokens": 58313179.0, + "step": 1527 + }, + { + "epoch": 0.19437730568629946, + "ewc_loss": 0.017644278705120087, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.6813871196936816e-05, + "grad_norm": 3.4385030269622803, + "learning_rate": 6.473081814328105e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.843193769454956, + "num_tokens": 58354728.0, + "step": 1528 + }, + { + "epoch": 0.19450451596488996, + "ewc_loss": 0.01770465075969696, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.741760469391011e-05, + "grad_norm": 3.3412275314331055, + "learning_rate": 6.477320898685885e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8461407423019409, + "num_tokens": 58398649.0, + "step": 1529 + }, + { + "epoch": 0.19463172624348046, + "ewc_loss": 0.01768502965569496, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.6611050240462646e-05, + "grad_norm": 3.377896785736084, + "learning_rate": 6.481559983043662e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8394256830215454, + "num_tokens": 58438261.0, + "step": 1530 + }, + { + "epoch": 0.194758936522071, + "ewc_loss": 0.01767471805214882, + "ewc_loss_diag": 1.1980533599853516e-05, + "ewc_loss_parallel": 5.711826452170499e-05, + "grad_norm": 3.3596270084381104, + "learning_rate": 6.48579906740144e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8568514585494995, + "num_tokens": 58479141.0, + "step": 1531 + }, + { + "epoch": 0.1948861468006615, + "ewc_loss": 0.017726242542266846, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.702316775568761e-05, + "grad_norm": 3.399352788925171, + "learning_rate": 6.49003815175922e-07, + "loss": 0.4179, + "mean_token_accuracy": 0.8670610189437866, + "num_tokens": 58515700.0, + "step": 1532 + }, + { + "epoch": 0.195013357079252, + "ewc_loss": 0.01775343529880047, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.7295092119602486e-05, + "grad_norm": 3.4111101627349854, + "learning_rate": 6.494277236116998e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8653063774108887, + "num_tokens": 58549825.0, + "step": 1533 + }, + { + "epoch": 0.19514056735784252, + "ewc_loss": 0.017751485109329224, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.727559982915409e-05, + "grad_norm": 3.328725814819336, + "learning_rate": 6.498516320474777e-07, + "loss": 0.5633, + "mean_token_accuracy": 0.826114296913147, + "num_tokens": 58596110.0, + "step": 1534 + }, + { + "epoch": 0.19526777763643302, + "ewc_loss": 0.01773025654256344, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.706330921384506e-05, + "grad_norm": 3.4097702503204346, + "learning_rate": 6.502755404832556e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8477004170417786, + "num_tokens": 58632410.0, + "step": 1535 + }, + { + "epoch": 0.19539498791502352, + "ewc_loss": 0.017795246094465256, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.7713208661880344e-05, + "grad_norm": 3.4380176067352295, + "learning_rate": 6.506994489190335e-07, + "loss": 0.5353, + "mean_token_accuracy": 0.8352293968200684, + "num_tokens": 58671339.0, + "step": 1536 + }, + { + "epoch": 0.19552219819361405, + "ewc_loss": 0.017775243148207664, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.751316712121479e-05, + "grad_norm": 3.3378353118896484, + "learning_rate": 6.511233573548114e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8596804141998291, + "num_tokens": 58711786.0, + "step": 1537 + }, + { + "epoch": 0.19564940847220455, + "ewc_loss": 0.017749378457665443, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.7254528655903414e-05, + "grad_norm": 3.435187578201294, + "learning_rate": 6.515472657905892e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8468414545059204, + "num_tokens": 58743600.0, + "step": 1538 + }, + { + "epoch": 0.19577661875079505, + "ewc_loss": 0.017833083868026733, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.809158756164834e-05, + "grad_norm": 3.4337148666381836, + "learning_rate": 6.51971174226367e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.8319717645645142, + "num_tokens": 58780622.0, + "step": 1539 + }, + { + "epoch": 0.19590382902938558, + "ewc_loss": 0.017795663326978683, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.771738142357208e-05, + "grad_norm": 3.3989851474761963, + "learning_rate": 6.52395082662145e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.8334370255470276, + "num_tokens": 58819370.0, + "step": 1540 + }, + { + "epoch": 0.19603103930797608, + "ewc_loss": 0.01779796928167343, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.7740438933251426e-05, + "grad_norm": 3.4006974697113037, + "learning_rate": 6.528189910979228e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8557336330413818, + "num_tokens": 58856270.0, + "step": 1541 + }, + { + "epoch": 0.1961582495865666, + "ewc_loss": 0.0178083386272192, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.7844124967232347e-05, + "grad_norm": 3.3154799938201904, + "learning_rate": 6.532428995337007e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8498893976211548, + "num_tokens": 58898702.0, + "step": 1542 + }, + { + "epoch": 0.1962854598651571, + "ewc_loss": 0.017786622047424316, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.7626959460321814e-05, + "grad_norm": 3.446150541305542, + "learning_rate": 6.536668079694786e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8415204882621765, + "num_tokens": 58934602.0, + "step": 1543 + }, + { + "epoch": 0.1964126701437476, + "ewc_loss": 0.017920993268489838, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.83603214181494e-05, + "grad_norm": 3.355163097381592, + "learning_rate": 6.540907164052565e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8472824096679688, + "num_tokens": 58977766.0, + "step": 1544 + }, + { + "epoch": 0.19653988042233814, + "ewc_loss": 0.017794061452150345, + "ewc_loss_diag": 1.2040138244628906e-05, + "ewc_loss_parallel": 5.7701363402884454e-05, + "grad_norm": 3.3550798892974854, + "learning_rate": 6.545146248410343e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8450092673301697, + "num_tokens": 59019649.0, + "step": 1545 + }, + { + "epoch": 0.19666709070092864, + "ewc_loss": 0.01788835972547531, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.803398380521685e-05, + "grad_norm": 3.4210822582244873, + "learning_rate": 6.549385332768122e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8428571224212646, + "num_tokens": 59060543.0, + "step": 1546 + }, + { + "epoch": 0.19679430097951914, + "ewc_loss": 0.017908945679664612, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.823985338793136e-05, + "grad_norm": 3.368934392929077, + "learning_rate": 6.5536244171259e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.8413122892379761, + "num_tokens": 59100049.0, + "step": 1547 + }, + { + "epoch": 0.19692151125810967, + "ewc_loss": 0.017872072756290436, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.787111876998097e-05, + "grad_norm": 3.4032278060913086, + "learning_rate": 6.55786350148368e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.839457631111145, + "num_tokens": 59137370.0, + "step": 1548 + }, + { + "epoch": 0.19704872153670017, + "ewc_loss": 0.017914408817887306, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.8294477639719844e-05, + "grad_norm": 3.471973419189453, + "learning_rate": 6.562102585841458e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8491469621658325, + "num_tokens": 59176196.0, + "step": 1549 + }, + { + "epoch": 0.19717593181529067, + "ewc_loss": 0.017939597368240356, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.854635310242884e-05, + "grad_norm": 3.359395980834961, + "learning_rate": 6.566341670199236e-07, + "loss": 0.4556, + "mean_token_accuracy": 0.8546842932701111, + "num_tokens": 59216215.0, + "step": 1550 + }, + { + "epoch": 0.1973031420938812, + "ewc_loss": 0.01786223240196705, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.7772711443249136e-05, + "grad_norm": 3.4167604446411133, + "learning_rate": 6.570580754557016e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8495810031890869, + "num_tokens": 59254173.0, + "step": 1551 + }, + { + "epoch": 0.1974303523724717, + "ewc_loss": 0.017954975366592407, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.870013774256222e-05, + "grad_norm": 3.418294906616211, + "learning_rate": 6.574819838914794e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.857912540435791, + "num_tokens": 59288496.0, + "step": 1552 + }, + { + "epoch": 0.1975575626510622, + "ewc_loss": 0.01791355013847351, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.8285895647713915e-05, + "grad_norm": 3.340439558029175, + "learning_rate": 6.579058923272573e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8403000235557556, + "num_tokens": 59329043.0, + "step": 1553 + }, + { + "epoch": 0.19768477292965272, + "ewc_loss": 0.01808715984225273, + "ewc_loss_diag": 1.2278556823730469e-05, + "ewc_loss_parallel": 5.8190926210954785e-05, + "grad_norm": 13.639182090759277, + "learning_rate": 6.583298007630351e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.8435638546943665, + "num_tokens": 59360743.0, + "step": 1554 + }, + { + "epoch": 0.19781198320824323, + "ewc_loss": 0.021256567910313606, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 9.171606507152319e-05, + "grad_norm": 4.502940654754639, + "learning_rate": 6.58753709198813e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8439156413078308, + "num_tokens": 59404167.0, + "step": 1555 + }, + { + "epoch": 0.19793919348683373, + "ewc_loss": 0.018536366522312164, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.451406079577282e-05, + "grad_norm": 3.109849214553833, + "learning_rate": 6.591776176345909e-07, + "loss": 0.4589, + "mean_token_accuracy": 0.8502733707427979, + "num_tokens": 59447770.0, + "step": 1556 + }, + { + "epoch": 0.19806640376542425, + "ewc_loss": 0.018233492970466614, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.148531974758953e-05, + "grad_norm": 3.810377836227417, + "learning_rate": 6.596015260703688e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8544070720672607, + "num_tokens": 59494989.0, + "step": 1557 + }, + { + "epoch": 0.19819361404401475, + "ewc_loss": 0.019221793860197067, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 7.136834028642625e-05, + "grad_norm": 3.757722854614258, + "learning_rate": 6.600254345061466e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8559300899505615, + "num_tokens": 59534926.0, + "step": 1558 + }, + { + "epoch": 0.19832082432260525, + "ewc_loss": 0.01840389333665371, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.318932719295844e-05, + "grad_norm": 3.5409557819366455, + "learning_rate": 6.604493429419246e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8383587002754211, + "num_tokens": 59577832.0, + "step": 1559 + }, + { + "epoch": 0.19844803460119578, + "ewc_loss": 0.018292158842086792, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.207197293406352e-05, + "grad_norm": 3.56530499458313, + "learning_rate": 6.608732513777023e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.843935489654541, + "num_tokens": 59624995.0, + "step": 1560 + }, + { + "epoch": 0.19857524487978628, + "ewc_loss": 0.018346963450312614, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.262002716539428e-05, + "grad_norm": 3.511695146560669, + "learning_rate": 6.612971598134803e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8498311042785645, + "num_tokens": 59663000.0, + "step": 1561 + }, + { + "epoch": 0.19870245515837678, + "ewc_loss": 0.018207309767603874, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.122348713688552e-05, + "grad_norm": 3.587510347366333, + "learning_rate": 6.617210682492581e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.8353309035301208, + "num_tokens": 59700241.0, + "step": 1562 + }, + { + "epoch": 0.1988296654369673, + "ewc_loss": 0.018217658624053, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.132698035798967e-05, + "grad_norm": 3.505474090576172, + "learning_rate": 6.62144976685036e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8591655492782593, + "num_tokens": 59737244.0, + "step": 1563 + }, + { + "epoch": 0.1989568757155578, + "ewc_loss": 0.01811886951327324, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.033907629898749e-05, + "grad_norm": 3.5106217861175537, + "learning_rate": 6.625688851208139e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8443436622619629, + "num_tokens": 59775538.0, + "step": 1564 + }, + { + "epoch": 0.19908408599414834, + "ewc_loss": 0.018114764243364334, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.029803626006469e-05, + "grad_norm": 3.5453743934631348, + "learning_rate": 6.629927935565918e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8418523073196411, + "num_tokens": 59810373.0, + "step": 1565 + }, + { + "epoch": 0.19921129627273884, + "ewc_loss": 0.018098577857017517, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.013617530697957e-05, + "grad_norm": 3.4870645999908447, + "learning_rate": 6.634167019923696e-07, + "loss": 0.52, + "mean_token_accuracy": 0.8396811485290527, + "num_tokens": 59849992.0, + "step": 1566 + }, + { + "epoch": 0.19933850655132934, + "ewc_loss": 0.018044430762529373, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.9594687627395615e-05, + "grad_norm": 3.5521652698516846, + "learning_rate": 6.638406104281476e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8473111391067505, + "num_tokens": 59882552.0, + "step": 1567 + }, + { + "epoch": 0.19946571682991987, + "ewc_loss": 0.018068158999085426, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.983198570902459e-05, + "grad_norm": 3.3971595764160156, + "learning_rate": 6.642645188639253e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8528250455856323, + "num_tokens": 59925090.0, + "step": 1568 + }, + { + "epoch": 0.19959292710851037, + "ewc_loss": 0.01796901226043701, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.884050915483385e-05, + "grad_norm": 3.4514999389648438, + "learning_rate": 6.646884272997032e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.8407567143440247, + "num_tokens": 59965406.0, + "step": 1569 + }, + { + "epoch": 0.19972013738710087, + "ewc_loss": 0.018050789833068848, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.965828677290119e-05, + "grad_norm": 3.477306842803955, + "learning_rate": 6.651123357354811e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8442034125328064, + "num_tokens": 60001770.0, + "step": 1570 + }, + { + "epoch": 0.1998473476656914, + "ewc_loss": 0.018023556098341942, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.938594767940231e-05, + "grad_norm": 3.5040018558502197, + "learning_rate": 6.655362441712589e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8370993137359619, + "num_tokens": 60036911.0, + "step": 1571 + }, + { + "epoch": 0.1999745579442819, + "ewc_loss": 0.018030287697911263, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.9453261201269925e-05, + "grad_norm": 3.37729811668396, + "learning_rate": 6.659601526070369e-07, + "loss": 0.5341, + "mean_token_accuracy": 0.8342330455780029, + "num_tokens": 60080210.0, + "step": 1572 + }, + { + "epoch": 0.2001017682228724, + "ewc_loss": 0.01796228438615799, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.877322473679669e-05, + "grad_norm": 3.5531299114227295, + "learning_rate": 6.663840610428147e-07, + "loss": 0.5673, + "mean_token_accuracy": 0.826240599155426, + "num_tokens": 60111533.0, + "step": 1573 + }, + { + "epoch": 0.20022897850146293, + "ewc_loss": 0.01810438558459282, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 6.0194244724698365e-05, + "grad_norm": 3.408278703689575, + "learning_rate": 6.668079694785926e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8576192855834961, + "num_tokens": 60150733.0, + "step": 1574 + }, + { + "epoch": 0.20035618878005343, + "ewc_loss": 0.0179764311760664, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.891470209462568e-05, + "grad_norm": 3.4541330337524414, + "learning_rate": 6.672318779143704e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8528127670288086, + "num_tokens": 60185034.0, + "step": 1575 + }, + { + "epoch": 0.20048339905864393, + "ewc_loss": 0.01806522347033024, + "ewc_loss_diag": 1.2099742889404297e-05, + "ewc_loss_parallel": 5.980263085803017e-05, + "grad_norm": 3.526762008666992, + "learning_rate": 6.676557863501483e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8431005477905273, + "num_tokens": 60216865.0, + "step": 1576 + }, + { + "epoch": 0.20061060933723446, + "ewc_loss": 0.018126724287867546, + "ewc_loss_diag": 1.2159347534179688e-05, + "ewc_loss_parallel": 5.980728383292444e-05, + "grad_norm": 3.580165386199951, + "learning_rate": 6.680796947859262e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8451125025749207, + "num_tokens": 60257685.0, + "step": 1577 + }, + { + "epoch": 0.20073781961582496, + "ewc_loss": 0.018136680126190186, + "ewc_loss_diag": 1.2159347534179688e-05, + "ewc_loss_parallel": 5.9906848036916927e-05, + "grad_norm": 3.448164224624634, + "learning_rate": 6.685036032217041e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8477364778518677, + "num_tokens": 60294258.0, + "step": 1578 + }, + { + "epoch": 0.20086502989441546, + "ewc_loss": 0.018058665096759796, + "ewc_loss_diag": 1.2159347534179688e-05, + "ewc_loss_parallel": 5.912669439567253e-05, + "grad_norm": 3.3912603855133057, + "learning_rate": 6.689275116574819e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8533843755722046, + "num_tokens": 60330319.0, + "step": 1579 + }, + { + "epoch": 0.200992240173006, + "ewc_loss": 0.018096808344125748, + "ewc_loss_diag": 1.2159347534179688e-05, + "ewc_loss_parallel": 5.950812919763848e-05, + "grad_norm": 3.400970220565796, + "learning_rate": 6.693514200932599e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8471338152885437, + "num_tokens": 60370482.0, + "step": 1580 + }, + { + "epoch": 0.2011194504515965, + "ewc_loss": 0.018124572932720184, + "ewc_loss_diag": 1.2159347534179688e-05, + "ewc_loss_parallel": 5.9785761550301686e-05, + "grad_norm": 3.3903892040252686, + "learning_rate": 6.697753285290377e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.8434910774230957, + "num_tokens": 60412436.0, + "step": 1581 + }, + { + "epoch": 0.201246660730187, + "ewc_loss": 0.01818060129880905, + "ewc_loss_diag": 1.2218952178955078e-05, + "ewc_loss_parallel": 5.973570659989491e-05, + "grad_norm": 3.4591264724731445, + "learning_rate": 6.701992369648156e-07, + "loss": 0.4158, + "mean_token_accuracy": 0.8674668073654175, + "num_tokens": 60448982.0, + "step": 1582 + }, + { + "epoch": 0.20137387100877752, + "ewc_loss": 0.01822776347398758, + "ewc_loss_diag": 1.2218952178955078e-05, + "ewc_loss_parallel": 6.020731234457344e-05, + "grad_norm": 3.356792688369751, + "learning_rate": 6.706231454005934e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.8598822355270386, + "num_tokens": 60489878.0, + "step": 1583 + }, + { + "epoch": 0.20150108128736802, + "ewc_loss": 0.018163301050662994, + "ewc_loss_diag": 1.2218952178955078e-05, + "ewc_loss_parallel": 5.956269524176605e-05, + "grad_norm": 3.418600082397461, + "learning_rate": 6.710470538363713e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8533704280853271, + "num_tokens": 60528979.0, + "step": 1584 + }, + { + "epoch": 0.20162829156595852, + "ewc_loss": 0.018245672807097435, + "ewc_loss_diag": 1.2218952178955078e-05, + "ewc_loss_parallel": 6.038641004124656e-05, + "grad_norm": 3.519502878189087, + "learning_rate": 6.714709622721492e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8419629335403442, + "num_tokens": 60561670.0, + "step": 1585 + }, + { + "epoch": 0.20175550184454905, + "ewc_loss": 0.0183310154825449, + "ewc_loss_diag": 1.2278556823730469e-05, + "ewc_loss_parallel": 6.0629496147157624e-05, + "grad_norm": 3.4012815952301025, + "learning_rate": 6.718948707079271e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.8498332500457764, + "num_tokens": 60602061.0, + "step": 1586 + }, + { + "epoch": 0.20188271212313955, + "ewc_loss": 0.01824967935681343, + "ewc_loss_diag": 1.2278556823730469e-05, + "ewc_loss_parallel": 5.981613139738329e-05, + "grad_norm": 3.3898448944091797, + "learning_rate": 6.723187791437049e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8493237495422363, + "num_tokens": 60639805.0, + "step": 1587 + }, + { + "epoch": 0.20200992240173005, + "ewc_loss": 0.018361391499638557, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.032290548318997e-05, + "grad_norm": 3.4144351482391357, + "learning_rate": 6.727426875794829e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.8418795466423035, + "num_tokens": 60681816.0, + "step": 1588 + }, + { + "epoch": 0.20213713268032057, + "ewc_loss": 0.018366998061537743, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.0378966736607254e-05, + "grad_norm": 3.4008498191833496, + "learning_rate": 6.731665960152607e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8434242010116577, + "num_tokens": 60726343.0, + "step": 1589 + }, + { + "epoch": 0.20226434295891108, + "ewc_loss": 0.018360747024416924, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.031644807080738e-05, + "grad_norm": 3.480764865875244, + "learning_rate": 6.735905044510385e-07, + "loss": 0.5296, + "mean_token_accuracy": 0.8266669511795044, + "num_tokens": 60764814.0, + "step": 1590 + }, + { + "epoch": 0.2023915532375016, + "ewc_loss": 0.018414365127682686, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.08526352152694e-05, + "grad_norm": 3.4782347679138184, + "learning_rate": 6.740144128868164e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8424015045166016, + "num_tokens": 60802384.0, + "step": 1591 + }, + { + "epoch": 0.2025187635160921, + "ewc_loss": 0.018361426889896393, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.032325836713426e-05, + "grad_norm": 3.418470621109009, + "learning_rate": 6.744383213225942e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8340935707092285, + "num_tokens": 60842085.0, + "step": 1592 + }, + { + "epoch": 0.2026459737946826, + "ewc_loss": 0.018362056463956833, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.0329541156534106e-05, + "grad_norm": 3.481290102005005, + "learning_rate": 6.748622297583722e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8495786190032959, + "num_tokens": 60874835.0, + "step": 1593 + }, + { + "epoch": 0.20277318407327313, + "ewc_loss": 0.018409285694360733, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.080185266910121e-05, + "grad_norm": 3.48494029045105, + "learning_rate": 6.7528613819415e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8441998362541199, + "num_tokens": 60910682.0, + "step": 1594 + }, + { + "epoch": 0.20290039435186363, + "ewc_loss": 0.01837782748043537, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.0487254813779145e-05, + "grad_norm": 3.404146671295166, + "learning_rate": 6.757100466299279e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8487660884857178, + "num_tokens": 60951352.0, + "step": 1595 + }, + { + "epoch": 0.20302760463045413, + "ewc_loss": 0.01836632750928402, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.0372261941665784e-05, + "grad_norm": 3.6569018363952637, + "learning_rate": 6.761339550657058e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8373212814331055, + "num_tokens": 60979055.0, + "step": 1596 + }, + { + "epoch": 0.20315481490904466, + "ewc_loss": 0.018506355583667755, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.177254545036703e-05, + "grad_norm": 3.5247058868408203, + "learning_rate": 6.765578635014837e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8471366167068481, + "num_tokens": 61017437.0, + "step": 1597 + }, + { + "epoch": 0.20328202518763516, + "ewc_loss": 0.018364617601037025, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.0355159803293645e-05, + "grad_norm": 3.4619405269622803, + "learning_rate": 6.769817719372614e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8521677255630493, + "num_tokens": 61052286.0, + "step": 1598 + }, + { + "epoch": 0.20340923546622566, + "ewc_loss": 0.0183870792388916, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.057978680473752e-05, + "grad_norm": 3.5026726722717285, + "learning_rate": 6.774056803730394e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8380661010742188, + "num_tokens": 61086110.0, + "step": 1599 + }, + { + "epoch": 0.2035364457448162, + "ewc_loss": 0.018427308648824692, + "ewc_loss_diag": 1.233816146850586e-05, + "ewc_loss_parallel": 6.098207813920453e-05, + "grad_norm": 3.3952226638793945, + "learning_rate": 6.778295888088172e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8476755619049072, + "num_tokens": 61124181.0, + "step": 1600 + }, + { + "epoch": 0.2036636560234067, + "ewc_loss": 0.01844225823879242, + "ewc_loss_diag": 1.239776611328125e-05, + "ewc_loss_parallel": 6.0521204432006925e-05, + "grad_norm": 3.5184295177459717, + "learning_rate": 6.782534972445952e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8401135802268982, + "num_tokens": 61160881.0, + "step": 1601 + }, + { + "epoch": 0.2037908663019972, + "ewc_loss": 0.018529310822486877, + "ewc_loss_diag": 1.239776611328125e-05, + "ewc_loss_parallel": 6.13917363807559e-05, + "grad_norm": 3.4560317993164062, + "learning_rate": 6.78677405680373e-07, + "loss": 0.498, + "mean_token_accuracy": 0.8448668122291565, + "num_tokens": 61200755.0, + "step": 1602 + }, + { + "epoch": 0.20391807658058772, + "ewc_loss": 0.018474385142326355, + "ewc_loss_diag": 1.239776611328125e-05, + "ewc_loss_parallel": 6.084248889237642e-05, + "grad_norm": 3.460160255432129, + "learning_rate": 6.791013141161509e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8418406248092651, + "num_tokens": 61240045.0, + "step": 1603 + }, + { + "epoch": 0.20404528685917822, + "ewc_loss": 0.01849144697189331, + "ewc_loss_diag": 1.239776611328125e-05, + "ewc_loss_parallel": 6.1013110098429024e-05, + "grad_norm": 3.4153614044189453, + "learning_rate": 6.795252225519288e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8348808288574219, + "num_tokens": 61282832.0, + "step": 1604 + }, + { + "epoch": 0.20417249713776872, + "ewc_loss": 0.018473628908395767, + "ewc_loss_diag": 1.239776611328125e-05, + "ewc_loss_parallel": 6.0834929172415286e-05, + "grad_norm": 3.4266974925994873, + "learning_rate": 6.799491309877067e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.8423227667808533, + "num_tokens": 61319841.0, + "step": 1605 + }, + { + "epoch": 0.20429970741635925, + "ewc_loss": 0.018571263179183006, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.120090984040871e-05, + "grad_norm": 3.546419143676758, + "learning_rate": 6.803730394234844e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8553497195243835, + "num_tokens": 61352732.0, + "step": 1606 + }, + { + "epoch": 0.20442691769494975, + "ewc_loss": 0.018614264205098152, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.163092621136457e-05, + "grad_norm": 3.495643138885498, + "learning_rate": 6.807969478592624e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8465703725814819, + "num_tokens": 61388324.0, + "step": 1607 + }, + { + "epoch": 0.20455412797354025, + "ewc_loss": 0.018563298508524895, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.112125993240625e-05, + "grad_norm": 3.3775415420532227, + "learning_rate": 6.812208562950402e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8431427478790283, + "num_tokens": 61431222.0, + "step": 1608 + }, + { + "epoch": 0.20468133825213078, + "ewc_loss": 0.018548237159848213, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.097065124777146e-05, + "grad_norm": 3.400296688079834, + "learning_rate": 6.816447647308182e-07, + "loss": 0.5425, + "mean_token_accuracy": 0.8285284638404846, + "num_tokens": 61475617.0, + "step": 1609 + }, + { + "epoch": 0.20480854853072128, + "ewc_loss": 0.01861298829317093, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.161816418170929e-05, + "grad_norm": 3.4787437915802, + "learning_rate": 6.82068673166596e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8515163660049438, + "num_tokens": 61511338.0, + "step": 1610 + }, + { + "epoch": 0.20493575880931178, + "ewc_loss": 0.018632028251886368, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.180855416459963e-05, + "grad_norm": 3.474595785140991, + "learning_rate": 6.824925816023738e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.8420838117599487, + "num_tokens": 61549093.0, + "step": 1611 + }, + { + "epoch": 0.2050629690879023, + "ewc_loss": 0.018602516502141953, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.151344132376835e-05, + "grad_norm": 3.3740200996398926, + "learning_rate": 6.829164900381518e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.85301274061203, + "num_tokens": 61593843.0, + "step": 1612 + }, + { + "epoch": 0.2051901793664928, + "ewc_loss": 0.018577061593532562, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.125890649855137e-05, + "grad_norm": 3.4761924743652344, + "learning_rate": 6.833403984739295e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8484454154968262, + "num_tokens": 61629362.0, + "step": 1613 + }, + { + "epoch": 0.2053173896450833, + "ewc_loss": 0.018669500946998596, + "ewc_loss_diag": 1.245737075805664e-05, + "ewc_loss_parallel": 6.21832805336453e-05, + "grad_norm": 3.5199031829833984, + "learning_rate": 6.837643069097074e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.826160192489624, + "num_tokens": 61667037.0, + "step": 1614 + }, + { + "epoch": 0.20544459992367384, + "ewc_loss": 0.01883677765727043, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.202500662766397e-05, + "grad_norm": 3.4585108757019043, + "learning_rate": 6.841882153454853e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.847959041595459, + "num_tokens": 61701976.0, + "step": 1615 + }, + { + "epoch": 0.20557181020226434, + "ewc_loss": 0.018816249445080757, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.18197227595374e-05, + "grad_norm": 3.4544565677642822, + "learning_rate": 6.846121237812632e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.860630452632904, + "num_tokens": 61739068.0, + "step": 1616 + }, + { + "epoch": 0.20569902048085487, + "ewc_loss": 0.01883547753095627, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.201201176736504e-05, + "grad_norm": 3.424431800842285, + "learning_rate": 6.850360322170411e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8546008467674255, + "num_tokens": 61781515.0, + "step": 1617 + }, + { + "epoch": 0.20582623075944537, + "ewc_loss": 0.01882573403418064, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.19145721429959e-05, + "grad_norm": 3.5972797870635986, + "learning_rate": 6.85459940652819e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.848241925239563, + "num_tokens": 61815201.0, + "step": 1618 + }, + { + "epoch": 0.20595344103803587, + "ewc_loss": 0.01890263706445694, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.268360448302701e-05, + "grad_norm": 3.4625244140625, + "learning_rate": 6.858838490885968e-07, + "loss": 0.5254, + "mean_token_accuracy": 0.8333112001419067, + "num_tokens": 61856639.0, + "step": 1619 + }, + { + "epoch": 0.2060806513166264, + "ewc_loss": 0.018797650933265686, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.163373473100364e-05, + "grad_norm": 3.4837515354156494, + "learning_rate": 6.863077575243748e-07, + "loss": 0.5364, + "mean_token_accuracy": 0.8323863744735718, + "num_tokens": 61894313.0, + "step": 1620 + }, + { + "epoch": 0.2062078615952169, + "ewc_loss": 0.018859077244997025, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.224800745258108e-05, + "grad_norm": 3.521515130996704, + "learning_rate": 6.867316659601525e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8474607467651367, + "num_tokens": 61930748.0, + "step": 1621 + }, + { + "epoch": 0.2063350718738074, + "ewc_loss": 0.018871009349822998, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.236731860553846e-05, + "grad_norm": 3.4284229278564453, + "learning_rate": 6.871555743959304e-07, + "loss": 0.489, + "mean_token_accuracy": 0.845807671546936, + "num_tokens": 61972002.0, + "step": 1622 + }, + { + "epoch": 0.20646228215239792, + "ewc_loss": 0.018832692876458168, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.198415940161794e-05, + "grad_norm": 3.4518370628356934, + "learning_rate": 6.875794828317083e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8542543649673462, + "num_tokens": 62010141.0, + "step": 1623 + }, + { + "epoch": 0.20658949243098843, + "ewc_loss": 0.018880346789956093, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.246069824555889e-05, + "grad_norm": 3.412626266479492, + "learning_rate": 6.880033912674862e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8571426868438721, + "num_tokens": 62055907.0, + "step": 1624 + }, + { + "epoch": 0.20671670270957893, + "ewc_loss": 0.018859799951314926, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.225521792657673e-05, + "grad_norm": 3.4842734336853027, + "learning_rate": 6.884272997032641e-07, + "loss": 0.5581, + "mean_token_accuracy": 0.8235269784927368, + "num_tokens": 62095885.0, + "step": 1625 + }, + { + "epoch": 0.20684391298816945, + "ewc_loss": 0.018912428990006447, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.278150976868346e-05, + "grad_norm": 3.467721700668335, + "learning_rate": 6.88851208139042e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8596742153167725, + "num_tokens": 62130497.0, + "step": 1626 + }, + { + "epoch": 0.20697112326675995, + "ewc_loss": 0.01887696608901024, + "ewc_loss_diag": 1.2636184692382812e-05, + "ewc_loss_parallel": 6.242687959456816e-05, + "grad_norm": 3.4801292419433594, + "learning_rate": 6.892751165748198e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8371934294700623, + "num_tokens": 62170296.0, + "step": 1627 + }, + { + "epoch": 0.20709833354535045, + "ewc_loss": 0.018962973728775978, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.267661228775978e-05, + "grad_norm": 3.4521851539611816, + "learning_rate": 6.896990250105978e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.859905481338501, + "num_tokens": 62210168.0, + "step": 1628 + }, + { + "epoch": 0.20722554382394098, + "ewc_loss": 0.018929999321699142, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.23468731646426e-05, + "grad_norm": 3.5151102542877197, + "learning_rate": 6.901229334463755e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8481024503707886, + "num_tokens": 62245251.0, + "step": 1629 + }, + { + "epoch": 0.20735275410253148, + "ewc_loss": 0.018982568755745888, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.287256110226735e-05, + "grad_norm": 3.468136787414551, + "learning_rate": 6.905468418821534e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8550862073898315, + "num_tokens": 62286124.0, + "step": 1630 + }, + { + "epoch": 0.20747996438112198, + "ewc_loss": 0.018944459035992622, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.249146827030927e-05, + "grad_norm": 3.48203444480896, + "learning_rate": 6.909707503179313e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8459575176239014, + "num_tokens": 62325604.0, + "step": 1631 + }, + { + "epoch": 0.2076071746597125, + "ewc_loss": 0.018961451947689056, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.26613837084733e-05, + "grad_norm": 3.4850993156433105, + "learning_rate": 6.913946587537091e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8497668504714966, + "num_tokens": 62364467.0, + "step": 1632 + }, + { + "epoch": 0.207734384938303, + "ewc_loss": 0.018950801342725754, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.25548927928321e-05, + "grad_norm": 3.511404514312744, + "learning_rate": 6.918185671894871e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8278542757034302, + "num_tokens": 62402075.0, + "step": 1633 + }, + { + "epoch": 0.2078615952168935, + "ewc_loss": 0.018959159031510353, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.263847171794623e-05, + "grad_norm": 3.495532274246216, + "learning_rate": 6.922424756252649e-07, + "loss": 0.5142, + "mean_token_accuracy": 0.8451757431030273, + "num_tokens": 62437802.0, + "step": 1634 + }, + { + "epoch": 0.20798880549548404, + "ewc_loss": 0.018957054242491722, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.261741509661078e-05, + "grad_norm": 3.4509730339050293, + "learning_rate": 6.926663840610428e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.8607479333877563, + "num_tokens": 62476176.0, + "step": 1635 + }, + { + "epoch": 0.20811601577407454, + "ewc_loss": 0.018959198147058487, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.263885006774217e-05, + "grad_norm": 3.5418410301208496, + "learning_rate": 6.930902924968206e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.845081627368927, + "num_tokens": 62514378.0, + "step": 1636 + }, + { + "epoch": 0.20824322605266504, + "ewc_loss": 0.01898352801799774, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.288214353844523e-05, + "grad_norm": 3.453814744949341, + "learning_rate": 6.935142009325985e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.841480016708374, + "num_tokens": 62555302.0, + "step": 1637 + }, + { + "epoch": 0.20837043633125557, + "ewc_loss": 0.018921256065368652, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.225944525795057e-05, + "grad_norm": 3.4814541339874268, + "learning_rate": 6.939381093683764e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.8363490104675293, + "num_tokens": 62594356.0, + "step": 1638 + }, + { + "epoch": 0.20849764660984607, + "ewc_loss": 0.018980054184794426, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.284741539275274e-05, + "grad_norm": 3.471311569213867, + "learning_rate": 6.943620178041543e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.862409770488739, + "num_tokens": 62632903.0, + "step": 1639 + }, + { + "epoch": 0.2086248568884366, + "ewc_loss": 0.01894298940896988, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.247677083592862e-05, + "grad_norm": 3.52219820022583, + "learning_rate": 6.947859262399321e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8525736927986145, + "num_tokens": 62668979.0, + "step": 1640 + }, + { + "epoch": 0.2087520671670271, + "ewc_loss": 0.01899651065468788, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.301197572611272e-05, + "grad_norm": 3.5274884700775146, + "learning_rate": 6.952098346757101e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8490598797798157, + "num_tokens": 62705867.0, + "step": 1641 + }, + { + "epoch": 0.2088792774456176, + "ewc_loss": 0.01896503008902073, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.269717414397746e-05, + "grad_norm": 3.4828314781188965, + "learning_rate": 6.956337431114879e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8529744148254395, + "num_tokens": 62744559.0, + "step": 1642 + }, + { + "epoch": 0.20900648772420813, + "ewc_loss": 0.019090745598077774, + "ewc_loss_diag": 1.2814998626708984e-05, + "ewc_loss_parallel": 6.273361941566691e-05, + "grad_norm": 3.4490954875946045, + "learning_rate": 6.960576515472658e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.8426870703697205, + "num_tokens": 62785221.0, + "step": 1643 + }, + { + "epoch": 0.20913369800279863, + "ewc_loss": 0.018983282148838043, + "ewc_loss_diag": 1.2695789337158203e-05, + "ewc_loss_parallel": 6.287970609264448e-05, + "grad_norm": 3.584867000579834, + "learning_rate": 6.964815599830436e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8462023735046387, + "num_tokens": 62820095.0, + "step": 1644 + }, + { + "epoch": 0.20926090828138913, + "ewc_loss": 0.019108828157186508, + "ewc_loss_diag": 1.2755393981933594e-05, + "ewc_loss_parallel": 6.352481432259083e-05, + "grad_norm": 3.4499852657318115, + "learning_rate": 6.969054684188215e-07, + "loss": 0.5374, + "mean_token_accuracy": 0.8269051313400269, + "num_tokens": 62863414.0, + "step": 1645 + }, + { + "epoch": 0.20938811855997966, + "ewc_loss": 0.019015036523342133, + "ewc_loss_diag": 1.2755393981933594e-05, + "ewc_loss_parallel": 6.258689245441929e-05, + "grad_norm": 3.4712135791778564, + "learning_rate": 6.973293768545994e-07, + "loss": 0.448, + "mean_token_accuracy": 0.8569891452789307, + "num_tokens": 62904647.0, + "step": 1646 + }, + { + "epoch": 0.20951532883857016, + "ewc_loss": 0.019086439162492752, + "ewc_loss_diag": 1.2755393981933594e-05, + "ewc_loss_parallel": 6.330091127892956e-05, + "grad_norm": 3.41279935836792, + "learning_rate": 6.977532852903773e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8460571765899658, + "num_tokens": 62949476.0, + "step": 1647 + }, + { + "epoch": 0.20964253911716066, + "ewc_loss": 0.019118893891572952, + "ewc_loss_diag": 1.2814998626708984e-05, + "ewc_loss_parallel": 6.301511166384444e-05, + "grad_norm": 3.5698459148406982, + "learning_rate": 6.981771937261551e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.8331397771835327, + "num_tokens": 62984784.0, + "step": 1648 + }, + { + "epoch": 0.2097697493957512, + "ewc_loss": 0.01920139789581299, + "ewc_loss_diag": 1.2814998626708984e-05, + "ewc_loss_parallel": 6.384014704963192e-05, + "grad_norm": 3.4886598587036133, + "learning_rate": 6.986011021619331e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8455312252044678, + "num_tokens": 63024395.0, + "step": 1649 + }, + { + "epoch": 0.2098969596743417, + "ewc_loss": 0.019127758219838142, + "ewc_loss_diag": 1.2814998626708984e-05, + "ewc_loss_parallel": 6.310374737950042e-05, + "grad_norm": 3.527675151824951, + "learning_rate": 6.990250105977109e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.8420732021331787, + "num_tokens": 63061886.0, + "step": 1650 + }, + { + "epoch": 0.2100241699529322, + "ewc_loss": 0.019187983125448227, + "ewc_loss_diag": 1.2814998626708984e-05, + "ewc_loss_parallel": 6.370600021909922e-05, + "grad_norm": 3.4800398349761963, + "learning_rate": 6.994489190334886e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8481428623199463, + "num_tokens": 63104150.0, + "step": 1651 + }, + { + "epoch": 0.21015138023152272, + "ewc_loss": 0.01920958422124386, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.331165786832571e-05, + "grad_norm": 3.457965135574341, + "learning_rate": 6.998728274692666e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.8609964847564697, + "num_tokens": 63144346.0, + "step": 1652 + }, + { + "epoch": 0.21027859051011322, + "ewc_loss": 0.019217589870095253, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.339171522995457e-05, + "grad_norm": 3.5301380157470703, + "learning_rate": 7.002967359050444e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.854094386100769, + "num_tokens": 63180042.0, + "step": 1653 + }, + { + "epoch": 0.21040580078870372, + "ewc_loss": 0.019248904660344124, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.370486516971141e-05, + "grad_norm": 3.503560781478882, + "learning_rate": 7.007206443408224e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8613061904907227, + "num_tokens": 63216570.0, + "step": 1654 + }, + { + "epoch": 0.21053301106729425, + "ewc_loss": 0.01922392100095749, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.345502333715558e-05, + "grad_norm": 3.4899344444274902, + "learning_rate": 7.011445527766002e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8475061655044556, + "num_tokens": 63253457.0, + "step": 1655 + }, + { + "epoch": 0.21066022134588475, + "ewc_loss": 0.019227581098675728, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.349162867991254e-05, + "grad_norm": 3.4349589347839355, + "learning_rate": 7.015684612123781e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8501520156860352, + "num_tokens": 63295430.0, + "step": 1656 + }, + { + "epoch": 0.21078743162447525, + "ewc_loss": 0.019221102818846703, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.342684355331585e-05, + "grad_norm": 3.4886317253112793, + "learning_rate": 7.01992369648156e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8466430306434631, + "num_tokens": 63340118.0, + "step": 1657 + }, + { + "epoch": 0.21091464190306577, + "ewc_loss": 0.019271479919552803, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.39306235825643e-05, + "grad_norm": 3.6285018920898438, + "learning_rate": 7.024162780839339e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8279483318328857, + "num_tokens": 63372037.0, + "step": 1658 + }, + { + "epoch": 0.21104185218165628, + "ewc_loss": 0.019302889704704285, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.424471212085336e-05, + "grad_norm": 3.4544436931610107, + "learning_rate": 7.028401865197116e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8376020193099976, + "num_tokens": 63415508.0, + "step": 1659 + }, + { + "epoch": 0.21116906246024678, + "ewc_loss": 0.01921040564775467, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.331987970042974e-05, + "grad_norm": 3.449439764022827, + "learning_rate": 7.032640949554896e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8618674874305725, + "num_tokens": 63459320.0, + "step": 1660 + }, + { + "epoch": 0.2112962727388373, + "ewc_loss": 0.019278360530734062, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.399943231372163e-05, + "grad_norm": 3.5054848194122314, + "learning_rate": 7.036880033912674e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8378544449806213, + "num_tokens": 63499624.0, + "step": 1661 + }, + { + "epoch": 0.2114234830174278, + "ewc_loss": 0.019293949007987976, + "ewc_loss_diag": 1.2874603271484375e-05, + "ewc_loss_parallel": 6.415531970560551e-05, + "grad_norm": 3.5738375186920166, + "learning_rate": 7.041119118270454e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8485245108604431, + "num_tokens": 63536507.0, + "step": 1662 + }, + { + "epoch": 0.2115506932960183, + "ewc_loss": 0.019364695996046066, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.425243918783963e-05, + "grad_norm": 3.5285253524780273, + "learning_rate": 7.045358202628232e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8395532369613647, + "num_tokens": 63576483.0, + "step": 1663 + }, + { + "epoch": 0.21167790357460883, + "ewc_loss": 0.01934041455388069, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.400961137842387e-05, + "grad_norm": 3.5178639888763428, + "learning_rate": 7.049597286986011e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8387787938117981, + "num_tokens": 63615568.0, + "step": 1664 + }, + { + "epoch": 0.21180511385319933, + "ewc_loss": 0.019339054822921753, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.399601988960057e-05, + "grad_norm": 3.5323781967163086, + "learning_rate": 7.05383637134379e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8523841500282288, + "num_tokens": 63650023.0, + "step": 1665 + }, + { + "epoch": 0.21193232413178986, + "ewc_loss": 0.019364483654499054, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.425030005630106e-05, + "grad_norm": 3.507434606552124, + "learning_rate": 7.058075455701568e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.837725818157196, + "num_tokens": 63690229.0, + "step": 1666 + }, + { + "epoch": 0.21205953441038036, + "ewc_loss": 0.019345827400684357, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.406375177903101e-05, + "grad_norm": 3.5074429512023926, + "learning_rate": 7.062314540059346e-07, + "loss": 0.441, + "mean_token_accuracy": 0.861870288848877, + "num_tokens": 63728169.0, + "step": 1667 + }, + { + "epoch": 0.21218674468897086, + "ewc_loss": 0.019354771822690964, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.415319512598217e-05, + "grad_norm": 3.576996326446533, + "learning_rate": 7.066553624417126e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8504313826560974, + "num_tokens": 63762816.0, + "step": 1668 + }, + { + "epoch": 0.2123139549675614, + "ewc_loss": 0.0193976778537035, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.458224379457533e-05, + "grad_norm": 3.5342743396759033, + "learning_rate": 7.070792708774904e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8450689315795898, + "num_tokens": 63804288.0, + "step": 1669 + }, + { + "epoch": 0.2124411652461519, + "ewc_loss": 0.019367769360542297, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.428316555684432e-05, + "grad_norm": 3.4552791118621826, + "learning_rate": 7.075031793132684e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8430818319320679, + "num_tokens": 63851365.0, + "step": 1670 + }, + { + "epoch": 0.2125683755247424, + "ewc_loss": 0.01936219446361065, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.422740989364684e-05, + "grad_norm": 3.558671236038208, + "learning_rate": 7.079270877490462e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.8523361086845398, + "num_tokens": 63886187.0, + "step": 1671 + }, + { + "epoch": 0.21269558580333292, + "ewc_loss": 0.01941669173538685, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.477238639490679e-05, + "grad_norm": 3.504488706588745, + "learning_rate": 7.08350996184824e-07, + "loss": 0.5421, + "mean_token_accuracy": 0.8293116688728333, + "num_tokens": 63927187.0, + "step": 1672 + }, + { + "epoch": 0.21282279608192342, + "ewc_loss": 0.01937246508896351, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.433011731132865e-05, + "grad_norm": 3.5273563861846924, + "learning_rate": 7.08774904620602e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8558319807052612, + "num_tokens": 63964088.0, + "step": 1673 + }, + { + "epoch": 0.21295000636051392, + "ewc_loss": 0.019465366378426552, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.464878242695704e-05, + "grad_norm": 3.5386576652526855, + "learning_rate": 7.091988130563797e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.838897705078125, + "num_tokens": 64004931.0, + "step": 1674 + }, + { + "epoch": 0.21307721663910445, + "ewc_loss": 0.019418224692344666, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.478772411355749e-05, + "grad_norm": 3.5130374431610107, + "learning_rate": 7.096227214921576e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8398473858833313, + "num_tokens": 64043966.0, + "step": 1675 + }, + { + "epoch": 0.21320442691769495, + "ewc_loss": 0.01938444934785366, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.444996688514948e-05, + "grad_norm": 3.5261056423187256, + "learning_rate": 7.100466299279355e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8484294414520264, + "num_tokens": 64078886.0, + "step": 1676 + }, + { + "epoch": 0.21333163719628545, + "ewc_loss": 0.019427141174674034, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.487688369816169e-05, + "grad_norm": 3.569209337234497, + "learning_rate": 7.104705383637134e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8447659015655518, + "num_tokens": 64117291.0, + "step": 1677 + }, + { + "epoch": 0.21345884747487598, + "ewc_loss": 0.019426658749580383, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.487205973826349e-05, + "grad_norm": 3.479863166809082, + "learning_rate": 7.108944467994913e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8514292240142822, + "num_tokens": 64156369.0, + "step": 1678 + }, + { + "epoch": 0.21358605775346648, + "ewc_loss": 0.019403688609600067, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.464235775638372e-05, + "grad_norm": 3.598353385925293, + "learning_rate": 7.113183552352692e-07, + "loss": 0.538, + "mean_token_accuracy": 0.8268482685089111, + "num_tokens": 64193646.0, + "step": 1679 + }, + { + "epoch": 0.21371326803205698, + "ewc_loss": 0.019476689398288727, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.537236185977235e-05, + "grad_norm": 3.4884133338928223, + "learning_rate": 7.11742263671047e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8612686395645142, + "num_tokens": 64231933.0, + "step": 1680 + }, + { + "epoch": 0.2138404783106475, + "ewc_loss": 0.019390394911170006, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.450941873481497e-05, + "grad_norm": 3.6303300857543945, + "learning_rate": 7.12166172106825e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8209654092788696, + "num_tokens": 64267418.0, + "step": 1681 + }, + { + "epoch": 0.213967688589238, + "ewc_loss": 0.019512910395860672, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.573458085767925e-05, + "grad_norm": 3.566293239593506, + "learning_rate": 7.125900805426027e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8576483130455017, + "num_tokens": 64302440.0, + "step": 1682 + }, + { + "epoch": 0.2140948988678285, + "ewc_loss": 0.01942296326160431, + "ewc_loss_diag": 1.2934207916259766e-05, + "ewc_loss_parallel": 6.483509059762582e-05, + "grad_norm": 3.540933132171631, + "learning_rate": 7.130139889783806e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8469177484512329, + "num_tokens": 64338240.0, + "step": 1683 + }, + { + "epoch": 0.21422210914641904, + "ewc_loss": 0.01951911300420761, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.518625741591677e-05, + "grad_norm": 3.5343363285064697, + "learning_rate": 7.134378974141585e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8480057716369629, + "num_tokens": 64375734.0, + "step": 1684 + }, + { + "epoch": 0.21434931942500954, + "ewc_loss": 0.01951821893453598, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.517731526400894e-05, + "grad_norm": 3.603606700897217, + "learning_rate": 7.138618058499364e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8512464165687561, + "num_tokens": 64411401.0, + "step": 1685 + }, + { + "epoch": 0.21447652970360004, + "ewc_loss": 0.019555965438485146, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.555477011715993e-05, + "grad_norm": 3.4884250164031982, + "learning_rate": 7.142857142857143e-07, + "loss": 0.4427, + "mean_token_accuracy": 0.8597599864006042, + "num_tokens": 64454719.0, + "step": 1686 + }, + { + "epoch": 0.21460373998219057, + "ewc_loss": 0.01948963850736618, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.489150109700859e-05, + "grad_norm": 3.5150723457336426, + "learning_rate": 7.147096227214922e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8525091409683228, + "num_tokens": 64495736.0, + "step": 1687 + }, + { + "epoch": 0.21473095026078107, + "ewc_loss": 0.019563812762498856, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.563323404407129e-05, + "grad_norm": 3.517347574234009, + "learning_rate": 7.1513353115727e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8563222885131836, + "num_tokens": 64532843.0, + "step": 1688 + }, + { + "epoch": 0.21485816053937157, + "ewc_loss": 0.019536981359124184, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.536492583109066e-05, + "grad_norm": 3.588771104812622, + "learning_rate": 7.155574395930479e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8401097655296326, + "num_tokens": 64568950.0, + "step": 1689 + }, + { + "epoch": 0.2149853708179621, + "ewc_loss": 0.019582634791731834, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.582146306755021e-05, + "grad_norm": 3.563685655593872, + "learning_rate": 7.159813480288257e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8466427326202393, + "num_tokens": 64606878.0, + "step": 1690 + }, + { + "epoch": 0.2151125810965526, + "ewc_loss": 0.01956242136657238, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.56193369650282e-05, + "grad_norm": 3.573291063308716, + "learning_rate": 7.164052564646035e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8594316244125366, + "num_tokens": 64643012.0, + "step": 1691 + }, + { + "epoch": 0.21523979137514312, + "ewc_loss": 0.01957247406244278, + "ewc_loss_diag": 1.2993812561035156e-05, + "ewc_loss_parallel": 6.571984704351053e-05, + "grad_norm": 3.544781446456909, + "learning_rate": 7.168291649003815e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8565345406532288, + "num_tokens": 64684444.0, + "step": 1692 + }, + { + "epoch": 0.21536700165373363, + "ewc_loss": 0.019625697284936905, + "ewc_loss_diag": 1.3053417205810547e-05, + "ewc_loss_parallel": 6.564173963852227e-05, + "grad_norm": 3.59258770942688, + "learning_rate": 7.172530733361593e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8401020765304565, + "num_tokens": 64722068.0, + "step": 1693 + }, + { + "epoch": 0.21549421193232413, + "ewc_loss": 0.019785676151514053, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.602082430617884e-05, + "grad_norm": 3.5284159183502197, + "learning_rate": 7.176769817719373e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8710564374923706, + "num_tokens": 64761829.0, + "step": 1694 + }, + { + "epoch": 0.21562142221091465, + "ewc_loss": 0.01972888968884945, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.545296491822228e-05, + "grad_norm": 3.53010892868042, + "learning_rate": 7.181008902077151e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8517854809761047, + "num_tokens": 64799185.0, + "step": 1695 + }, + { + "epoch": 0.21574863248950515, + "ewc_loss": 0.019767500460147858, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.583906360901892e-05, + "grad_norm": 3.565168857574463, + "learning_rate": 7.18524798643493e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8444787859916687, + "num_tokens": 64839306.0, + "step": 1696 + }, + { + "epoch": 0.21587584276809565, + "ewc_loss": 0.019766824319958687, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.583231152035296e-05, + "grad_norm": 3.5372259616851807, + "learning_rate": 7.189487070792708e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.8520687222480774, + "num_tokens": 64876011.0, + "step": 1697 + }, + { + "epoch": 0.21600305304668618, + "ewc_loss": 0.01964217610657215, + "ewc_loss_diag": 1.3053417205810547e-05, + "ewc_loss_parallel": 6.58065255265683e-05, + "grad_norm": 3.5101430416107178, + "learning_rate": 7.193726155150487e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8526013493537903, + "num_tokens": 64918510.0, + "step": 1698 + }, + { + "epoch": 0.21613026332527668, + "ewc_loss": 0.01976814866065979, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.584555376321077e-05, + "grad_norm": 3.549264430999756, + "learning_rate": 7.197965239508265e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8484879732131958, + "num_tokens": 64957156.0, + "step": 1699 + }, + { + "epoch": 0.21625747360386718, + "ewc_loss": 0.01966852694749832, + "ewc_loss_diag": 1.3053417205810547e-05, + "ewc_loss_parallel": 6.607003160752356e-05, + "grad_norm": 3.534627914428711, + "learning_rate": 7.202204323866045e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8504142761230469, + "num_tokens": 64998761.0, + "step": 1700 + }, + { + "epoch": 0.2163846838824577, + "ewc_loss": 0.01976500079035759, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.58140706946142e-05, + "grad_norm": 3.5389649868011475, + "learning_rate": 7.206443408223823e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8467018604278564, + "num_tokens": 65038849.0, + "step": 1701 + }, + { + "epoch": 0.2165118941610482, + "ewc_loss": 0.01966354250907898, + "ewc_loss_diag": 1.3053417205810547e-05, + "ewc_loss_parallel": 6.60201912978664e-05, + "grad_norm": 3.5938093662261963, + "learning_rate": 7.210682492581603e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8514737486839294, + "num_tokens": 65071130.0, + "step": 1702 + }, + { + "epoch": 0.2166391044396387, + "ewc_loss": 0.019809257239103317, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.625663809245452e-05, + "grad_norm": 3.5253775119781494, + "learning_rate": 7.214921576939381e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8377565145492554, + "num_tokens": 65111619.0, + "step": 1703 + }, + { + "epoch": 0.21676631471822924, + "ewc_loss": 0.019763696938753128, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.580102490261197e-05, + "grad_norm": 3.5712900161743164, + "learning_rate": 7.219160661297159e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8554902076721191, + "num_tokens": 65147537.0, + "step": 1704 + }, + { + "epoch": 0.21689352499681974, + "ewc_loss": 0.019831683486700058, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.648089038208127e-05, + "grad_norm": 3.5498619079589844, + "learning_rate": 7.223399745654938e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8462793231010437, + "num_tokens": 65187254.0, + "step": 1705 + }, + { + "epoch": 0.21702073527541024, + "ewc_loss": 0.019791482016444206, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.607887917198241e-05, + "grad_norm": 3.5559921264648438, + "learning_rate": 7.227638830012717e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8394507169723511, + "num_tokens": 65226130.0, + "step": 1706 + }, + { + "epoch": 0.21714794555400077, + "ewc_loss": 0.019825879484415054, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.642285006819293e-05, + "grad_norm": 3.6839230060577393, + "learning_rate": 7.231877914370495e-07, + "loss": 0.4597, + "mean_token_accuracy": 0.8554342985153198, + "num_tokens": 65261282.0, + "step": 1707 + }, + { + "epoch": 0.21727515583259127, + "ewc_loss": 0.019992977380752563, + "ewc_loss_diag": 1.329183578491211e-05, + "ewc_loss_parallel": 6.687312270514667e-05, + "grad_norm": 3.595139265060425, + "learning_rate": 7.236116998728275e-07, + "loss": 0.5113, + "mean_token_accuracy": 0.8371874094009399, + "num_tokens": 65294850.0, + "step": 1708 + }, + { + "epoch": 0.21740236611118177, + "ewc_loss": 0.0197994876652956, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.615893653361127e-05, + "grad_norm": 3.4998209476470947, + "learning_rate": 7.240356083086053e-07, + "loss": 0.4043, + "mean_token_accuracy": 0.8705617189407349, + "num_tokens": 65332323.0, + "step": 1709 + }, + { + "epoch": 0.2175295763897723, + "ewc_loss": 0.019812606275081635, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.629013660131022e-05, + "grad_norm": 3.5252132415771484, + "learning_rate": 7.244595167443833e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8434851169586182, + "num_tokens": 65376033.0, + "step": 1710 + }, + { + "epoch": 0.2176567866683628, + "ewc_loss": 0.019859882071614265, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.676287739537656e-05, + "grad_norm": 3.54677152633667, + "learning_rate": 7.248834251801611e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8546555042266846, + "num_tokens": 65417875.0, + "step": 1711 + }, + { + "epoch": 0.2177839969469533, + "ewc_loss": 0.019861536100506783, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.677942292299122e-05, + "grad_norm": 3.554319143295288, + "learning_rate": 7.253073336159388e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8601254820823669, + "num_tokens": 65456675.0, + "step": 1712 + }, + { + "epoch": 0.21791120722554383, + "ewc_loss": 0.01985473558306694, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.671140727121383e-05, + "grad_norm": 3.657813787460327, + "learning_rate": 7.257312420517168e-07, + "loss": 0.4317, + "mean_token_accuracy": 0.8618214726448059, + "num_tokens": 65487699.0, + "step": 1713 + }, + { + "epoch": 0.21803841750413433, + "ewc_loss": 0.019918426871299744, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.73483227728866e-05, + "grad_norm": 3.581376552581787, + "learning_rate": 7.261551504874946e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8501942157745361, + "num_tokens": 65526143.0, + "step": 1714 + }, + { + "epoch": 0.21816562778272486, + "ewc_loss": 0.019832655787467957, + "ewc_loss_diag": 1.3172626495361328e-05, + "ewc_loss_parallel": 6.649062561336905e-05, + "grad_norm": 3.5390915870666504, + "learning_rate": 7.265790589232725e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8560160398483276, + "num_tokens": 65562969.0, + "step": 1715 + }, + { + "epoch": 0.21829283806131536, + "ewc_loss": 0.01991179957985878, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.667171692242846e-05, + "grad_norm": 3.5363500118255615, + "learning_rate": 7.270029673590504e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.8461861610412598, + "num_tokens": 65605999.0, + "step": 1716 + }, + { + "epoch": 0.21842004833990586, + "ewc_loss": 0.019916769117116928, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.67213971610181e-05, + "grad_norm": 3.600947618484497, + "learning_rate": 7.274268757948283e-07, + "loss": 0.5333, + "mean_token_accuracy": 0.8320447206497192, + "num_tokens": 65644877.0, + "step": 1717 + }, + { + "epoch": 0.2185472586184964, + "ewc_loss": 0.019961398094892502, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.716768257319927e-05, + "grad_norm": 3.605041980743408, + "learning_rate": 7.278507842306062e-07, + "loss": 0.4544, + "mean_token_accuracy": 0.8556264042854309, + "num_tokens": 65678823.0, + "step": 1718 + }, + { + "epoch": 0.2186744688970869, + "ewc_loss": 0.019949980080127716, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.705350097035989e-05, + "grad_norm": 3.6081697940826416, + "learning_rate": 7.282746926663841e-07, + "loss": 0.534, + "mean_token_accuracy": 0.8316315412521362, + "num_tokens": 65717185.0, + "step": 1719 + }, + { + "epoch": 0.2188016791756774, + "ewc_loss": 0.01995682343840599, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.71219386276789e-05, + "grad_norm": 3.593844413757324, + "learning_rate": 7.286986011021618e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.8555500507354736, + "num_tokens": 65754422.0, + "step": 1720 + }, + { + "epoch": 0.21892888945426792, + "ewc_loss": 0.019953671842813492, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.709043373120949e-05, + "grad_norm": 3.5529026985168457, + "learning_rate": 7.291225095379398e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8469761610031128, + "num_tokens": 65799390.0, + "step": 1721 + }, + { + "epoch": 0.21905609973285842, + "ewc_loss": 0.01995622180402279, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.711592868668959e-05, + "grad_norm": 3.582481861114502, + "learning_rate": 7.295464179737176e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8501158952713013, + "num_tokens": 65837566.0, + "step": 1722 + }, + { + "epoch": 0.21918331001144892, + "ewc_loss": 0.019985033199191093, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.740404933225363e-05, + "grad_norm": 3.609464645385742, + "learning_rate": 7.299703264094955e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8513778448104858, + "num_tokens": 65874537.0, + "step": 1723 + }, + { + "epoch": 0.21931052029003945, + "ewc_loss": 0.019974879920482635, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.730252061970532e-05, + "grad_norm": 3.6219418048858643, + "learning_rate": 7.303942348452734e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8437032103538513, + "num_tokens": 65915616.0, + "step": 1724 + }, + { + "epoch": 0.21943773056862995, + "ewc_loss": 0.019992807880043983, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.748178566340357e-05, + "grad_norm": 3.561708450317383, + "learning_rate": 7.308181432810513e-07, + "loss": 0.4428, + "mean_token_accuracy": 0.857498049736023, + "num_tokens": 65954263.0, + "step": 1725 + }, + { + "epoch": 0.21956494084722045, + "ewc_loss": 0.020039144903421402, + "ewc_loss_diag": 1.329183578491211e-05, + "ewc_loss_parallel": 6.733480404363945e-05, + "grad_norm": 3.6962156295776367, + "learning_rate": 7.312420517168292e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8476643562316895, + "num_tokens": 65992880.0, + "step": 1726 + }, + { + "epoch": 0.21969215112581097, + "ewc_loss": 0.02003626339137554, + "ewc_loss_diag": 1.3232231140136719e-05, + "ewc_loss_parallel": 6.791634950786829e-05, + "grad_norm": 3.6006271839141846, + "learning_rate": 7.31665960152607e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8403806090354919, + "num_tokens": 66033958.0, + "step": 1727 + }, + { + "epoch": 0.21981936140440148, + "ewc_loss": 0.020010270178318024, + "ewc_loss_diag": 1.329183578491211e-05, + "ewc_loss_parallel": 6.704607221763581e-05, + "grad_norm": 3.6596386432647705, + "learning_rate": 7.320898685883848e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8509344458580017, + "num_tokens": 66072448.0, + "step": 1728 + }, + { + "epoch": 0.21994657168299198, + "ewc_loss": 0.020098164677619934, + "ewc_loss_diag": 1.329183578491211e-05, + "ewc_loss_parallel": 6.792500062147155e-05, + "grad_norm": 3.6461222171783447, + "learning_rate": 7.325137770241628e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8206346035003662, + "num_tokens": 66112648.0, + "step": 1729 + }, + { + "epoch": 0.2200737819615825, + "ewc_loss": 0.020036179572343826, + "ewc_loss_diag": 1.329183578491211e-05, + "ewc_loss_parallel": 6.730515451636165e-05, + "grad_norm": 3.5333845615386963, + "learning_rate": 7.329376854599406e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8463817834854126, + "num_tokens": 66154791.0, + "step": 1730 + }, + { + "epoch": 0.220200992240173, + "ewc_loss": 0.020018313080072403, + "ewc_loss_diag": 1.329183578491211e-05, + "ewc_loss_parallel": 6.712648610118777e-05, + "grad_norm": 3.6436572074890137, + "learning_rate": 7.333615938957184e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8492288589477539, + "num_tokens": 66192450.0, + "step": 1731 + }, + { + "epoch": 0.2203282025187635, + "ewc_loss": 0.020098600536584854, + "ewc_loss_diag": 1.329183578491211e-05, + "ewc_loss_parallel": 6.792936619604006e-05, + "grad_norm": 3.5977869033813477, + "learning_rate": 7.337855023314964e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8497344255447388, + "num_tokens": 66231173.0, + "step": 1732 + }, + { + "epoch": 0.22045541279735403, + "ewc_loss": 0.020095840096473694, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.729140295647085e-05, + "grad_norm": 3.5956757068634033, + "learning_rate": 7.342094107672742e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8337942361831665, + "num_tokens": 66272956.0, + "step": 1733 + }, + { + "epoch": 0.22058262307594453, + "ewc_loss": 0.020136846229434013, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.770146865164861e-05, + "grad_norm": 3.670846462249756, + "learning_rate": 7.346333192030522e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8533153533935547, + "num_tokens": 66309450.0, + "step": 1734 + }, + { + "epoch": 0.22070983335453503, + "ewc_loss": 0.02024248242378235, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.814747757744044e-05, + "grad_norm": 3.638589859008789, + "learning_rate": 7.350572276388299e-07, + "loss": 0.5274, + "mean_token_accuracy": 0.8321120142936707, + "num_tokens": 66350947.0, + "step": 1735 + }, + { + "epoch": 0.22083704363312556, + "ewc_loss": 0.02021011710166931, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.782382115488872e-05, + "grad_norm": 3.6002187728881836, + "learning_rate": 7.354811360746078e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8438014984130859, + "num_tokens": 66388364.0, + "step": 1736 + }, + { + "epoch": 0.22096425391171606, + "ewc_loss": 0.020142288878560066, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.77558928146027e-05, + "grad_norm": 3.6510233879089355, + "learning_rate": 7.359050445103857e-07, + "loss": 0.5, + "mean_token_accuracy": 0.8406296968460083, + "num_tokens": 66425029.0, + "step": 1737 + }, + { + "epoch": 0.22109146419030656, + "ewc_loss": 0.02018379606306553, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.817097164457664e-05, + "grad_norm": 3.6011905670166016, + "learning_rate": 7.363289529461636e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8496202230453491, + "num_tokens": 66463924.0, + "step": 1738 + }, + { + "epoch": 0.2212186744688971, + "ewc_loss": 0.020149726420640945, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.78302749292925e-05, + "grad_norm": 3.5966544151306152, + "learning_rate": 7.367528613819415e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8619043827056885, + "num_tokens": 66504824.0, + "step": 1739 + }, + { + "epoch": 0.2213458847474876, + "ewc_loss": 0.020175546407699585, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.80884622852318e-05, + "grad_norm": 3.5780088901519775, + "learning_rate": 7.371767698177194e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8556292057037354, + "num_tokens": 66539308.0, + "step": 1740 + }, + { + "epoch": 0.22147309502607812, + "ewc_loss": 0.020190676674246788, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.823978037573397e-05, + "grad_norm": 3.6187193393707275, + "learning_rate": 7.376006782534972e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.85944002866745, + "num_tokens": 66572785.0, + "step": 1741 + }, + { + "epoch": 0.22160030530466862, + "ewc_loss": 0.020221078768372536, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.85437917127274e-05, + "grad_norm": 3.698232412338257, + "learning_rate": 7.380245866892751e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8524025678634644, + "num_tokens": 66607890.0, + "step": 1742 + }, + { + "epoch": 0.22172751558325912, + "ewc_loss": 0.02023991197347641, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.873212987557054e-05, + "grad_norm": 3.5560014247894287, + "learning_rate": 7.384484951250529e-07, + "loss": 0.465, + "mean_token_accuracy": 0.8539589643478394, + "num_tokens": 66650553.0, + "step": 1743 + }, + { + "epoch": 0.22185472586184965, + "ewc_loss": 0.020159685984253883, + "ewc_loss_diag": 1.33514404296875e-05, + "ewc_loss_parallel": 6.792986823711544e-05, + "grad_norm": 3.594845771789551, + "learning_rate": 7.388724035608308e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8554703593254089, + "num_tokens": 66685330.0, + "step": 1744 + }, + { + "epoch": 0.22198193614044015, + "ewc_loss": 0.020291145890951157, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.863411545054987e-05, + "grad_norm": 3.645571708679199, + "learning_rate": 7.392963119966087e-07, + "loss": 0.4441, + "mean_token_accuracy": 0.8577519655227661, + "num_tokens": 66716816.0, + "step": 1745 + }, + { + "epoch": 0.22210914641903065, + "ewc_loss": 0.020305367186665535, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.877632404211909e-05, + "grad_norm": 3.714449644088745, + "learning_rate": 7.397202204323866e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8466399908065796, + "num_tokens": 66754030.0, + "step": 1746 + }, + { + "epoch": 0.22223635669762118, + "ewc_loss": 0.02036961168050766, + "ewc_loss_diag": 1.3470649719238281e-05, + "ewc_loss_parallel": 6.880841101519763e-05, + "grad_norm": 3.6801440715789795, + "learning_rate": 7.401441288681645e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8586663007736206, + "num_tokens": 66793650.0, + "step": 1747 + }, + { + "epoch": 0.22236356697621168, + "ewc_loss": 0.020262353122234344, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.834618397988379e-05, + "grad_norm": 3.530438184738159, + "learning_rate": 7.405680373039424e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8659257888793945, + "num_tokens": 66833709.0, + "step": 1748 + }, + { + "epoch": 0.22249077725480218, + "ewc_loss": 0.020229291170835495, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.801557174185291e-05, + "grad_norm": 3.6831679344177246, + "learning_rate": 7.409919457397202e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8407269716262817, + "num_tokens": 66866456.0, + "step": 1749 + }, + { + "epoch": 0.2226179875333927, + "ewc_loss": 0.02035404182970524, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.926307833055034e-05, + "grad_norm": 3.6329128742218018, + "learning_rate": 7.414158541754981e-07, + "loss": 0.5247, + "mean_token_accuracy": 0.8324787616729736, + "num_tokens": 66905875.0, + "step": 1750 + }, + { + "epoch": 0.2227451978119832, + "ewc_loss": 0.02026926912367344, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.841534923296422e-05, + "grad_norm": 3.6806039810180664, + "learning_rate": 7.418397626112759e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8546038866043091, + "num_tokens": 66936009.0, + "step": 1751 + }, + { + "epoch": 0.2228724080905737, + "ewc_loss": 0.020345328375697136, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.917593418620527e-05, + "grad_norm": 3.622880458831787, + "learning_rate": 7.422636710470537e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.847822368144989, + "num_tokens": 66978111.0, + "step": 1752 + }, + { + "epoch": 0.22299961836916424, + "ewc_loss": 0.020311670377850533, + "ewc_loss_diag": 1.341104507446289e-05, + "ewc_loss_parallel": 6.883936293888837e-05, + "grad_norm": 3.6048901081085205, + "learning_rate": 7.426875794828317e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8467692732810974, + "num_tokens": 67015938.0, + "step": 1753 + }, + { + "epoch": 0.22312682864775474, + "ewc_loss": 0.02039935439825058, + "ewc_loss_diag": 1.3470649719238281e-05, + "ewc_loss_parallel": 6.910584488650784e-05, + "grad_norm": 3.6293509006500244, + "learning_rate": 7.431114879186095e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8378138542175293, + "num_tokens": 67055456.0, + "step": 1754 + }, + { + "epoch": 0.22325403892634524, + "ewc_loss": 0.020407959818840027, + "ewc_loss_diag": 1.3470649719238281e-05, + "ewc_loss_parallel": 6.91919049131684e-05, + "grad_norm": 3.62469220161438, + "learning_rate": 7.435353963543875e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8481048345565796, + "num_tokens": 67096298.0, + "step": 1755 + }, + { + "epoch": 0.22338124920493577, + "ewc_loss": 0.020514680072665215, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.903839675942436e-05, + "grad_norm": 3.5675573348999023, + "learning_rate": 7.439593047901653e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8626354932785034, + "num_tokens": 67135280.0, + "step": 1756 + }, + { + "epoch": 0.22350845948352627, + "ewc_loss": 0.02051464468240738, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.903804023750126e-05, + "grad_norm": 3.654603958129883, + "learning_rate": 7.443832132259431e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.841497004032135, + "num_tokens": 67171936.0, + "step": 1757 + }, + { + "epoch": 0.22363566976211677, + "ewc_loss": 0.020557885989546776, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.947046495042741e-05, + "grad_norm": 3.6053993701934814, + "learning_rate": 7.44807121661721e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8459699153900146, + "num_tokens": 67209854.0, + "step": 1758 + }, + { + "epoch": 0.2237628800407073, + "ewc_loss": 0.020522896200418472, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.912057142471895e-05, + "grad_norm": 3.59673810005188, + "learning_rate": 7.452310300974989e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8564997911453247, + "num_tokens": 67244369.0, + "step": 1759 + }, + { + "epoch": 0.2238900903192978, + "ewc_loss": 0.020560918375849724, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.950078386580572e-05, + "grad_norm": 3.5882630348205566, + "learning_rate": 7.456549385332767e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.8380310535430908, + "num_tokens": 67283994.0, + "step": 1760 + }, + { + "epoch": 0.2240173005978883, + "ewc_loss": 0.020564598962664604, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.953759293537587e-05, + "grad_norm": 3.5386109352111816, + "learning_rate": 7.460788469690547e-07, + "loss": 0.4017, + "mean_token_accuracy": 0.8684256672859192, + "num_tokens": 67323230.0, + "step": 1761 + }, + { + "epoch": 0.22414451087647883, + "ewc_loss": 0.02054804563522339, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.937205034773797e-05, + "grad_norm": 3.616196870803833, + "learning_rate": 7.465027554048325e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.8278388977050781, + "num_tokens": 67366198.0, + "step": 1762 + }, + { + "epoch": 0.22427172115506933, + "ewc_loss": 0.02055220492184162, + "ewc_loss_diag": 1.3530254364013672e-05, + "ewc_loss_parallel": 7.002400525379926e-05, + "grad_norm": 3.6055667400360107, + "learning_rate": 7.469266638406105e-07, + "loss": 0.5379, + "mean_token_accuracy": 0.8329883813858032, + "num_tokens": 67410685.0, + "step": 1763 + }, + { + "epoch": 0.22439893143365983, + "ewc_loss": 0.020518209785223007, + "ewc_loss_diag": 1.3530254364013672e-05, + "ewc_loss_parallel": 6.968405068619177e-05, + "grad_norm": 3.614333152770996, + "learning_rate": 7.473505722763883e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8544397354125977, + "num_tokens": 67445818.0, + "step": 1764 + }, + { + "epoch": 0.22452614171225035, + "ewc_loss": 0.02053564414381981, + "ewc_loss_diag": 1.3530254364013672e-05, + "ewc_loss_parallel": 6.985840445850044e-05, + "grad_norm": 3.7088217735290527, + "learning_rate": 7.477744807121661e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.840535044670105, + "num_tokens": 67483873.0, + "step": 1765 + }, + { + "epoch": 0.22465335199084085, + "ewc_loss": 0.020693231374025345, + "ewc_loss_diag": 1.3649463653564453e-05, + "ewc_loss_parallel": 7.02135730534792e-05, + "grad_norm": 3.6761412620544434, + "learning_rate": 7.48198389147944e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.85084068775177, + "num_tokens": 67519524.0, + "step": 1766 + }, + { + "epoch": 0.22478056226943138, + "ewc_loss": 0.02052374929189682, + "ewc_loss_diag": 1.3530254364013672e-05, + "ewc_loss_parallel": 6.973944255150855e-05, + "grad_norm": 3.6055846214294434, + "learning_rate": 7.486222975837219e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8498313426971436, + "num_tokens": 67560920.0, + "step": 1767 + }, + { + "epoch": 0.22490777254802188, + "ewc_loss": 0.02058607153594494, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.975231372052804e-05, + "grad_norm": 3.7228856086730957, + "learning_rate": 7.490462060194997e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8569040298461914, + "num_tokens": 67594948.0, + "step": 1768 + }, + { + "epoch": 0.22503498282661238, + "ewc_loss": 0.020650450140237808, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 7.03960977261886e-05, + "grad_norm": 3.7173819541931152, + "learning_rate": 7.494701144552777e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.8348666429519653, + "num_tokens": 67628867.0, + "step": 1769 + }, + { + "epoch": 0.2251621931052029, + "ewc_loss": 0.020601611584424973, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.9907728175167e-05, + "grad_norm": 3.618072986602783, + "learning_rate": 7.498940228910555e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8456259369850159, + "num_tokens": 67669762.0, + "step": 1770 + }, + { + "epoch": 0.2252894033837934, + "ewc_loss": 0.020581869408488274, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.971030234126374e-05, + "grad_norm": 3.7532355785369873, + "learning_rate": 7.503179313268335e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8480164408683777, + "num_tokens": 67704750.0, + "step": 1771 + }, + { + "epoch": 0.2254166136623839, + "ewc_loss": 0.02078995108604431, + "ewc_loss_diag": 1.3709068298339844e-05, + "ewc_loss_parallel": 7.057040784275159e-05, + "grad_norm": 3.6012675762176514, + "learning_rate": 7.507418397626113e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.8385834693908691, + "num_tokens": 67745181.0, + "step": 1772 + }, + { + "epoch": 0.22554382394097444, + "ewc_loss": 0.020547980442643166, + "ewc_loss_diag": 1.3589859008789062e-05, + "ewc_loss_parallel": 6.937141006346792e-05, + "grad_norm": 3.6296074390411377, + "learning_rate": 7.51165748198389e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8410617113113403, + "num_tokens": 67785929.0, + "step": 1773 + }, + { + "epoch": 0.22567103421956494, + "ewc_loss": 0.02075820043683052, + "ewc_loss_diag": 1.3709068298339844e-05, + "ewc_loss_parallel": 7.025290688034147e-05, + "grad_norm": 3.6698434352874756, + "learning_rate": 7.51589656634167e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8521908521652222, + "num_tokens": 67821887.0, + "step": 1774 + }, + { + "epoch": 0.22579824449815544, + "ewc_loss": 0.020756633952260017, + "ewc_loss_diag": 1.3709068298339844e-05, + "ewc_loss_parallel": 7.023723446764052e-05, + "grad_norm": 3.7960610389709473, + "learning_rate": 7.520135650699448e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8450028896331787, + "num_tokens": 67859608.0, + "step": 1775 + }, + { + "epoch": 0.22592545477674597, + "ewc_loss": 0.020775025710463524, + "ewc_loss_diag": 1.3709068298339844e-05, + "ewc_loss_parallel": 7.042115612421185e-05, + "grad_norm": 3.6254804134368896, + "learning_rate": 7.524374735057227e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8441375494003296, + "num_tokens": 67895287.0, + "step": 1776 + }, + { + "epoch": 0.22605266505533647, + "ewc_loss": 0.020689964294433594, + "ewc_loss_diag": 1.3709068298339844e-05, + "ewc_loss_parallel": 6.957053119549528e-05, + "grad_norm": 3.6532344818115234, + "learning_rate": 7.528613819415006e-07, + "loss": 0.4556, + "mean_token_accuracy": 0.8595085144042969, + "num_tokens": 67930358.0, + "step": 1777 + }, + { + "epoch": 0.22617987533392697, + "ewc_loss": 0.020872442051768303, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.017461030045524e-05, + "grad_norm": 3.630753993988037, + "learning_rate": 7.532852903772785e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.842231035232544, + "num_tokens": 67966112.0, + "step": 1778 + }, + { + "epoch": 0.2263070856125175, + "ewc_loss": 0.020862527191638947, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.007546082604676e-05, + "grad_norm": 3.7001430988311768, + "learning_rate": 7.537091988130564e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.8394263982772827, + "num_tokens": 68001297.0, + "step": 1779 + }, + { + "epoch": 0.226434295891108, + "ewc_loss": 0.020912885665893555, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.057905168039724e-05, + "grad_norm": 3.667022943496704, + "learning_rate": 7.541331072488342e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8468351364135742, + "num_tokens": 68037346.0, + "step": 1780 + }, + { + "epoch": 0.2265615061696985, + "ewc_loss": 0.020896635949611664, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.041655044304207e-05, + "grad_norm": 3.701075553894043, + "learning_rate": 7.54557015684612e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.845101535320282, + "num_tokens": 68076566.0, + "step": 1781 + }, + { + "epoch": 0.22668871644828903, + "ewc_loss": 0.020917940884828568, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.062959775794297e-05, + "grad_norm": 3.6344454288482666, + "learning_rate": 7.5498092412039e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8544642925262451, + "num_tokens": 68116834.0, + "step": 1782 + }, + { + "epoch": 0.22681592672687953, + "ewc_loss": 0.020882433280348778, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.027453102637082e-05, + "grad_norm": 3.606562614440918, + "learning_rate": 7.554048325561678e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.8621294498443604, + "num_tokens": 68155573.0, + "step": 1783 + }, + { + "epoch": 0.22694313700547003, + "ewc_loss": 0.020913176238536835, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.05819547874853e-05, + "grad_norm": 3.673532485961914, + "learning_rate": 7.558287409919457e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.8367713689804077, + "num_tokens": 68193378.0, + "step": 1784 + }, + { + "epoch": 0.22707034728406056, + "ewc_loss": 0.02099033072590828, + "ewc_loss_diag": 1.3887882232666016e-05, + "ewc_loss_parallel": 7.074315362842754e-05, + "grad_norm": 3.6757776737213135, + "learning_rate": 7.562526494277236e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8383828997612, + "num_tokens": 68230183.0, + "step": 1785 + }, + { + "epoch": 0.22719755756265106, + "ewc_loss": 0.020984185859560966, + "ewc_loss_diag": 1.3887882232666016e-05, + "ewc_loss_parallel": 7.068170816637576e-05, + "grad_norm": 3.6197221279144287, + "learning_rate": 7.566765578635015e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8534013628959656, + "num_tokens": 68270526.0, + "step": 1786 + }, + { + "epoch": 0.22732476784124156, + "ewc_loss": 0.020962748676538467, + "ewc_loss_diag": 1.3887882232666016e-05, + "ewc_loss_parallel": 7.046732207527384e-05, + "grad_norm": 3.6484522819519043, + "learning_rate": 7.571004662992794e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.8470848202705383, + "num_tokens": 68308690.0, + "step": 1787 + }, + { + "epoch": 0.2274519781198321, + "ewc_loss": 0.0210084430873394, + "ewc_loss_diag": 1.3887882232666016e-05, + "ewc_loss_parallel": 7.092426676535979e-05, + "grad_norm": 3.660923719406128, + "learning_rate": 7.575243747350572e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.8330764770507812, + "num_tokens": 68348660.0, + "step": 1788 + }, + { + "epoch": 0.2275791883984226, + "ewc_loss": 0.02092478796839714, + "ewc_loss_diag": 1.3828277587890625e-05, + "ewc_loss_parallel": 7.069807179505005e-05, + "grad_norm": 3.6352667808532715, + "learning_rate": 7.57948283170835e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8492841124534607, + "num_tokens": 68389129.0, + "step": 1789 + }, + { + "epoch": 0.2277063986770131, + "ewc_loss": 0.021067552268505096, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.090500730555505e-05, + "grad_norm": 3.7024052143096924, + "learning_rate": 7.58372191606613e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8492417335510254, + "num_tokens": 68423234.0, + "step": 1790 + }, + { + "epoch": 0.22783360895560362, + "ewc_loss": 0.021089011803269386, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.111960439942777e-05, + "grad_norm": 3.6534037590026855, + "learning_rate": 7.587961000423908e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.847645103931427, + "num_tokens": 68460834.0, + "step": 1791 + }, + { + "epoch": 0.22796081923419412, + "ewc_loss": 0.02106458507478237, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.087534322636202e-05, + "grad_norm": 3.717883825302124, + "learning_rate": 7.592200084781686e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8364670872688293, + "num_tokens": 68495605.0, + "step": 1792 + }, + { + "epoch": 0.22808802951278465, + "ewc_loss": 0.021122314035892487, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.145262497942895e-05, + "grad_norm": 3.6462016105651855, + "learning_rate": 7.596439169139466e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8485080003738403, + "num_tokens": 68530838.0, + "step": 1793 + }, + { + "epoch": 0.22821523979137515, + "ewc_loss": 0.02107243239879608, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.09538217051886e-05, + "grad_norm": 3.643376588821411, + "learning_rate": 7.600678253497244e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.8528309464454651, + "num_tokens": 68571225.0, + "step": 1794 + }, + { + "epoch": 0.22834245006996565, + "ewc_loss": 0.021102536469697952, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.125485717551783e-05, + "grad_norm": 3.592916965484619, + "learning_rate": 7.604917337855023e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8580687642097473, + "num_tokens": 68608284.0, + "step": 1795 + }, + { + "epoch": 0.22846966034855618, + "ewc_loss": 0.02109532430768013, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.118273788364604e-05, + "grad_norm": 3.6460726261138916, + "learning_rate": 7.609156422212801e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.8605638742446899, + "num_tokens": 68645943.0, + "step": 1796 + }, + { + "epoch": 0.22859687062714668, + "ewc_loss": 0.02113707736134529, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.160026143537834e-05, + "grad_norm": 3.7474818229675293, + "learning_rate": 7.61339550657058e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8540070652961731, + "num_tokens": 68684245.0, + "step": 1797 + }, + { + "epoch": 0.22872408090573718, + "ewc_loss": 0.021155210211873055, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.178160012699664e-05, + "grad_norm": 3.7672722339630127, + "learning_rate": 7.617634590928359e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8531936407089233, + "num_tokens": 68717532.0, + "step": 1798 + }, + { + "epoch": 0.2288512911843277, + "ewc_loss": 0.02114957384765148, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.172523328335956e-05, + "grad_norm": 3.6584901809692383, + "learning_rate": 7.621873675286138e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8467565774917603, + "num_tokens": 68759515.0, + "step": 1799 + }, + { + "epoch": 0.2289785014629182, + "ewc_loss": 0.02108766697347164, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.110616570571437e-05, + "grad_norm": 3.6661815643310547, + "learning_rate": 7.626112759643916e-07, + "loss": 0.4963, + "mean_token_accuracy": 0.8420230150222778, + "num_tokens": 68796969.0, + "step": 1800 + }, + { + "epoch": 0.2291057117415087, + "ewc_loss": 0.021137338131666183, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.160286622820422e-05, + "grad_norm": 3.750979423522949, + "learning_rate": 7.630351844001696e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8478918671607971, + "num_tokens": 68831138.0, + "step": 1801 + }, + { + "epoch": 0.22923292202009923, + "ewc_loss": 0.02113332226872444, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.156271749408916e-05, + "grad_norm": 3.624953508377075, + "learning_rate": 7.634590928359474e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8543339967727661, + "num_tokens": 68868741.0, + "step": 1802 + }, + { + "epoch": 0.22936013229868973, + "ewc_loss": 0.02108754962682724, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.110499427653849e-05, + "grad_norm": 3.663463830947876, + "learning_rate": 7.638830012717253e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8434333801269531, + "num_tokens": 68909191.0, + "step": 1803 + }, + { + "epoch": 0.22948734257728023, + "ewc_loss": 0.021115107461810112, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.138056389521807e-05, + "grad_norm": 3.615211009979248, + "learning_rate": 7.643069097075031e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8510749340057373, + "num_tokens": 68949261.0, + "step": 1804 + }, + { + "epoch": 0.22961455285587076, + "ewc_loss": 0.021103788167238235, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.126737909857184e-05, + "grad_norm": 3.6784684658050537, + "learning_rate": 7.64730818143281e-07, + "loss": 0.4383, + "mean_token_accuracy": 0.8568335771560669, + "num_tokens": 68990434.0, + "step": 1805 + }, + { + "epoch": 0.22974176313446126, + "ewc_loss": 0.02115214243531227, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.175092468969524e-05, + "grad_norm": 3.6798818111419678, + "learning_rate": 7.651547265790589e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8379849791526794, + "num_tokens": 69029482.0, + "step": 1806 + }, + { + "epoch": 0.22986897341305176, + "ewc_loss": 0.021114088594913483, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.137037755455822e-05, + "grad_norm": 3.6418955326080322, + "learning_rate": 7.655786350148368e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8408349752426147, + "num_tokens": 69069766.0, + "step": 1807 + }, + { + "epoch": 0.2299961836916423, + "ewc_loss": 0.021129712462425232, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.152660691644996e-05, + "grad_norm": 3.6080830097198486, + "learning_rate": 7.660025434506146e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8466126918792725, + "num_tokens": 69114592.0, + "step": 1808 + }, + { + "epoch": 0.2301233939702328, + "ewc_loss": 0.021122314035892487, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.145263953134418e-05, + "grad_norm": 3.6861484050750732, + "learning_rate": 7.664264518863926e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8496395349502563, + "num_tokens": 69152599.0, + "step": 1809 + }, + { + "epoch": 0.2302506042488233, + "ewc_loss": 0.021173983812332153, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.196932710940018e-05, + "grad_norm": 3.731245517730713, + "learning_rate": 7.668503603221704e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.8310471177101135, + "num_tokens": 69193157.0, + "step": 1810 + }, + { + "epoch": 0.23037781452741382, + "ewc_loss": 0.021242443472146988, + "ewc_loss_diag": 1.4066696166992188e-05, + "ewc_loss_parallel": 7.204357098089531e-05, + "grad_norm": 3.6562328338623047, + "learning_rate": 7.672742687579483e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.8577855825424194, + "num_tokens": 69230681.0, + "step": 1811 + }, + { + "epoch": 0.23050502480600432, + "ewc_loss": 0.021122535690665245, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.145485142245889e-05, + "grad_norm": 3.767810583114624, + "learning_rate": 7.676981771937261e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.862284779548645, + "num_tokens": 69261292.0, + "step": 1812 + }, + { + "epoch": 0.23063223508459482, + "ewc_loss": 0.021208351477980614, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.231300696730614e-05, + "grad_norm": 3.641368865966797, + "learning_rate": 7.681220856295039e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.8575441837310791, + "num_tokens": 69301795.0, + "step": 1813 + }, + { + "epoch": 0.23075944536318535, + "ewc_loss": 0.021091574802994728, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.114523759810254e-05, + "grad_norm": 3.7193217277526855, + "learning_rate": 7.685459940652819e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8467357158660889, + "num_tokens": 69337038.0, + "step": 1814 + }, + { + "epoch": 0.23088665564177585, + "ewc_loss": 0.021194355562329292, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.217305392259732e-05, + "grad_norm": 3.6178948879241943, + "learning_rate": 7.689699025010597e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.849021315574646, + "num_tokens": 69379871.0, + "step": 1815 + }, + { + "epoch": 0.23101386592036638, + "ewc_loss": 0.02110740728676319, + "ewc_loss_diag": 1.3947486877441406e-05, + "ewc_loss_parallel": 7.130356243578717e-05, + "grad_norm": 3.72292160987854, + "learning_rate": 7.693938109368376e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8433266878128052, + "num_tokens": 69418021.0, + "step": 1816 + }, + { + "epoch": 0.23114107619895688, + "ewc_loss": 0.021288210526108742, + "ewc_loss_diag": 1.4066696166992188e-05, + "ewc_loss_parallel": 7.250124326674268e-05, + "grad_norm": 3.7261064052581787, + "learning_rate": 7.698177193726155e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8490533232688904, + "num_tokens": 69455574.0, + "step": 1817 + }, + { + "epoch": 0.23126828647754738, + "ewc_loss": 0.021230170503258705, + "ewc_loss_diag": 1.4066696166992188e-05, + "ewc_loss_parallel": 7.192084012785926e-05, + "grad_norm": 3.6942553520202637, + "learning_rate": 7.702416278083933e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8435382843017578, + "num_tokens": 69489822.0, + "step": 1818 + }, + { + "epoch": 0.2313954967561379, + "ewc_loss": 0.021234475076198578, + "ewc_loss_diag": 1.4066696166992188e-05, + "ewc_loss_parallel": 7.196388469310477e-05, + "grad_norm": 3.625742197036743, + "learning_rate": 7.706655362441712e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.850799560546875, + "num_tokens": 69532821.0, + "step": 1819 + }, + { + "epoch": 0.2315227070347284, + "ewc_loss": 0.021300598978996277, + "ewc_loss_diag": 1.4126300811767578e-05, + "ewc_loss_parallel": 7.20147872925736e-05, + "grad_norm": 3.7295641899108887, + "learning_rate": 7.710894446799491e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.8559116721153259, + "num_tokens": 69565674.0, + "step": 1820 + }, + { + "epoch": 0.2316499173133189, + "ewc_loss": 0.021251089870929718, + "ewc_loss_diag": 1.4007091522216797e-05, + "ewc_loss_parallel": 7.274038944160566e-05, + "grad_norm": 3.7072577476501465, + "learning_rate": 7.715133531157269e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.8583894371986389, + "num_tokens": 69606118.0, + "step": 1821 + }, + { + "epoch": 0.23177712759190944, + "ewc_loss": 0.021360043436288834, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.1998882049229e-05, + "grad_norm": 3.6683688163757324, + "learning_rate": 7.719372615515049e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8359795808792114, + "num_tokens": 69650102.0, + "step": 1822 + }, + { + "epoch": 0.23190433787049994, + "ewc_loss": 0.02139967307448387, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.23951670806855e-05, + "grad_norm": 3.6393117904663086, + "learning_rate": 7.723611699872827e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.8504736423492432, + "num_tokens": 69693440.0, + "step": 1823 + }, + { + "epoch": 0.23203154814909044, + "ewc_loss": 0.02137928083539009, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.219123654067516e-05, + "grad_norm": 3.7445900440216064, + "learning_rate": 7.727850784230606e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8426299095153809, + "num_tokens": 69732651.0, + "step": 1824 + }, + { + "epoch": 0.23215875842768097, + "ewc_loss": 0.02146824821829796, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.308092608582228e-05, + "grad_norm": 3.8044846057891846, + "learning_rate": 7.732089868588385e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8436702489852905, + "num_tokens": 69769915.0, + "step": 1825 + }, + { + "epoch": 0.23228596870627147, + "ewc_loss": 0.021422425284981728, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.262268627528101e-05, + "grad_norm": 3.5910656452178955, + "learning_rate": 7.736328952946163e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8659443855285645, + "num_tokens": 69813943.0, + "step": 1826 + }, + { + "epoch": 0.23241317898486197, + "ewc_loss": 0.02134670689702034, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.186549919424579e-05, + "grad_norm": 3.7626724243164062, + "learning_rate": 7.740568037303942e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.8541052341461182, + "num_tokens": 69847039.0, + "step": 1827 + }, + { + "epoch": 0.2325403892634525, + "ewc_loss": 0.021500108763575554, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.339951844187453e-05, + "grad_norm": 3.659703493118286, + "learning_rate": 7.744807121661721e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8685091137886047, + "num_tokens": 69884588.0, + "step": 1828 + }, + { + "epoch": 0.232667599542043, + "ewc_loss": 0.021380165591835976, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.220009138109162e-05, + "grad_norm": 3.6506009101867676, + "learning_rate": 7.749046206019499e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8490152359008789, + "num_tokens": 69925552.0, + "step": 1829 + }, + { + "epoch": 0.2327948098206335, + "ewc_loss": 0.02144368179142475, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.283525337697938e-05, + "grad_norm": 3.6568286418914795, + "learning_rate": 7.753285290377279e-07, + "loss": 0.4145, + "mean_token_accuracy": 0.8665887117385864, + "num_tokens": 69961776.0, + "step": 1830 + }, + { + "epoch": 0.23292202009922403, + "ewc_loss": 0.021462123841047287, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.30196843505837e-05, + "grad_norm": 3.753631830215454, + "learning_rate": 7.757524374735057e-07, + "loss": 0.5212, + "mean_token_accuracy": 0.8341121673583984, + "num_tokens": 69999775.0, + "step": 1831 + }, + { + "epoch": 0.23304923037781453, + "ewc_loss": 0.021495245397090912, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.335089321713895e-05, + "grad_norm": 3.634855270385742, + "learning_rate": 7.761763459092836e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8651078343391418, + "num_tokens": 70037294.0, + "step": 1832 + }, + { + "epoch": 0.23317644065640503, + "ewc_loss": 0.021425575017929077, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.265419117175043e-05, + "grad_norm": 3.7471940517425537, + "learning_rate": 7.766002543450614e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8521639704704285, + "num_tokens": 70075014.0, + "step": 1833 + }, + { + "epoch": 0.23330365093499555, + "ewc_loss": 0.021537741646170616, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.377585279755294e-05, + "grad_norm": 3.678035259246826, + "learning_rate": 7.770241627808392e-07, + "loss": 0.4239, + "mean_token_accuracy": 0.8643894195556641, + "num_tokens": 70112186.0, + "step": 1834 + }, + { + "epoch": 0.23343086121358606, + "ewc_loss": 0.02146572805941105, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.305572216864675e-05, + "grad_norm": 3.7038698196411133, + "learning_rate": 7.774480712166172e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8414007425308228, + "num_tokens": 70153285.0, + "step": 1835 + }, + { + "epoch": 0.23355807149217656, + "ewc_loss": 0.02152303233742714, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.3628754762467e-05, + "grad_norm": 3.7810511589050293, + "learning_rate": 7.77871979652395e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8469129204750061, + "num_tokens": 70193382.0, + "step": 1836 + }, + { + "epoch": 0.23368528177076708, + "ewc_loss": 0.021534636616706848, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.374479901045561e-05, + "grad_norm": 3.732644557952881, + "learning_rate": 7.782958880881729e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8490172624588013, + "num_tokens": 70227279.0, + "step": 1837 + }, + { + "epoch": 0.23381249204935758, + "ewc_loss": 0.021505050361156464, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.344894402194768e-05, + "grad_norm": 3.7270586490631104, + "learning_rate": 7.787197965239508e-07, + "loss": 0.4361, + "mean_token_accuracy": 0.858069896697998, + "num_tokens": 70264395.0, + "step": 1838 + }, + { + "epoch": 0.23393970232794808, + "ewc_loss": 0.021601444110274315, + "ewc_loss_diag": 1.424551010131836e-05, + "ewc_loss_parallel": 7.380252645816654e-05, + "grad_norm": 3.8512122631073, + "learning_rate": 7.791437049597287e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8267762660980225, + "num_tokens": 70295016.0, + "step": 1839 + }, + { + "epoch": 0.2340669126065386, + "ewc_loss": 0.021644270047545433, + "ewc_loss_diag": 1.424551010131836e-05, + "ewc_loss_parallel": 7.423078204737976e-05, + "grad_norm": 3.699216842651367, + "learning_rate": 7.795676133955065e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8447002172470093, + "num_tokens": 70332774.0, + "step": 1840 + }, + { + "epoch": 0.2341941228851291, + "ewc_loss": 0.02149326168000698, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.333105168072507e-05, + "grad_norm": 3.7420332431793213, + "learning_rate": 7.799915218312844e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8482630848884583, + "num_tokens": 70366880.0, + "step": 1841 + }, + { + "epoch": 0.23432133316371964, + "ewc_loss": 0.0215720497071743, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.411892875097692e-05, + "grad_norm": 3.695808172225952, + "learning_rate": 7.804154302670622e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8547902703285217, + "num_tokens": 70405235.0, + "step": 1842 + }, + { + "epoch": 0.23444854344231014, + "ewc_loss": 0.021533861756324768, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.373705739155412e-05, + "grad_norm": 3.6660587787628174, + "learning_rate": 7.808393387028402e-07, + "loss": 0.4635, + "mean_token_accuracy": 0.8523786067962646, + "num_tokens": 70446503.0, + "step": 1843 + }, + { + "epoch": 0.23457575372090064, + "ewc_loss": 0.02155390754342079, + "ewc_loss_diag": 1.4185905456542969e-05, + "ewc_loss_parallel": 7.393751002382487e-05, + "grad_norm": 3.7609150409698486, + "learning_rate": 7.81263247138618e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8364803791046143, + "num_tokens": 70483514.0, + "step": 1844 + }, + { + "epoch": 0.23470296399949117, + "ewc_loss": 0.02172095701098442, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.438730244757608e-05, + "grad_norm": 5.09756326675415, + "learning_rate": 7.816871555743959e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8483067750930786, + "num_tokens": 70520944.0, + "step": 1845 + }, + { + "epoch": 0.23483017427808167, + "ewc_loss": 0.022486042231321335, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 8.20381537778303e-05, + "grad_norm": 3.8307366371154785, + "learning_rate": 7.821110640101738e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8395153880119324, + "num_tokens": 70564399.0, + "step": 1846 + }, + { + "epoch": 0.23495738455667217, + "ewc_loss": 0.0213078074157238, + "ewc_loss_diag": 1.424551010131836e-05, + "ewc_loss_parallel": 7.086616096785292e-05, + "grad_norm": 3.688612222671509, + "learning_rate": 7.825349724459517e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8509039878845215, + "num_tokens": 70604000.0, + "step": 1847 + }, + { + "epoch": 0.2350845948352627, + "ewc_loss": 0.02168414369225502, + "ewc_loss_diag": 1.424551010131836e-05, + "ewc_loss_parallel": 7.462953362846747e-05, + "grad_norm": 3.7842490673065186, + "learning_rate": 7.829588808817294e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8511996865272522, + "num_tokens": 70645133.0, + "step": 1848 + }, + { + "epoch": 0.2352118051138532, + "ewc_loss": 0.02162524126470089, + "ewc_loss_diag": 1.424551010131836e-05, + "ewc_loss_parallel": 7.404050120385364e-05, + "grad_norm": 3.8184702396392822, + "learning_rate": 7.833827893175074e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8352975845336914, + "num_tokens": 70675824.0, + "step": 1849 + }, + { + "epoch": 0.2353390153924437, + "ewc_loss": 0.021592773497104645, + "ewc_loss_diag": 1.424551010131836e-05, + "ewc_loss_parallel": 7.37158115953207e-05, + "grad_norm": 3.6938211917877197, + "learning_rate": 7.838066977532852e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8519232273101807, + "num_tokens": 70714103.0, + "step": 1850 + }, + { + "epoch": 0.23546622567103423, + "ewc_loss": 0.021643836051225662, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.361609459621832e-05, + "grad_norm": 3.831467866897583, + "learning_rate": 7.842306061890632e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8393031358718872, + "num_tokens": 70754265.0, + "step": 1851 + }, + { + "epoch": 0.23559343594962473, + "ewc_loss": 0.021722901612520218, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.440675108227879e-05, + "grad_norm": 3.767660140991211, + "learning_rate": 7.84654514624841e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.845477819442749, + "num_tokens": 70791368.0, + "step": 1852 + }, + { + "epoch": 0.23572064622821523, + "ewc_loss": 0.02165484055876732, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.372613617917523e-05, + "grad_norm": 3.7600457668304443, + "learning_rate": 7.850784230606188e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8379421234130859, + "num_tokens": 70827397.0, + "step": 1853 + }, + { + "epoch": 0.23584785650680576, + "ewc_loss": 0.02167602628469467, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.39379902370274e-05, + "grad_norm": 3.652653217315674, + "learning_rate": 7.855023314963968e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8448290228843689, + "num_tokens": 70872503.0, + "step": 1854 + }, + { + "epoch": 0.23597506678539626, + "ewc_loss": 0.021643217653036118, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.360990275628865e-05, + "grad_norm": 3.7509641647338867, + "learning_rate": 7.859262399321746e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8423552513122559, + "num_tokens": 70914376.0, + "step": 1855 + }, + { + "epoch": 0.23610227706398676, + "ewc_loss": 0.02180003747344017, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.456776802428067e-05, + "grad_norm": 3.7388885021209717, + "learning_rate": 7.863501483679524e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8555834293365479, + "num_tokens": 70954004.0, + "step": 1856 + }, + { + "epoch": 0.2362294873425773, + "ewc_loss": 0.021677857264876366, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.39563038223423e-05, + "grad_norm": 3.7352049350738525, + "learning_rate": 7.867740568037303e-07, + "loss": 0.4519, + "mean_token_accuracy": 0.8521426320075989, + "num_tokens": 70991765.0, + "step": 1857 + }, + { + "epoch": 0.2363566976211678, + "ewc_loss": 0.021704744547605515, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.422517956001684e-05, + "grad_norm": 3.7789156436920166, + "learning_rate": 7.871979652395082e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8444211483001709, + "num_tokens": 71025478.0, + "step": 1858 + }, + { + "epoch": 0.2364839078997583, + "ewc_loss": 0.0218062661588192, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.463003566954285e-05, + "grad_norm": 3.7096147537231445, + "learning_rate": 7.876218736752861e-07, + "loss": 0.4358, + "mean_token_accuracy": 0.8643705248832703, + "num_tokens": 71067287.0, + "step": 1859 + }, + { + "epoch": 0.23661111817834882, + "ewc_loss": 0.02176596410572529, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.422702037729323e-05, + "grad_norm": 3.7371082305908203, + "learning_rate": 7.88045782111064e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8406268358230591, + "num_tokens": 71108095.0, + "step": 1860 + }, + { + "epoch": 0.23673832845693932, + "ewc_loss": 0.021817900240421295, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.474638550775126e-05, + "grad_norm": 3.714839220046997, + "learning_rate": 7.884696905468418e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8606690764427185, + "num_tokens": 71150297.0, + "step": 1861 + }, + { + "epoch": 0.23686553873552982, + "ewc_loss": 0.02171785570681095, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.435629231622443e-05, + "grad_norm": 3.7567925453186035, + "learning_rate": 7.888935989826198e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8427355885505676, + "num_tokens": 71189205.0, + "step": 1862 + }, + { + "epoch": 0.23699274901412035, + "ewc_loss": 0.021756434813141823, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.474208541680127e-05, + "grad_norm": 3.7732126712799072, + "learning_rate": 7.893175074183976e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8486826419830322, + "num_tokens": 71228141.0, + "step": 1863 + }, + { + "epoch": 0.23711995929271085, + "ewc_loss": 0.021766379475593567, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.484154048142955e-05, + "grad_norm": 3.7309629917144775, + "learning_rate": 7.897414158541754e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8491575717926025, + "num_tokens": 71267854.0, + "step": 1864 + }, + { + "epoch": 0.23724716957130135, + "ewc_loss": 0.02179594337940216, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.452681893482804e-05, + "grad_norm": 3.789410352706909, + "learning_rate": 7.901653242899533e-07, + "loss": 0.511, + "mean_token_accuracy": 0.8428331613540649, + "num_tokens": 71305030.0, + "step": 1865 + }, + { + "epoch": 0.23737437984989188, + "ewc_loss": 0.021772712469100952, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.490485586458817e-05, + "grad_norm": 3.809960126876831, + "learning_rate": 7.905892327257312e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8496325016021729, + "num_tokens": 71342202.0, + "step": 1866 + }, + { + "epoch": 0.23750159012848238, + "ewc_loss": 0.021831117570400238, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.487856055377051e-05, + "grad_norm": 4.059051513671875, + "learning_rate": 7.910131411615091e-07, + "loss": 0.4685, + "mean_token_accuracy": 0.8533487319946289, + "num_tokens": 71379702.0, + "step": 1867 + }, + { + "epoch": 0.2376288004070729, + "ewc_loss": 0.021914508193731308, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.57124726078473e-05, + "grad_norm": 3.6893930435180664, + "learning_rate": 7.91437049597287e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8494657874107361, + "num_tokens": 71417785.0, + "step": 1868 + }, + { + "epoch": 0.2377560106856634, + "ewc_loss": 0.021668367087841034, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.325104525079951e-05, + "grad_norm": 3.729328155517578, + "learning_rate": 7.918609580330648e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8374850153923035, + "num_tokens": 71461358.0, + "step": 1869 + }, + { + "epoch": 0.2378832209642539, + "ewc_loss": 0.021838869899511337, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.495608588214964e-05, + "grad_norm": 3.734856128692627, + "learning_rate": 7.922848664688428e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.8405422568321228, + "num_tokens": 71500059.0, + "step": 1870 + }, + { + "epoch": 0.23801043124284443, + "ewc_loss": 0.021727338433265686, + "ewc_loss_diag": 1.430511474609375e-05, + "ewc_loss_parallel": 7.445112714776769e-05, + "grad_norm": 3.7357614040374756, + "learning_rate": 7.927087749046205e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8656916618347168, + "num_tokens": 71537043.0, + "step": 1871 + }, + { + "epoch": 0.23813764152143493, + "ewc_loss": 0.021822601556777954, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.47933954698965e-05, + "grad_norm": 3.7628207206726074, + "learning_rate": 7.931326833403983e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8533996939659119, + "num_tokens": 71570671.0, + "step": 1872 + }, + { + "epoch": 0.23826485180002543, + "ewc_loss": 0.021850360557436943, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.507098780479282e-05, + "grad_norm": 3.765369176864624, + "learning_rate": 7.935565917761763e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8520622849464417, + "num_tokens": 71614813.0, + "step": 1873 + }, + { + "epoch": 0.23839206207861596, + "ewc_loss": 0.021848361939191818, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.505100802518427e-05, + "grad_norm": 3.7887074947357178, + "learning_rate": 7.939805002119541e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.846145749092102, + "num_tokens": 71650441.0, + "step": 1874 + }, + { + "epoch": 0.23851927235720646, + "ewc_loss": 0.021878328174352646, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.535067561548203e-05, + "grad_norm": 3.8106820583343506, + "learning_rate": 7.944044086477321e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8457179665565491, + "num_tokens": 71690425.0, + "step": 1875 + }, + { + "epoch": 0.23864648263579696, + "ewc_loss": 0.02199508249759674, + "ewc_loss_diag": 1.4483928680419922e-05, + "ewc_loss_parallel": 7.529750291723758e-05, + "grad_norm": 3.7711315155029297, + "learning_rate": 7.948283170835099e-07, + "loss": 0.4361, + "mean_token_accuracy": 0.8592972159385681, + "num_tokens": 71727349.0, + "step": 1876 + }, + { + "epoch": 0.2387736929143875, + "ewc_loss": 0.021957488730549812, + "ewc_loss_diag": 1.4483928680419922e-05, + "ewc_loss_parallel": 7.492156873922795e-05, + "grad_norm": 3.6554605960845947, + "learning_rate": 7.952522255192878e-07, + "loss": 0.4038, + "mean_token_accuracy": 0.8709169626235962, + "num_tokens": 71770386.0, + "step": 1877 + }, + { + "epoch": 0.238900903192978, + "ewc_loss": 0.021852951496839523, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.509689748985693e-05, + "grad_norm": 3.7603766918182373, + "learning_rate": 7.956761339550657e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8620651960372925, + "num_tokens": 71808946.0, + "step": 1878 + }, + { + "epoch": 0.2390281134715685, + "ewc_loss": 0.021935809403657913, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.592546899104491e-05, + "grad_norm": 3.7563278675079346, + "learning_rate": 7.961000423908435e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8550522923469543, + "num_tokens": 71848605.0, + "step": 1879 + }, + { + "epoch": 0.23915532375015902, + "ewc_loss": 0.021889202296733856, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.5459414802026e-05, + "grad_norm": 3.7479116916656494, + "learning_rate": 7.965239508266214e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8552718162536621, + "num_tokens": 71886771.0, + "step": 1880 + }, + { + "epoch": 0.23928253402874952, + "ewc_loss": 0.021895073354244232, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.551811722805724e-05, + "grad_norm": 3.7207064628601074, + "learning_rate": 7.969478592623993e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.8604118824005127, + "num_tokens": 71924905.0, + "step": 1881 + }, + { + "epoch": 0.23940974430734002, + "ewc_loss": 0.021902918815612793, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.559658115496859e-05, + "grad_norm": 3.714682102203369, + "learning_rate": 7.973717676981771e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8550439476966858, + "num_tokens": 71964513.0, + "step": 1882 + }, + { + "epoch": 0.23953695458593055, + "ewc_loss": 0.02192683331668377, + "ewc_loss_diag": 1.436471939086914e-05, + "ewc_loss_parallel": 7.583571277791634e-05, + "grad_norm": 3.7390031814575195, + "learning_rate": 7.977956761339551e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8617867231369019, + "num_tokens": 72000918.0, + "step": 1883 + }, + { + "epoch": 0.23966416486452105, + "ewc_loss": 0.022057058289647102, + "ewc_loss_diag": 1.4483928680419922e-05, + "ewc_loss_parallel": 7.591726171085611e-05, + "grad_norm": 3.8209335803985596, + "learning_rate": 7.982195845697329e-07, + "loss": 0.5586, + "mean_token_accuracy": 0.8233747482299805, + "num_tokens": 72040840.0, + "step": 1884 + }, + { + "epoch": 0.23979137514311155, + "ewc_loss": 0.02203788235783577, + "ewc_loss_diag": 1.4424324035644531e-05, + "ewc_loss_parallel": 7.633586210431531e-05, + "grad_norm": 3.813455581665039, + "learning_rate": 7.986434930055108e-07, + "loss": 0.4323, + "mean_token_accuracy": 0.8602362871170044, + "num_tokens": 72076757.0, + "step": 1885 + }, + { + "epoch": 0.23991858542170208, + "ewc_loss": 0.022050390020012856, + "ewc_loss_diag": 1.4483928680419922e-05, + "ewc_loss_parallel": 7.585057755932212e-05, + "grad_norm": 3.7037758827209473, + "learning_rate": 7.990674014412886e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.847122311592102, + "num_tokens": 72120760.0, + "step": 1886 + }, + { + "epoch": 0.24004579570029258, + "ewc_loss": 0.022034648805856705, + "ewc_loss_diag": 1.4483928680419922e-05, + "ewc_loss_parallel": 7.569317676825449e-05, + "grad_norm": 3.829934597015381, + "learning_rate": 7.994913098770665e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8486465811729431, + "num_tokens": 72159985.0, + "step": 1887 + }, + { + "epoch": 0.24017300597888308, + "ewc_loss": 0.022123193368315697, + "ewc_loss_diag": 1.4483928680419922e-05, + "ewc_loss_parallel": 7.657860987819731e-05, + "grad_norm": 3.810126543045044, + "learning_rate": 7.999152183128444e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8373382091522217, + "num_tokens": 72196377.0, + "step": 1888 + }, + { + "epoch": 0.2403002162574736, + "ewc_loss": 0.022138532251119614, + "ewc_loss_diag": 1.4543533325195312e-05, + "ewc_loss_parallel": 7.612165791215375e-05, + "grad_norm": 3.7447474002838135, + "learning_rate": 8.003391267486223e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8556993007659912, + "num_tokens": 72236585.0, + "step": 1889 + }, + { + "epoch": 0.2404274265360641, + "ewc_loss": 0.022132454439997673, + "ewc_loss_diag": 1.4543533325195312e-05, + "ewc_loss_parallel": 7.606086728628725e-05, + "grad_norm": 3.9630463123321533, + "learning_rate": 8.007630351844001e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8504682779312134, + "num_tokens": 72268450.0, + "step": 1890 + }, + { + "epoch": 0.24055463681465464, + "ewc_loss": 0.022250119596719742, + "ewc_loss_diag": 1.4543533325195312e-05, + "ewc_loss_parallel": 7.723753515165299e-05, + "grad_norm": 3.7914936542510986, + "learning_rate": 8.011869436201781e-07, + "loss": 0.4535, + "mean_token_accuracy": 0.8555295467376709, + "num_tokens": 72304092.0, + "step": 1891 + }, + { + "epoch": 0.24068184709324514, + "ewc_loss": 0.02208125963807106, + "ewc_loss_diag": 1.4543533325195312e-05, + "ewc_loss_parallel": 7.55489309085533e-05, + "grad_norm": 3.7438597679138184, + "learning_rate": 8.016108520559559e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8469916582107544, + "num_tokens": 72347312.0, + "step": 1892 + }, + { + "epoch": 0.24080905737183564, + "ewc_loss": 0.022161846980452538, + "ewc_loss_diag": 1.4543533325195312e-05, + "ewc_loss_parallel": 7.635479414602742e-05, + "grad_norm": 3.8645377159118652, + "learning_rate": 8.020347604917338e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8582186698913574, + "num_tokens": 72382135.0, + "step": 1893 + }, + { + "epoch": 0.24093626765042617, + "ewc_loss": 0.022322461009025574, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.674023072468117e-05, + "grad_norm": 3.7570383548736572, + "learning_rate": 8.024586689275116e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8514302372932434, + "num_tokens": 72421526.0, + "step": 1894 + }, + { + "epoch": 0.24106347792901667, + "ewc_loss": 0.02225758694112301, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.609149179188535e-05, + "grad_norm": 3.8234736919403076, + "learning_rate": 8.028825773632894e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8523774147033691, + "num_tokens": 72458592.0, + "step": 1895 + }, + { + "epoch": 0.24119068820760717, + "ewc_loss": 0.02235092595219612, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.702488073846325e-05, + "grad_norm": 3.8091678619384766, + "learning_rate": 8.033064857990674e-07, + "loss": 0.4028, + "mean_token_accuracy": 0.8707325458526611, + "num_tokens": 72494734.0, + "step": 1896 + }, + { + "epoch": 0.2413178984861977, + "ewc_loss": 0.02230537123978138, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.656933303223923e-05, + "grad_norm": 3.7833938598632812, + "learning_rate": 8.037303942348452e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8571087121963501, + "num_tokens": 72531560.0, + "step": 1897 + }, + { + "epoch": 0.2414451087647882, + "ewc_loss": 0.022320358082652092, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.671920320717618e-05, + "grad_norm": 3.772977828979492, + "learning_rate": 8.041543026706231e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8580378293991089, + "num_tokens": 72570144.0, + "step": 1898 + }, + { + "epoch": 0.2415723190433787, + "ewc_loss": 0.022312063723802567, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.663625729037449e-05, + "grad_norm": 3.7628769874572754, + "learning_rate": 8.04578211106401e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8519472479820251, + "num_tokens": 72606738.0, + "step": 1899 + }, + { + "epoch": 0.24169952932196923, + "ewc_loss": 0.02235301584005356, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.704577728873119e-05, + "grad_norm": 3.796264410018921, + "learning_rate": 8.050021195421789e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8411495685577393, + "num_tokens": 72646230.0, + "step": 1900 + }, + { + "epoch": 0.24182673960055973, + "ewc_loss": 0.02235408127307892, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.705644384259358e-05, + "grad_norm": 3.864716053009033, + "learning_rate": 8.054260279779567e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8476386070251465, + "num_tokens": 72681811.0, + "step": 1901 + }, + { + "epoch": 0.24195394987915023, + "ewc_loss": 0.02238091640174389, + "ewc_loss_diag": 1.4662742614746094e-05, + "ewc_loss_parallel": 7.73247957113199e-05, + "grad_norm": 3.7838332653045654, + "learning_rate": 8.058499364137346e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.839593768119812, + "num_tokens": 72721805.0, + "step": 1902 + }, + { + "epoch": 0.24208116015774075, + "ewc_loss": 0.022384049370884895, + "ewc_loss_diag": 1.4722347259521484e-05, + "ewc_loss_parallel": 7.674576045246795e-05, + "grad_norm": 3.762901544570923, + "learning_rate": 8.062738448495124e-07, + "loss": 0.4224, + "mean_token_accuracy": 0.8650730848312378, + "num_tokens": 72763142.0, + "step": 1903 + }, + { + "epoch": 0.24220837043633126, + "ewc_loss": 0.02240208350121975, + "ewc_loss_diag": 1.4722347259521484e-05, + "ewc_loss_parallel": 7.692610961385071e-05, + "grad_norm": 3.776299238204956, + "learning_rate": 8.066977532852904e-07, + "loss": 0.413, + "mean_token_accuracy": 0.8722392320632935, + "num_tokens": 72802047.0, + "step": 1904 + }, + { + "epoch": 0.24233558071492176, + "ewc_loss": 0.022442912682890892, + "ewc_loss_diag": 1.4722347259521484e-05, + "ewc_loss_parallel": 7.733439997537062e-05, + "grad_norm": 3.8525006771087646, + "learning_rate": 8.071216617210682e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8453122973442078, + "num_tokens": 72834470.0, + "step": 1905 + }, + { + "epoch": 0.24246279099351228, + "ewc_loss": 0.022463977336883545, + "ewc_loss_diag": 1.4722347259521484e-05, + "ewc_loss_parallel": 7.754503894830123e-05, + "grad_norm": 3.7853105068206787, + "learning_rate": 8.075455701568461e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8542309403419495, + "num_tokens": 72873101.0, + "step": 1906 + }, + { + "epoch": 0.24259000127210278, + "ewc_loss": 0.022412648424506187, + "ewc_loss_diag": 1.4722347259521484e-05, + "ewc_loss_parallel": 7.703175651840866e-05, + "grad_norm": 3.792772054672241, + "learning_rate": 8.07969478592624e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8620076775550842, + "num_tokens": 72915504.0, + "step": 1907 + }, + { + "epoch": 0.24271721155069328, + "ewc_loss": 0.022439925000071526, + "ewc_loss_diag": 1.4722347259521484e-05, + "ewc_loss_parallel": 7.730452489340678e-05, + "grad_norm": 3.7922708988189697, + "learning_rate": 8.083933870284019e-07, + "loss": 0.4155, + "mean_token_accuracy": 0.8694690465927124, + "num_tokens": 72947179.0, + "step": 1908 + }, + { + "epoch": 0.2428444218292838, + "ewc_loss": 0.022438200190663338, + "ewc_loss_diag": 1.4722347259521484e-05, + "ewc_loss_parallel": 7.728728087386116e-05, + "grad_norm": 3.84047794342041, + "learning_rate": 8.088172954641796e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8554693460464478, + "num_tokens": 72983313.0, + "step": 1909 + }, + { + "epoch": 0.2429716321078743, + "ewc_loss": 0.022540848702192307, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.770341471768916e-05, + "grad_norm": 3.848281145095825, + "learning_rate": 8.092412038999576e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8417956829071045, + "num_tokens": 73014218.0, + "step": 1910 + }, + { + "epoch": 0.24309884238646481, + "ewc_loss": 0.022541001439094543, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.770493539283052e-05, + "grad_norm": 3.98100209236145, + "learning_rate": 8.096651123357354e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8349336385726929, + "num_tokens": 73048282.0, + "step": 1911 + }, + { + "epoch": 0.24322605266505534, + "ewc_loss": 0.022594723850488663, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.824214844731614e-05, + "grad_norm": 3.8378913402557373, + "learning_rate": 8.100890207715134e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8415870666503906, + "num_tokens": 73084390.0, + "step": 1912 + }, + { + "epoch": 0.24335326294364584, + "ewc_loss": 0.022500742226839066, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.730233483016491e-05, + "grad_norm": 3.8923180103302, + "learning_rate": 8.105129292072912e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8513837456703186, + "num_tokens": 73123296.0, + "step": 1913 + }, + { + "epoch": 0.24348047322223634, + "ewc_loss": 0.022572748363018036, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.802240725141019e-05, + "grad_norm": 3.838848352432251, + "learning_rate": 8.10936837643069e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8393422961235046, + "num_tokens": 73157533.0, + "step": 1914 + }, + { + "epoch": 0.24360768350082687, + "ewc_loss": 0.022527003660798073, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.756496052024886e-05, + "grad_norm": 3.9137797355651855, + "learning_rate": 8.11360746078847e-07, + "loss": 0.447, + "mean_token_accuracy": 0.8569564819335938, + "num_tokens": 73195314.0, + "step": 1915 + }, + { + "epoch": 0.24373489377941737, + "ewc_loss": 0.02257573790848255, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.805230416124687e-05, + "grad_norm": 3.895547389984131, + "learning_rate": 8.117846545146248e-07, + "loss": 0.416, + "mean_token_accuracy": 0.864851713180542, + "num_tokens": 73236763.0, + "step": 1916 + }, + { + "epoch": 0.2438621040580079, + "ewc_loss": 0.02259107120335102, + "ewc_loss_diag": 1.4841556549072266e-05, + "ewc_loss_parallel": 7.759527943562716e-05, + "grad_norm": 3.9375851154327393, + "learning_rate": 8.122085629504026e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8379430770874023, + "num_tokens": 73276719.0, + "step": 1917 + }, + { + "epoch": 0.2439893143365984, + "ewc_loss": 0.022657610476017, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.765031477902085e-05, + "grad_norm": 3.7000484466552734, + "learning_rate": 8.126324713861805e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8533992171287537, + "num_tokens": 73318594.0, + "step": 1918 + }, + { + "epoch": 0.2441165246151889, + "ewc_loss": 0.02246759831905365, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.697090768488124e-05, + "grad_norm": 3.8424510955810547, + "learning_rate": 8.130563798219584e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.8602855205535889, + "num_tokens": 73358760.0, + "step": 1919 + }, + { + "epoch": 0.24424373489377943, + "ewc_loss": 0.02273010089993477, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.837523298803717e-05, + "grad_norm": 3.818459987640381, + "learning_rate": 8.134802882577363e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8499354124069214, + "num_tokens": 73398623.0, + "step": 1920 + }, + { + "epoch": 0.24437094517236993, + "ewc_loss": 0.02265099808573723, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.758420542813838e-05, + "grad_norm": 3.7556498050689697, + "learning_rate": 8.139041966935142e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8523268699645996, + "num_tokens": 73446369.0, + "step": 1921 + }, + { + "epoch": 0.24449815545096043, + "ewc_loss": 0.022670140489935875, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.777562132105231e-05, + "grad_norm": 3.8905582427978516, + "learning_rate": 8.14328105129292e-07, + "loss": 0.5304, + "mean_token_accuracy": 0.83759605884552, + "num_tokens": 73483152.0, + "step": 1922 + }, + { + "epoch": 0.24462536572955096, + "ewc_loss": 0.02263273485004902, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.862226630095392e-05, + "grad_norm": 3.860212802886963, + "learning_rate": 8.1475201356507e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.847587525844574, + "num_tokens": 73519179.0, + "step": 1923 + }, + { + "epoch": 0.24475257600814146, + "ewc_loss": 0.022543825209140778, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.773316610837355e-05, + "grad_norm": 3.78151535987854, + "learning_rate": 8.151759220008477e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.854061484336853, + "num_tokens": 73561947.0, + "step": 1924 + }, + { + "epoch": 0.24487978628673196, + "ewc_loss": 0.022541385143995285, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.770876254653558e-05, + "grad_norm": 3.866901159286499, + "learning_rate": 8.155998304366256e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8545148372650146, + "num_tokens": 73595841.0, + "step": 1925 + }, + { + "epoch": 0.2450069965653225, + "ewc_loss": 0.02260092832148075, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.830420508980751e-05, + "grad_norm": 3.876708984375, + "learning_rate": 8.160237388724035e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8413353562355042, + "num_tokens": 73629716.0, + "step": 1926 + }, + { + "epoch": 0.245134206843913, + "ewc_loss": 0.022567667067050934, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.797158468747512e-05, + "grad_norm": 3.7917537689208984, + "learning_rate": 8.164476473081814e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8483204245567322, + "num_tokens": 73667285.0, + "step": 1927 + }, + { + "epoch": 0.2452614171225035, + "ewc_loss": 0.022553371265530586, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.782863394822925e-05, + "grad_norm": 3.7573673725128174, + "learning_rate": 8.168715557439593e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8638657331466675, + "num_tokens": 73708059.0, + "step": 1928 + }, + { + "epoch": 0.24538862740109402, + "ewc_loss": 0.022586451843380928, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.815944263711572e-05, + "grad_norm": 3.7556748390197754, + "learning_rate": 8.172954641797372e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8586247563362122, + "num_tokens": 73748553.0, + "step": 1929 + }, + { + "epoch": 0.24551583767968452, + "ewc_loss": 0.022607695311307907, + "ewc_loss_diag": 1.4781951904296875e-05, + "ewc_loss_parallel": 7.837187877157703e-05, + "grad_norm": 3.8017446994781494, + "learning_rate": 8.17719372615515e-07, + "loss": 0.5411, + "mean_token_accuracy": 0.8297927379608154, + "num_tokens": 73796295.0, + "step": 1930 + }, + { + "epoch": 0.24564304795827502, + "ewc_loss": 0.02273063361644745, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.838054443709552e-05, + "grad_norm": 3.866206169128418, + "learning_rate": 8.18143281051293e-07, + "loss": 0.52, + "mean_token_accuracy": 0.8377769589424133, + "num_tokens": 73835960.0, + "step": 1931 + }, + { + "epoch": 0.24577025823686555, + "ewc_loss": 0.02275955304503441, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.86697564763017e-05, + "grad_norm": 3.8431732654571533, + "learning_rate": 8.185671894870707e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.8322053551673889, + "num_tokens": 73875302.0, + "step": 1932 + }, + { + "epoch": 0.24589746851545605, + "ewc_loss": 0.022766374051570892, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.873796857893467e-05, + "grad_norm": 3.884608745574951, + "learning_rate": 8.189910979228485e-07, + "loss": 0.4282, + "mean_token_accuracy": 0.86211097240448, + "num_tokens": 73907703.0, + "step": 1933 + }, + { + "epoch": 0.24602467879404655, + "ewc_loss": 0.02275829389691353, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.865716179367155e-05, + "grad_norm": 3.8565003871917725, + "learning_rate": 8.194150063586265e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8378616571426392, + "num_tokens": 73945449.0, + "step": 1934 + }, + { + "epoch": 0.24615188907263708, + "ewc_loss": 0.022750195115804672, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.857615855755284e-05, + "grad_norm": 3.8407299518585205, + "learning_rate": 8.198389147944043e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.836538553237915, + "num_tokens": 73982461.0, + "step": 1935 + }, + { + "epoch": 0.24627909935122758, + "ewc_loss": 0.022763250395655632, + "ewc_loss_diag": 1.4901161193847656e-05, + "ewc_loss_parallel": 7.870671834098175e-05, + "grad_norm": 3.83207368850708, + "learning_rate": 8.202628232301823e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8508065938949585, + "num_tokens": 74018252.0, + "step": 1936 + }, + { + "epoch": 0.24640630962981808, + "ewc_loss": 0.02282807044684887, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.874457514844835e-05, + "grad_norm": 3.7833242416381836, + "learning_rate": 8.206867316659601e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8380520939826965, + "num_tokens": 74065843.0, + "step": 1937 + }, + { + "epoch": 0.2465335199084086, + "ewc_loss": 0.022802229970693588, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.848617678973824e-05, + "grad_norm": 3.7664358615875244, + "learning_rate": 8.21110640101738e-07, + "loss": 0.3951, + "mean_token_accuracy": 0.8714354038238525, + "num_tokens": 74106389.0, + "step": 1938 + }, + { + "epoch": 0.2466607301869991, + "ewc_loss": 0.022833233699202538, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.87962053436786e-05, + "grad_norm": 3.8387320041656494, + "learning_rate": 8.215345485375159e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.8378980159759521, + "num_tokens": 74150786.0, + "step": 1939 + }, + { + "epoch": 0.2467879404655896, + "ewc_loss": 0.022845350205898285, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.891737186582759e-05, + "grad_norm": 3.8223490715026855, + "learning_rate": 8.219584569732937e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8434364795684814, + "num_tokens": 74190474.0, + "step": 1940 + }, + { + "epoch": 0.24691515074418013, + "ewc_loss": 0.022846650332212448, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.893038127804175e-05, + "grad_norm": 3.814723014831543, + "learning_rate": 8.223823654090715e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8623738288879395, + "num_tokens": 74226437.0, + "step": 1941 + }, + { + "epoch": 0.24704236102277063, + "ewc_loss": 0.022838382050395012, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.884769001975656e-05, + "grad_norm": 3.846867561340332, + "learning_rate": 8.228062738448495e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8507394790649414, + "num_tokens": 74267292.0, + "step": 1942 + }, + { + "epoch": 0.24716957130136116, + "ewc_loss": 0.022876407951116562, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.92279388406314e-05, + "grad_norm": 3.8030402660369873, + "learning_rate": 8.232301822806273e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8455667495727539, + "num_tokens": 74306738.0, + "step": 1943 + }, + { + "epoch": 0.24729678157995166, + "ewc_loss": 0.02285129204392433, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.897678733570501e-05, + "grad_norm": 3.899362564086914, + "learning_rate": 8.236540907164053e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8502790927886963, + "num_tokens": 74343151.0, + "step": 1944 + }, + { + "epoch": 0.24742399185854216, + "ewc_loss": 0.022934652864933014, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.981040107551962e-05, + "grad_norm": 3.836226224899292, + "learning_rate": 8.240779991521831e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8544943332672119, + "num_tokens": 74382755.0, + "step": 1945 + }, + { + "epoch": 0.2475512021371327, + "ewc_loss": 0.022855937480926514, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.902322977315634e-05, + "grad_norm": 3.8225467205047607, + "learning_rate": 8.24501907587961e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8453167676925659, + "num_tokens": 74426129.0, + "step": 1946 + }, + { + "epoch": 0.2476784124157232, + "ewc_loss": 0.02287963405251503, + "ewc_loss_diag": 1.4960765838623047e-05, + "ewc_loss_parallel": 7.926020043669268e-05, + "grad_norm": 3.8534069061279297, + "learning_rate": 8.249258160237388e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8462616205215454, + "num_tokens": 74460904.0, + "step": 1947 + }, + { + "epoch": 0.2478056226943137, + "ewc_loss": 0.022962763905525208, + "ewc_loss_diag": 1.5020370483398438e-05, + "ewc_loss_parallel": 7.948115671752021e-05, + "grad_norm": 3.9105513095855713, + "learning_rate": 8.253497244595167e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.844873309135437, + "num_tokens": 74497993.0, + "step": 1948 + }, + { + "epoch": 0.24793283297290422, + "ewc_loss": 0.023047391325235367, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 7.971707236720249e-05, + "grad_norm": 3.8157341480255127, + "learning_rate": 8.257736328952945e-07, + "loss": 0.5106, + "mean_token_accuracy": 0.8393830060958862, + "num_tokens": 74541108.0, + "step": 1949 + }, + { + "epoch": 0.24806004325149472, + "ewc_loss": 0.022996488958597183, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 7.920804637251422e-05, + "grad_norm": 3.827604293823242, + "learning_rate": 8.261975413310725e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8498327732086182, + "num_tokens": 74579389.0, + "step": 1950 + }, + { + "epoch": 0.24818725353008522, + "ewc_loss": 0.02304643951356411, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 7.970755541464314e-05, + "grad_norm": 3.7836287021636963, + "learning_rate": 8.266214497668503e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8641636371612549, + "num_tokens": 74621546.0, + "step": 1951 + }, + { + "epoch": 0.24831446380867575, + "ewc_loss": 0.023035243153572083, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 7.959558570291847e-05, + "grad_norm": 3.841139793395996, + "learning_rate": 8.270453582026283e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.836755633354187, + "num_tokens": 74660178.0, + "step": 1952 + }, + { + "epoch": 0.24844167408726625, + "ewc_loss": 0.023081032559275627, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 8.005349081940949e-05, + "grad_norm": 3.917402505874634, + "learning_rate": 8.274692666384061e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8543962240219116, + "num_tokens": 74693867.0, + "step": 1953 + }, + { + "epoch": 0.24856888436585675, + "ewc_loss": 0.023092778399586678, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 8.017094660317525e-05, + "grad_norm": 3.843522548675537, + "learning_rate": 8.27893175074184e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.848576545715332, + "num_tokens": 74727561.0, + "step": 1954 + }, + { + "epoch": 0.24869609464444728, + "ewc_loss": 0.023029375821352005, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 7.953693420859054e-05, + "grad_norm": 3.9068071842193604, + "learning_rate": 8.283170835099618e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8500717878341675, + "num_tokens": 74763719.0, + "step": 1955 + }, + { + "epoch": 0.24882330492303778, + "ewc_loss": 0.023113079369068146, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 8.037394582061097e-05, + "grad_norm": 3.8322365283966064, + "learning_rate": 8.287409919457396e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8596119284629822, + "num_tokens": 74802666.0, + "step": 1956 + }, + { + "epoch": 0.24895051520162828, + "ewc_loss": 0.02305535227060318, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 7.97966931713745e-05, + "grad_norm": 3.8839147090911865, + "learning_rate": 8.291649003815175e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8454372882843018, + "num_tokens": 74840307.0, + "step": 1957 + }, + { + "epoch": 0.2490777254802188, + "ewc_loss": 0.023122156038880348, + "ewc_loss_diag": 1.5079975128173828e-05, + "ewc_loss_parallel": 8.046472066780552e-05, + "grad_norm": 3.818283796310425, + "learning_rate": 8.295888088172954e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8439911603927612, + "num_tokens": 74880181.0, + "step": 1958 + }, + { + "epoch": 0.2492049357588093, + "ewc_loss": 0.023125095292925835, + "ewc_loss_diag": 1.5139579772949219e-05, + "ewc_loss_parallel": 7.988376455614343e-05, + "grad_norm": 3.8076562881469727, + "learning_rate": 8.300127172530733e-07, + "loss": 0.4099, + "mean_token_accuracy": 0.867239236831665, + "num_tokens": 74917148.0, + "step": 1959 + }, + { + "epoch": 0.2493321460373998, + "ewc_loss": 0.023167811334133148, + "ewc_loss_diag": 1.5139579772949219e-05, + "ewc_loss_parallel": 8.03109141997993e-05, + "grad_norm": 3.941293478012085, + "learning_rate": 8.304366256888512e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8561306595802307, + "num_tokens": 74953124.0, + "step": 1960 + }, + { + "epoch": 0.24945935631599034, + "ewc_loss": 0.023216385394334793, + "ewc_loss_diag": 1.5139579772949219e-05, + "ewc_loss_parallel": 8.07966644060798e-05, + "grad_norm": 3.8873746395111084, + "learning_rate": 8.308605341246291e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8537195324897766, + "num_tokens": 74989726.0, + "step": 1961 + }, + { + "epoch": 0.24958656659458084, + "ewc_loss": 0.023152433335781097, + "ewc_loss_diag": 1.5139579772949219e-05, + "ewc_loss_parallel": 8.015715138753876e-05, + "grad_norm": 3.899245023727417, + "learning_rate": 8.312844425604068e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.858599066734314, + "num_tokens": 75024849.0, + "step": 1962 + }, + { + "epoch": 0.24971377687317134, + "ewc_loss": 0.02323095127940178, + "ewc_loss_diag": 1.519918441772461e-05, + "ewc_loss_parallel": 8.033198537304997e-05, + "grad_norm": 3.8050172328948975, + "learning_rate": 8.317083509961848e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.847678542137146, + "num_tokens": 75066464.0, + "step": 1963 + }, + { + "epoch": 0.24984098715176187, + "ewc_loss": 0.023203548043966293, + "ewc_loss_diag": 1.519918441772461e-05, + "ewc_loss_parallel": 8.005794370546937e-05, + "grad_norm": 3.835200071334839, + "learning_rate": 8.321322594319626e-07, + "loss": 0.441, + "mean_token_accuracy": 0.858888566493988, + "num_tokens": 75106829.0, + "step": 1964 + }, + { + "epoch": 0.24996819743035237, + "ewc_loss": 0.02325493097305298, + "ewc_loss_diag": 1.519918441772461e-05, + "ewc_loss_parallel": 8.057177910814062e-05, + "grad_norm": 3.8490889072418213, + "learning_rate": 8.325561678677405e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8453990817070007, + "num_tokens": 75148227.0, + "step": 1965 + }, + { + "epoch": 0.2500954077089429, + "ewc_loss": 0.023256510496139526, + "ewc_loss_diag": 1.519918441772461e-05, + "ewc_loss_parallel": 8.05875679361634e-05, + "grad_norm": 3.9114062786102295, + "learning_rate": 8.329800763035184e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8384888172149658, + "num_tokens": 75188719.0, + "step": 1966 + }, + { + "epoch": 0.25022261798753337, + "ewc_loss": 0.023319648578763008, + "ewc_loss_diag": 1.52587890625e-05, + "ewc_loss_parallel": 8.060859545366839e-05, + "grad_norm": 3.910067558288574, + "learning_rate": 8.334039847392963e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.852056086063385, + "num_tokens": 75222260.0, + "step": 1967 + }, + { + "epoch": 0.2503498282661239, + "ewc_loss": 0.023313838988542557, + "ewc_loss_diag": 1.52587890625e-05, + "ewc_loss_parallel": 8.055049693211913e-05, + "grad_norm": 3.8710241317749023, + "learning_rate": 8.338278931750742e-07, + "loss": 0.4341, + "mean_token_accuracy": 0.8620872497558594, + "num_tokens": 75255489.0, + "step": 1968 + }, + { + "epoch": 0.2504770385447144, + "ewc_loss": 0.023302186280488968, + "ewc_loss_diag": 1.52587890625e-05, + "ewc_loss_parallel": 8.043397247092798e-05, + "grad_norm": 3.925870418548584, + "learning_rate": 8.342518016108521e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8509967923164368, + "num_tokens": 75290548.0, + "step": 1969 + }, + { + "epoch": 0.2506042488233049, + "ewc_loss": 0.023338571190834045, + "ewc_loss_diag": 1.52587890625e-05, + "ewc_loss_parallel": 8.079782128334045e-05, + "grad_norm": 3.9495625495910645, + "learning_rate": 8.346757100466298e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.8551530838012695, + "num_tokens": 75323912.0, + "step": 1970 + }, + { + "epoch": 0.2507314591018954, + "ewc_loss": 0.02346620336174965, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.085344597930089e-05, + "grad_norm": 4.0873517990112305, + "learning_rate": 8.350996184824078e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8389334082603455, + "num_tokens": 75361206.0, + "step": 1971 + }, + { + "epoch": 0.25085866938048595, + "ewc_loss": 0.023501718416810036, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.12085927464068e-05, + "grad_norm": 3.8558146953582764, + "learning_rate": 8.355235269181856e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8461287617683411, + "num_tokens": 75397292.0, + "step": 1972 + }, + { + "epoch": 0.2509858796590764, + "ewc_loss": 0.02336481586098671, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 7.98395776655525e-05, + "grad_norm": 3.8608171939849854, + "learning_rate": 8.359474353539635e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8603405356407166, + "num_tokens": 75437616.0, + "step": 1973 + }, + { + "epoch": 0.25111308993766696, + "ewc_loss": 0.023460552096366882, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.079692634055391e-05, + "grad_norm": 3.7855446338653564, + "learning_rate": 8.363713437897414e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8346468210220337, + "num_tokens": 75486172.0, + "step": 1974 + }, + { + "epoch": 0.2512403002162575, + "ewc_loss": 0.023414433002471924, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.033573976717889e-05, + "grad_norm": 3.881464719772339, + "learning_rate": 8.367952522255193e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.848874568939209, + "num_tokens": 75526637.0, + "step": 1975 + }, + { + "epoch": 0.25136751049484796, + "ewc_loss": 0.023627227172255516, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.124297892209142e-05, + "grad_norm": 3.881904125213623, + "learning_rate": 8.372191606612972e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8536727428436279, + "num_tokens": 75564632.0, + "step": 1976 + }, + { + "epoch": 0.2514947207734385, + "ewc_loss": 0.023574311286211014, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.071382762864232e-05, + "grad_norm": 3.924558401107788, + "learning_rate": 8.376430690970749e-07, + "loss": 0.436, + "mean_token_accuracy": 0.862586498260498, + "num_tokens": 75595553.0, + "step": 1977 + }, + { + "epoch": 0.251621931052029, + "ewc_loss": 0.0235319584608078, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.151098154485226e-05, + "grad_norm": 3.89420747756958, + "learning_rate": 8.380669775328528e-07, + "loss": 0.431, + "mean_token_accuracy": 0.863961935043335, + "num_tokens": 75635847.0, + "step": 1978 + }, + { + "epoch": 0.25174914133061954, + "ewc_loss": 0.023452278226614, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.071419142652303e-05, + "grad_norm": 3.823275089263916, + "learning_rate": 8.384908859686307e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8654493093490601, + "num_tokens": 75677403.0, + "step": 1979 + }, + { + "epoch": 0.25187635160921, + "ewc_loss": 0.023479443043470383, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.098583202809095e-05, + "grad_norm": 3.8317062854766846, + "learning_rate": 8.389147944044086e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8511918783187866, + "num_tokens": 75720703.0, + "step": 1980 + }, + { + "epoch": 0.25200356188780054, + "ewc_loss": 0.023497052490711212, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.116191747831181e-05, + "grad_norm": 3.8921515941619873, + "learning_rate": 8.393387028401864e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.8443769812583923, + "num_tokens": 75758000.0, + "step": 1981 + }, + { + "epoch": 0.25213077216639107, + "ewc_loss": 0.0235444288700819, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.163569145835936e-05, + "grad_norm": 3.9525883197784424, + "learning_rate": 8.397626112759644e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.837352454662323, + "num_tokens": 75791113.0, + "step": 1982 + }, + { + "epoch": 0.25225798244498154, + "ewc_loss": 0.023526331409811974, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.145471656462178e-05, + "grad_norm": 3.950086832046509, + "learning_rate": 8.401865197117422e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8527724146842957, + "num_tokens": 75821733.0, + "step": 1983 + }, + { + "epoch": 0.25238519272357207, + "ewc_loss": 0.023671090602874756, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.168161730282009e-05, + "grad_norm": 3.8137693405151367, + "learning_rate": 8.406104281475202e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8570010662078857, + "num_tokens": 75861404.0, + "step": 1984 + }, + { + "epoch": 0.2525124030021626, + "ewc_loss": 0.023495644330978394, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.114784577628598e-05, + "grad_norm": 3.7882189750671387, + "learning_rate": 8.410343365832979e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8569504618644714, + "num_tokens": 75903059.0, + "step": 1985 + }, + { + "epoch": 0.2526396132807531, + "ewc_loss": 0.023553846403956413, + "ewc_loss_diag": 1.537799835205078e-05, + "ewc_loss_parallel": 8.172987145371735e-05, + "grad_norm": 3.851137399673462, + "learning_rate": 8.414582450190758e-07, + "loss": 0.458, + "mean_token_accuracy": 0.855209469795227, + "num_tokens": 75947125.0, + "step": 1986 + }, + { + "epoch": 0.2527668235593436, + "ewc_loss": 0.023684343323111534, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.181413431884721e-05, + "grad_norm": 3.878579616546631, + "learning_rate": 8.418821534548537e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8303992748260498, + "num_tokens": 75987519.0, + "step": 1987 + }, + { + "epoch": 0.25289403383793413, + "ewc_loss": 0.023690808564424515, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.187879575416446e-05, + "grad_norm": 3.9285149574279785, + "learning_rate": 8.423060618906316e-07, + "loss": 0.4685, + "mean_token_accuracy": 0.8520321846008301, + "num_tokens": 76027728.0, + "step": 1988 + }, + { + "epoch": 0.2530212441165246, + "ewc_loss": 0.023699898272752762, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.196968701668084e-05, + "grad_norm": 3.880500555038452, + "learning_rate": 8.427299703264095e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8538944721221924, + "num_tokens": 76070942.0, + "step": 1989 + }, + { + "epoch": 0.25314845439511513, + "ewc_loss": 0.02365463227033615, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.151703514158726e-05, + "grad_norm": 3.8332982063293457, + "learning_rate": 8.431538787621874e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8458852767944336, + "num_tokens": 76113330.0, + "step": 1990 + }, + { + "epoch": 0.25327566467370566, + "ewc_loss": 0.023668885231018066, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.165955659933388e-05, + "grad_norm": 3.8336598873138428, + "learning_rate": 8.435777871979652e-07, + "loss": 0.4463, + "mean_token_accuracy": 0.859032392501831, + "num_tokens": 76153511.0, + "step": 1991 + }, + { + "epoch": 0.25340287495229613, + "ewc_loss": 0.023678898811340332, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.175968832802027e-05, + "grad_norm": 3.8937525749206543, + "learning_rate": 8.440016956337432e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8524698615074158, + "num_tokens": 76191505.0, + "step": 1992 + }, + { + "epoch": 0.25353008523088666, + "ewc_loss": 0.023731598630547523, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.228668593801558e-05, + "grad_norm": 3.923854351043701, + "learning_rate": 8.444256040695209e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8474358320236206, + "num_tokens": 76232479.0, + "step": 1993 + }, + { + "epoch": 0.2536572955094772, + "ewc_loss": 0.023724976927042007, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.22204674477689e-05, + "grad_norm": 3.869058609008789, + "learning_rate": 8.448495125052988e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8496878147125244, + "num_tokens": 76270192.0, + "step": 1994 + }, + { + "epoch": 0.25378450578806766, + "ewc_loss": 0.02370603010058403, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.203100878745317e-05, + "grad_norm": 3.903386354446411, + "learning_rate": 8.452734209410767e-07, + "loss": 0.4212, + "mean_token_accuracy": 0.8667837381362915, + "num_tokens": 76303981.0, + "step": 1995 + }, + { + "epoch": 0.2539117160666582, + "ewc_loss": 0.023734353482723236, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.231424726545811e-05, + "grad_norm": 3.894688844680786, + "learning_rate": 8.456973293768545e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.857150673866272, + "num_tokens": 76338496.0, + "step": 1996 + }, + { + "epoch": 0.2540389263452487, + "ewc_loss": 0.023721152916550636, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.218223229050636e-05, + "grad_norm": 3.8737146854400635, + "learning_rate": 8.461212378126325e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8433953523635864, + "num_tokens": 76377904.0, + "step": 1997 + }, + { + "epoch": 0.2541661366238392, + "ewc_loss": 0.023741867393255234, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.238937152782455e-05, + "grad_norm": 3.9114768505096436, + "learning_rate": 8.465451462484103e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8375139236450195, + "num_tokens": 76413905.0, + "step": 1998 + }, + { + "epoch": 0.2542933469024297, + "ewc_loss": 0.023755498230457306, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.252567204181105e-05, + "grad_norm": 3.8916208744049072, + "learning_rate": 8.469690546841882e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8504961729049683, + "num_tokens": 76451591.0, + "step": 1999 + }, + { + "epoch": 0.25442055718102025, + "ewc_loss": 0.023746371269226074, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.243442425737157e-05, + "grad_norm": 3.9475855827331543, + "learning_rate": 8.47392963119966e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.856338381767273, + "num_tokens": 76486191.0, + "step": 2000 + }, + { + "epoch": 0.2545477674596107, + "ewc_loss": 0.023792585358023643, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.289655670523643e-05, + "grad_norm": 3.8769540786743164, + "learning_rate": 8.478168715557439e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8532198071479797, + "num_tokens": 76523317.0, + "step": 2001 + }, + { + "epoch": 0.25467497773820125, + "ewc_loss": 0.023744547739624977, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.241618343163282e-05, + "grad_norm": 3.872685194015503, + "learning_rate": 8.482407799915217e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.848467230796814, + "num_tokens": 76562661.0, + "step": 2002 + }, + { + "epoch": 0.2548021880167918, + "ewc_loss": 0.02378099039196968, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.278061432065442e-05, + "grad_norm": 3.8684656620025635, + "learning_rate": 8.486646884272997e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8490771055221558, + "num_tokens": 76600914.0, + "step": 2003 + }, + { + "epoch": 0.25492939829538225, + "ewc_loss": 0.023772401735186577, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.269472164101899e-05, + "grad_norm": 3.907377004623413, + "learning_rate": 8.490885968630775e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8426515460014343, + "num_tokens": 76636310.0, + "step": 2004 + }, + { + "epoch": 0.2550566085739728, + "ewc_loss": 0.023834092542529106, + "ewc_loss_diag": 1.5497207641601562e-05, + "ewc_loss_parallel": 8.331162825925276e-05, + "grad_norm": 3.8764421939849854, + "learning_rate": 8.495125052988555e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8525137901306152, + "num_tokens": 76680889.0, + "step": 2005 + }, + { + "epoch": 0.2551838188525633, + "ewc_loss": 0.023926299065351486, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.301300113089383e-05, + "grad_norm": 3.8609254360198975, + "learning_rate": 8.499364137346333e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8556918501853943, + "num_tokens": 76722469.0, + "step": 2006 + }, + { + "epoch": 0.2553110291311538, + "ewc_loss": 0.023947738111019135, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.322739449795336e-05, + "grad_norm": 3.944754123687744, + "learning_rate": 8.503603221704112e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8401466012001038, + "num_tokens": 76757668.0, + "step": 2007 + }, + { + "epoch": 0.2554382394097443, + "ewc_loss": 0.023999560624361038, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.374560275115073e-05, + "grad_norm": 3.924877882003784, + "learning_rate": 8.50784230606189e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8533592820167542, + "num_tokens": 76795607.0, + "step": 2008 + }, + { + "epoch": 0.25556544968833483, + "ewc_loss": 0.02394871413707733, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.323714428115636e-05, + "grad_norm": 3.8575515747070312, + "learning_rate": 8.512081390419669e-07, + "loss": 0.4938, + "mean_token_accuracy": 0.8523890972137451, + "num_tokens": 76837161.0, + "step": 2009 + }, + { + "epoch": 0.2556926599669253, + "ewc_loss": 0.023964393883943558, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.339394844369963e-05, + "grad_norm": 3.9779980182647705, + "learning_rate": 8.516320474777447e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.8517804741859436, + "num_tokens": 76872198.0, + "step": 2010 + }, + { + "epoch": 0.25581987024551583, + "ewc_loss": 0.024038121104240417, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.41312066768296e-05, + "grad_norm": 3.907585859298706, + "learning_rate": 8.520559559135227e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8631695508956909, + "num_tokens": 76908976.0, + "step": 2011 + }, + { + "epoch": 0.25594708052410636, + "ewc_loss": 0.023951727896928787, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.326728129759431e-05, + "grad_norm": 3.9296281337738037, + "learning_rate": 8.524798643493005e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8495268821716309, + "num_tokens": 76947298.0, + "step": 2012 + }, + { + "epoch": 0.25607429080269684, + "ewc_loss": 0.024006441235542297, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.381442603422329e-05, + "grad_norm": 3.874650239944458, + "learning_rate": 8.529037727850785e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.857470691204071, + "num_tokens": 76985112.0, + "step": 2013 + }, + { + "epoch": 0.25620150108128736, + "ewc_loss": 0.023975780233740807, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.350780262844637e-05, + "grad_norm": 3.933424234390259, + "learning_rate": 8.533276812208563e-07, + "loss": 0.4195, + "mean_token_accuracy": 0.8670137524604797, + "num_tokens": 77020309.0, + "step": 2014 + }, + { + "epoch": 0.2563287113598779, + "ewc_loss": 0.0240123700350523, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.387369598494843e-05, + "grad_norm": 3.9349539279937744, + "learning_rate": 8.53751589656634e-07, + "loss": 0.4589, + "mean_token_accuracy": 0.8526037931442261, + "num_tokens": 77063245.0, + "step": 2015 + }, + { + "epoch": 0.25645592163846836, + "ewc_loss": 0.023976607248187065, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.351606811629608e-05, + "grad_norm": 3.979527711868286, + "learning_rate": 8.54175498092412e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8556714057922363, + "num_tokens": 77096791.0, + "step": 2016 + }, + { + "epoch": 0.2565831319170589, + "ewc_loss": 0.02403157763183117, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.406577399000525e-05, + "grad_norm": 4.0087785720825195, + "learning_rate": 8.545994065281898e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.8635515570640564, + "num_tokens": 77129281.0, + "step": 2017 + }, + { + "epoch": 0.2567103421956494, + "ewc_loss": 0.024013418704271317, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.388418791582808e-05, + "grad_norm": 3.970932960510254, + "learning_rate": 8.550233149639677e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8342037200927734, + "num_tokens": 77168785.0, + "step": 2018 + }, + { + "epoch": 0.2568375524742399, + "ewc_loss": 0.02398252673447132, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.357526530744508e-05, + "grad_norm": 3.9301674365997314, + "learning_rate": 8.554472233997456e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8419609069824219, + "num_tokens": 77208360.0, + "step": 2019 + }, + { + "epoch": 0.2569647627528304, + "ewc_loss": 0.023984085768461227, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.359085768461227e-05, + "grad_norm": 4.063159465789795, + "learning_rate": 8.558711318355235e-07, + "loss": 0.5237, + "mean_token_accuracy": 0.8328888416290283, + "num_tokens": 77245539.0, + "step": 2020 + }, + { + "epoch": 0.25709197303142095, + "ewc_loss": 0.024029940366744995, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.404941036133096e-05, + "grad_norm": 3.9320950508117676, + "learning_rate": 8.562950402713014e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.8696413040161133, + "num_tokens": 77279118.0, + "step": 2021 + }, + { + "epoch": 0.2572191833100114, + "ewc_loss": 0.02395300753414631, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.328007970703766e-05, + "grad_norm": 3.942549228668213, + "learning_rate": 8.567189487070793e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8417133688926697, + "num_tokens": 77316259.0, + "step": 2022 + }, + { + "epoch": 0.25734639358860195, + "ewc_loss": 0.024022886529564857, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.397886267630383e-05, + "grad_norm": 3.918466329574585, + "learning_rate": 8.57142857142857e-07, + "loss": 0.53, + "mean_token_accuracy": 0.8351626396179199, + "num_tokens": 77359712.0, + "step": 2023 + }, + { + "epoch": 0.2574736038671925, + "ewc_loss": 0.02399113029241562, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.36613035062328e-05, + "grad_norm": 3.8751626014709473, + "learning_rate": 8.57566765578635e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8651958703994751, + "num_tokens": 77402269.0, + "step": 2024 + }, + { + "epoch": 0.25760081414578295, + "ewc_loss": 0.02400570549070835, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.380705548916012e-05, + "grad_norm": 3.883596181869507, + "learning_rate": 8.579906740144128e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8548998832702637, + "num_tokens": 77445791.0, + "step": 2025 + }, + { + "epoch": 0.2577280244243735, + "ewc_loss": 0.02401939406991005, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.394393807975575e-05, + "grad_norm": 3.9136016368865967, + "learning_rate": 8.584145824501907e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8408817648887634, + "num_tokens": 77485376.0, + "step": 2026 + }, + { + "epoch": 0.257855234702964, + "ewc_loss": 0.024049101397395134, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.424101542914286e-05, + "grad_norm": 3.9364283084869385, + "learning_rate": 8.588384908859686e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8476465344429016, + "num_tokens": 77524344.0, + "step": 2027 + }, + { + "epoch": 0.25798244498155454, + "ewc_loss": 0.024031788110733032, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.406788401771337e-05, + "grad_norm": 3.894199848175049, + "learning_rate": 8.592623993217465e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8501753807067871, + "num_tokens": 77562746.0, + "step": 2028 + }, + { + "epoch": 0.258109655260145, + "ewc_loss": 0.024043824523687363, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.41882501845248e-05, + "grad_norm": 3.9826853275299072, + "learning_rate": 8.596863077575244e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8439416885375977, + "num_tokens": 77601261.0, + "step": 2029 + }, + { + "epoch": 0.25823686553873554, + "ewc_loss": 0.024088595062494278, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.463594713248312e-05, + "grad_norm": 3.9222233295440674, + "learning_rate": 8.601102161933023e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.846996009349823, + "num_tokens": 77640054.0, + "step": 2030 + }, + { + "epoch": 0.25836407581732607, + "ewc_loss": 0.02404879406094551, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.423795225098729e-05, + "grad_norm": 3.9581148624420166, + "learning_rate": 8.6053412462908e-07, + "loss": 0.4637, + "mean_token_accuracy": 0.8539519906044006, + "num_tokens": 77680098.0, + "step": 2031 + }, + { + "epoch": 0.25849128609591654, + "ewc_loss": 0.024097785353660583, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.47278643050231e-05, + "grad_norm": 4.152117729187012, + "learning_rate": 8.60958033064858e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.85760498046875, + "num_tokens": 77712517.0, + "step": 2032 + }, + { + "epoch": 0.25861849637450707, + "ewc_loss": 0.024155860766768456, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.530860941391438e-05, + "grad_norm": 3.9299793243408203, + "learning_rate": 8.613819415006358e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8434256315231323, + "num_tokens": 77751335.0, + "step": 2033 + }, + { + "epoch": 0.2587457066530976, + "ewc_loss": 0.023982331156730652, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.357332262676209e-05, + "grad_norm": 3.920938491821289, + "learning_rate": 8.618058499364137e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8407347202301025, + "num_tokens": 77792300.0, + "step": 2034 + }, + { + "epoch": 0.25887291693168807, + "ewc_loss": 0.024065852165222168, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.440852980129421e-05, + "grad_norm": 3.8937485218048096, + "learning_rate": 8.622297583721916e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8561317920684814, + "num_tokens": 77836391.0, + "step": 2035 + }, + { + "epoch": 0.2590001272102786, + "ewc_loss": 0.024049978703260422, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.424979023402557e-05, + "grad_norm": 3.944026231765747, + "learning_rate": 8.626536668079695e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8478074073791504, + "num_tokens": 77871600.0, + "step": 2036 + }, + { + "epoch": 0.2591273374888691, + "ewc_loss": 0.024095963686704636, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.470963075524196e-05, + "grad_norm": 3.9674794673919678, + "learning_rate": 8.630775752437474e-07, + "loss": 0.4642, + "mean_token_accuracy": 0.8536099195480347, + "num_tokens": 77904623.0, + "step": 2037 + }, + { + "epoch": 0.2592545477674596, + "ewc_loss": 0.02411133050918579, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.486329898005351e-05, + "grad_norm": 3.9697670936584473, + "learning_rate": 8.635014836795251e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8428937196731567, + "num_tokens": 77941145.0, + "step": 2038 + }, + { + "epoch": 0.2593817580460501, + "ewc_loss": 0.02412986010313034, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.504861034452915e-05, + "grad_norm": 3.9654083251953125, + "learning_rate": 8.63925392115303e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8584190607070923, + "num_tokens": 77975087.0, + "step": 2039 + }, + { + "epoch": 0.25950896832464065, + "ewc_loss": 0.024125713855028152, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.50071373861283e-05, + "grad_norm": 4.0516462326049805, + "learning_rate": 8.643493005510809e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.85762619972229, + "num_tokens": 78009526.0, + "step": 2040 + }, + { + "epoch": 0.2596361786032311, + "ewc_loss": 0.02417091280221939, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.545914170099422e-05, + "grad_norm": 3.93572735786438, + "learning_rate": 8.647732089868588e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.8345857858657837, + "num_tokens": 78050918.0, + "step": 2041 + }, + { + "epoch": 0.25976338888182166, + "ewc_loss": 0.024083085358142853, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.458085358142853e-05, + "grad_norm": 3.974651575088501, + "learning_rate": 8.651971174226366e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.8599376678466797, + "num_tokens": 78085547.0, + "step": 2042 + }, + { + "epoch": 0.2598905991604122, + "ewc_loss": 0.024164078757166862, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.539078407920897e-05, + "grad_norm": 3.9979305267333984, + "learning_rate": 8.656210258584146e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8375271558761597, + "num_tokens": 78122916.0, + "step": 2043 + }, + { + "epoch": 0.26001780943900266, + "ewc_loss": 0.024152137339115143, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.52713783388026e-05, + "grad_norm": 3.9241769313812256, + "learning_rate": 8.660449342941924e-07, + "loss": 0.5039, + "mean_token_accuracy": 0.8396185636520386, + "num_tokens": 78161800.0, + "step": 2044 + }, + { + "epoch": 0.2601450197175932, + "ewc_loss": 0.024136684834957123, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.511685155099258e-05, + "grad_norm": 3.9325718879699707, + "learning_rate": 8.664688427299704e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8478497862815857, + "num_tokens": 78199961.0, + "step": 2045 + }, + { + "epoch": 0.2602722299961837, + "ewc_loss": 0.024183807894587517, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.558807894587517e-05, + "grad_norm": 3.9411847591400146, + "learning_rate": 8.668927511657481e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8594542741775513, + "num_tokens": 78231332.0, + "step": 2046 + }, + { + "epoch": 0.2603994402747742, + "ewc_loss": 0.02418341487646103, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.558415720472112e-05, + "grad_norm": 4.017139911651611, + "learning_rate": 8.67316659601526e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8428658246994019, + "num_tokens": 78265054.0, + "step": 2047 + }, + { + "epoch": 0.2605266505533647, + "ewc_loss": 0.024221006780862808, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.596007683081552e-05, + "grad_norm": 3.9245970249176025, + "learning_rate": 8.677405680373039e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.8594026565551758, + "num_tokens": 78305445.0, + "step": 2048 + }, + { + "epoch": 0.26065386083195524, + "ewc_loss": 0.024158459156751633, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.533459913451225e-05, + "grad_norm": 4.0195817947387695, + "learning_rate": 8.681644764730818e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8479709029197693, + "num_tokens": 78339174.0, + "step": 2049 + }, + { + "epoch": 0.2607810711105457, + "ewc_loss": 0.024235298857092857, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.61029839143157e-05, + "grad_norm": 3.90561842918396, + "learning_rate": 8.685883849088596e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.8656490445137024, + "num_tokens": 78377630.0, + "step": 2050 + }, + { + "epoch": 0.26090828138913624, + "ewc_loss": 0.02414463460445404, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.519633411196992e-05, + "grad_norm": 3.9380953311920166, + "learning_rate": 8.690122933446376e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.850458025932312, + "num_tokens": 78419908.0, + "step": 2051 + }, + { + "epoch": 0.26103549166772677, + "ewc_loss": 0.024223223328590393, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.59822248457931e-05, + "grad_norm": 3.8947510719299316, + "learning_rate": 8.694362017804154e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.859948992729187, + "num_tokens": 78459561.0, + "step": 2052 + }, + { + "epoch": 0.26116270194631724, + "ewc_loss": 0.024192210286855698, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.567209442844614e-05, + "grad_norm": 3.9577479362487793, + "learning_rate": 8.698601102161933e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8474525809288025, + "num_tokens": 78502500.0, + "step": 2053 + }, + { + "epoch": 0.26128991222490777, + "ewc_loss": 0.02422976680099964, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.604767208453268e-05, + "grad_norm": 3.9317400455474854, + "learning_rate": 8.702840186519711e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8616235256195068, + "num_tokens": 78545411.0, + "step": 2054 + }, + { + "epoch": 0.2614171225034983, + "ewc_loss": 0.024217069149017334, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.592068479629233e-05, + "grad_norm": 3.9962692260742188, + "learning_rate": 8.70707927087749e-07, + "loss": 0.4381, + "mean_token_accuracy": 0.8625838756561279, + "num_tokens": 78585878.0, + "step": 2055 + }, + { + "epoch": 0.2615443327820888, + "ewc_loss": 0.02422512322664261, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.600122964708135e-05, + "grad_norm": 4.027224540710449, + "learning_rate": 8.711318355235269e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.8320795297622681, + "num_tokens": 78621233.0, + "step": 2056 + }, + { + "epoch": 0.2616715430606793, + "ewc_loss": 0.024252623319625854, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.627622446510941e-05, + "grad_norm": 4.014705657958984, + "learning_rate": 8.715557439593047e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8564679622650146, + "num_tokens": 78654644.0, + "step": 2057 + }, + { + "epoch": 0.26179875333926983, + "ewc_loss": 0.024218950420618057, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.5939493146725e-05, + "grad_norm": 3.967676877975464, + "learning_rate": 8.719796523950826e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.834123969078064, + "num_tokens": 78692851.0, + "step": 2058 + }, + { + "epoch": 0.2619259636178603, + "ewc_loss": 0.02420702576637268, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.582025475334376e-05, + "grad_norm": 3.9493801593780518, + "learning_rate": 8.724035608308605e-07, + "loss": 0.4177, + "mean_token_accuracy": 0.8679988384246826, + "num_tokens": 78725329.0, + "step": 2059 + }, + { + "epoch": 0.26205317389645083, + "ewc_loss": 0.02423042431473732, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.605424227425829e-05, + "grad_norm": 3.9182090759277344, + "learning_rate": 8.728274692666384e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8509224653244019, + "num_tokens": 78766645.0, + "step": 2060 + }, + { + "epoch": 0.26218038417504136, + "ewc_loss": 0.024227643385529518, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.602643356425688e-05, + "grad_norm": 3.9947597980499268, + "learning_rate": 8.732513777024162e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8532617092132568, + "num_tokens": 78801576.0, + "step": 2061 + }, + { + "epoch": 0.26230759445363183, + "ewc_loss": 0.02427889034152031, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.653889381093904e-05, + "grad_norm": 3.9277400970458984, + "learning_rate": 8.736752861381941e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.838447093963623, + "num_tokens": 78845577.0, + "step": 2062 + }, + { + "epoch": 0.26243480473222236, + "ewc_loss": 0.02424299716949463, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.617997082183138e-05, + "grad_norm": 3.98058819770813, + "learning_rate": 8.740991945739719e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8505541682243347, + "num_tokens": 78884229.0, + "step": 2063 + }, + { + "epoch": 0.2625620150108129, + "ewc_loss": 0.024272847920656204, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.647848881082609e-05, + "grad_norm": 3.906982898712158, + "learning_rate": 8.745231030097499e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8544436693191528, + "num_tokens": 78926948.0, + "step": 2064 + }, + { + "epoch": 0.26268922528940336, + "ewc_loss": 0.02424343302845955, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.618432912044227e-05, + "grad_norm": 3.9695022106170654, + "learning_rate": 8.749470114455277e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8597705364227295, + "num_tokens": 78963582.0, + "step": 2065 + }, + { + "epoch": 0.2628164355679939, + "ewc_loss": 0.024309201166033745, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.684201020514593e-05, + "grad_norm": 3.931886672973633, + "learning_rate": 8.753709198813056e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8291444778442383, + "num_tokens": 79007345.0, + "step": 2066 + }, + { + "epoch": 0.2629436458465844, + "ewc_loss": 0.02426283434033394, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.63783570821397e-05, + "grad_norm": 3.9236576557159424, + "learning_rate": 8.757948283170835e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.8616660833358765, + "num_tokens": 79049505.0, + "step": 2067 + }, + { + "epoch": 0.2630708561251749, + "ewc_loss": 0.024286484345793724, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.66148475324735e-05, + "grad_norm": 3.9393038749694824, + "learning_rate": 8.762187367528613e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8583528995513916, + "num_tokens": 79087419.0, + "step": 2068 + }, + { + "epoch": 0.2631980664037654, + "ewc_loss": 0.024294055998325348, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.66905611474067e-05, + "grad_norm": 3.9708359241485596, + "learning_rate": 8.766426451886392e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8444311618804932, + "num_tokens": 79131444.0, + "step": 2069 + }, + { + "epoch": 0.26332527668235595, + "ewc_loss": 0.024426253512501717, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.67918279254809e-05, + "grad_norm": 3.9691591262817383, + "learning_rate": 8.770665536244171e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8421579599380493, + "num_tokens": 79171976.0, + "step": 2070 + }, + { + "epoch": 0.2634524869609464, + "ewc_loss": 0.024291377514600754, + "ewc_loss_diag": 1.5616416931152344e-05, + "ewc_loss_parallel": 8.66637856233865e-05, + "grad_norm": 3.9886834621429443, + "learning_rate": 8.774904620601949e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.858008623123169, + "num_tokens": 79206273.0, + "step": 2071 + }, + { + "epoch": 0.26357969723953695, + "ewc_loss": 0.024422746151685715, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.675676508573815e-05, + "grad_norm": 4.007293701171875, + "learning_rate": 8.779143704959729e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8596402406692505, + "num_tokens": 79245564.0, + "step": 2072 + }, + { + "epoch": 0.2637069075181275, + "ewc_loss": 0.02444007620215416, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.693005656823516e-05, + "grad_norm": 3.9802935123443604, + "learning_rate": 8.783382789317507e-07, + "loss": 0.4765, + "mean_token_accuracy": 0.8508486747741699, + "num_tokens": 79281857.0, + "step": 2073 + }, + { + "epoch": 0.26383411779671795, + "ewc_loss": 0.024419551715254784, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.672481635585427e-05, + "grad_norm": 4.041557312011719, + "learning_rate": 8.787621873675286e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8354707956314087, + "num_tokens": 79312482.0, + "step": 2074 + }, + { + "epoch": 0.2639613280753085, + "ewc_loss": 0.02446620725095272, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.719136530999094e-05, + "grad_norm": 3.9837424755096436, + "learning_rate": 8.791860958033065e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8597298860549927, + "num_tokens": 79348157.0, + "step": 2075 + }, + { + "epoch": 0.264088538353899, + "ewc_loss": 0.024409033358097076, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.661963511258364e-05, + "grad_norm": 3.9489684104919434, + "learning_rate": 8.796100042390842e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8629226684570312, + "num_tokens": 79387540.0, + "step": 2076 + }, + { + "epoch": 0.2642157486324895, + "ewc_loss": 0.024531006813049316, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.661867468617857e-05, + "grad_norm": 4.043724536895752, + "learning_rate": 8.800339126748622e-07, + "loss": 0.451, + "mean_token_accuracy": 0.856159508228302, + "num_tokens": 79421424.0, + "step": 2077 + }, + { + "epoch": 0.26434295891108, + "ewc_loss": 0.0246110949665308, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.741953934077173e-05, + "grad_norm": 4.006723880767822, + "learning_rate": 8.8045782111064e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8357498645782471, + "num_tokens": 79458733.0, + "step": 2078 + }, + { + "epoch": 0.26447016918967053, + "ewc_loss": 0.024523736909031868, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.654595876578242e-05, + "grad_norm": 3.9512808322906494, + "learning_rate": 8.808817295464179e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8513606190681458, + "num_tokens": 79495394.0, + "step": 2079 + }, + { + "epoch": 0.26459737946826106, + "ewc_loss": 0.024574581533670425, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.705439540790394e-05, + "grad_norm": 4.021651268005371, + "learning_rate": 8.813056379821958e-07, + "loss": 0.4452, + "mean_token_accuracy": 0.860755980014801, + "num_tokens": 79529799.0, + "step": 2080 + }, + { + "epoch": 0.26472458974685154, + "ewc_loss": 0.024612106382846832, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.742965292185545e-05, + "grad_norm": 3.9767580032348633, + "learning_rate": 8.817295464179737e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8520232439041138, + "num_tokens": 79564664.0, + "step": 2081 + }, + { + "epoch": 0.26485180002544206, + "ewc_loss": 0.02458042837679386, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.711287955520675e-05, + "grad_norm": 4.033858299255371, + "learning_rate": 8.821534548537515e-07, + "loss": 0.5483, + "mean_token_accuracy": 0.8255260586738586, + "num_tokens": 79606104.0, + "step": 2082 + }, + { + "epoch": 0.2649790103040326, + "ewc_loss": 0.02464159205555916, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.772451838012785e-05, + "grad_norm": 3.9370789527893066, + "learning_rate": 8.825773632895295e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.859508752822876, + "num_tokens": 79647688.0, + "step": 2083 + }, + { + "epoch": 0.26510622058262306, + "ewc_loss": 0.02458387240767479, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.714732393855229e-05, + "grad_norm": 4.023913383483887, + "learning_rate": 8.830012717253072e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8562525510787964, + "num_tokens": 79685500.0, + "step": 2084 + }, + { + "epoch": 0.2652334308612136, + "ewc_loss": 0.02456817403435707, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.821103256195784e-05, + "grad_norm": 3.9571735858917236, + "learning_rate": 8.834251801610852e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8453137874603271, + "num_tokens": 79729912.0, + "step": 2085 + }, + { + "epoch": 0.2653606411398041, + "ewc_loss": 0.02449341118335724, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.746339881327003e-05, + "grad_norm": 3.9802215099334717, + "learning_rate": 8.83849088596863e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8594539761543274, + "num_tokens": 79767487.0, + "step": 2086 + }, + { + "epoch": 0.2654878514183946, + "ewc_loss": 0.02455829828977585, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.811229054117575e-05, + "grad_norm": 3.9807069301605225, + "learning_rate": 8.842729970326409e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.8668879270553589, + "num_tokens": 79804802.0, + "step": 2087 + }, + { + "epoch": 0.2656150616969851, + "ewc_loss": 0.024531476199626923, + "ewc_loss_diag": 1.5735626220703125e-05, + "ewc_loss_parallel": 8.78440696396865e-05, + "grad_norm": 3.9870152473449707, + "learning_rate": 8.846969054684188e-07, + "loss": 0.5446, + "mean_token_accuracy": 0.8307298421859741, + "num_tokens": 79845685.0, + "step": 2088 + }, + { + "epoch": 0.26574227197557565, + "ewc_loss": 0.02467973530292511, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.81059531820938e-05, + "grad_norm": 3.9957785606384277, + "learning_rate": 8.851208139041967e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.862173318862915, + "num_tokens": 79884692.0, + "step": 2089 + }, + { + "epoch": 0.2658694822541661, + "ewc_loss": 0.02466733753681183, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.798197086434811e-05, + "grad_norm": 4.0744948387146, + "learning_rate": 8.855447223399745e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.8423870801925659, + "num_tokens": 79919780.0, + "step": 2090 + }, + { + "epoch": 0.26599669253275665, + "ewc_loss": 0.02471294440329075, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.843803516356274e-05, + "grad_norm": 3.9915902614593506, + "learning_rate": 8.859686307757524e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8366590738296509, + "num_tokens": 79958317.0, + "step": 2091 + }, + { + "epoch": 0.2661239028113472, + "ewc_loss": 0.02464640885591507, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.777267794357613e-05, + "grad_norm": 3.942371368408203, + "learning_rate": 8.863925392115302e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8558424711227417, + "num_tokens": 79994126.0, + "step": 2092 + }, + { + "epoch": 0.26625111308993765, + "ewc_loss": 0.024664858356118202, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.795718167675659e-05, + "grad_norm": 3.964984178543091, + "learning_rate": 8.868164476473082e-07, + "loss": 0.4211, + "mean_token_accuracy": 0.8666825294494629, + "num_tokens": 80033157.0, + "step": 2093 + }, + { + "epoch": 0.2663783233685282, + "ewc_loss": 0.024678293615579605, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.809152495814487e-05, + "grad_norm": 4.002821922302246, + "learning_rate": 8.87240356083086e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.8583167791366577, + "num_tokens": 80073354.0, + "step": 2094 + }, + { + "epoch": 0.2665055336471187, + "ewc_loss": 0.02468128502368927, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.812143641989678e-05, + "grad_norm": 3.9909310340881348, + "learning_rate": 8.876642645188639e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8379243612289429, + "num_tokens": 80111819.0, + "step": 2095 + }, + { + "epoch": 0.2666327439257092, + "ewc_loss": 0.024685677140951157, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.816537592792884e-05, + "grad_norm": 4.022451400756836, + "learning_rate": 8.880881729546418e-07, + "loss": 0.4246, + "mean_token_accuracy": 0.8651942610740662, + "num_tokens": 80147951.0, + "step": 2096 + }, + { + "epoch": 0.2667599542042997, + "ewc_loss": 0.024709943681955338, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.840802911436185e-05, + "grad_norm": 3.9576170444488525, + "learning_rate": 8.885120813904197e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8602505922317505, + "num_tokens": 80191319.0, + "step": 2097 + }, + { + "epoch": 0.26688716448289024, + "ewc_loss": 0.024687692523002625, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.818553033052012e-05, + "grad_norm": 4.048637866973877, + "learning_rate": 8.889359898261976e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8371033668518066, + "num_tokens": 80226300.0, + "step": 2098 + }, + { + "epoch": 0.2670143747614807, + "ewc_loss": 0.02475651353597641, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.887371950550005e-05, + "grad_norm": 3.9681849479675293, + "learning_rate": 8.893598982619753e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8566685914993286, + "num_tokens": 80265465.0, + "step": 2099 + }, + { + "epoch": 0.26714158504007124, + "ewc_loss": 0.02467784844338894, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.808707934804261e-05, + "grad_norm": 3.9519991874694824, + "learning_rate": 8.897838066977532e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.856343686580658, + "num_tokens": 80312696.0, + "step": 2100 + }, + { + "epoch": 0.26726879531866177, + "ewc_loss": 0.024702701717615128, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.833561150822788e-05, + "grad_norm": 3.9875288009643555, + "learning_rate": 8.902077151335311e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8491045236587524, + "num_tokens": 80353507.0, + "step": 2101 + }, + { + "epoch": 0.26739600559725224, + "ewc_loss": 0.024728860706090927, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.859720401233062e-05, + "grad_norm": 4.082540035247803, + "learning_rate": 8.90631623569309e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8216725587844849, + "num_tokens": 80390795.0, + "step": 2102 + }, + { + "epoch": 0.26752321587584277, + "ewc_loss": 0.02476697415113449, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.897833322407678e-05, + "grad_norm": 4.014432907104492, + "learning_rate": 8.910555320050868e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8523614406585693, + "num_tokens": 80427599.0, + "step": 2103 + }, + { + "epoch": 0.2676504261544333, + "ewc_loss": 0.024677615612745285, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.808473648969084e-05, + "grad_norm": 4.614748001098633, + "learning_rate": 8.914794404408648e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8522918820381165, + "num_tokens": 80468084.0, + "step": 2104 + }, + { + "epoch": 0.26777763643302377, + "ewc_loss": 0.025051847100257874, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 9.182705252896994e-05, + "grad_norm": 3.932633399963379, + "learning_rate": 8.919033488766426e-07, + "loss": 0.4141, + "mean_token_accuracy": 0.8690773844718933, + "num_tokens": 80507218.0, + "step": 2105 + }, + { + "epoch": 0.2679048467116143, + "ewc_loss": 0.024478690698742867, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.60954969539307e-05, + "grad_norm": 4.065525054931641, + "learning_rate": 8.923272573124204e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.845129132270813, + "num_tokens": 80541730.0, + "step": 2106 + }, + { + "epoch": 0.2680320569902048, + "ewc_loss": 0.024971507489681244, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 8.980296115623787e-05, + "grad_norm": 4.293894290924072, + "learning_rate": 8.927511657481983e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8391588926315308, + "num_tokens": 80584677.0, + "step": 2107 + }, + { + "epoch": 0.2681592672687953, + "ewc_loss": 0.024762915447354317, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.893774793250486e-05, + "grad_norm": 3.98302960395813, + "learning_rate": 8.931750741839762e-07, + "loss": 0.443, + "mean_token_accuracy": 0.862112820148468, + "num_tokens": 80616332.0, + "step": 2108 + }, + { + "epoch": 0.2682864775473858, + "ewc_loss": 0.024601716548204422, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.732575952308252e-05, + "grad_norm": 4.009005546569824, + "learning_rate": 8.935989826197541e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.859539806842804, + "num_tokens": 80654435.0, + "step": 2109 + }, + { + "epoch": 0.26841368782597635, + "ewc_loss": 0.02485576458275318, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 8.864553819876164e-05, + "grad_norm": 4.008504390716553, + "learning_rate": 8.94022891055532e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8680866956710815, + "num_tokens": 80691246.0, + "step": 2110 + }, + { + "epoch": 0.2685408981045668, + "ewc_loss": 0.024678044021129608, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.808902930468321e-05, + "grad_norm": 4.007225036621094, + "learning_rate": 8.944467994913098e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.8454065322875977, + "num_tokens": 80730510.0, + "step": 2111 + }, + { + "epoch": 0.26866810838315736, + "ewc_loss": 0.02484046295285225, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 8.849252481013536e-05, + "grad_norm": 4.000630855560303, + "learning_rate": 8.948707079270878e-07, + "loss": 0.3993, + "mean_token_accuracy": 0.873761773109436, + "num_tokens": 80769369.0, + "step": 2112 + }, + { + "epoch": 0.2687953186617479, + "ewc_loss": 0.02485063299536705, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 8.859421359375119e-05, + "grad_norm": 4.050495147705078, + "learning_rate": 8.952946163628656e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8564845323562622, + "num_tokens": 80802168.0, + "step": 2113 + }, + { + "epoch": 0.26892252894033836, + "ewc_loss": 0.02476518601179123, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.89604416443035e-05, + "grad_norm": 3.9492852687835693, + "learning_rate": 8.957185247986434e-07, + "loss": 0.405, + "mean_token_accuracy": 0.87232506275177, + "num_tokens": 80839389.0, + "step": 2114 + }, + { + "epoch": 0.2690497392189289, + "ewc_loss": 0.02472841553390026, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.859275112627074e-05, + "grad_norm": 4.031213283538818, + "learning_rate": 8.961424332344213e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8562129139900208, + "num_tokens": 80881139.0, + "step": 2115 + }, + { + "epoch": 0.2691769494975194, + "ewc_loss": 0.024800077080726624, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.930936019169167e-05, + "grad_norm": 4.037936687469482, + "learning_rate": 8.965663416701992e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8527182340621948, + "num_tokens": 80919809.0, + "step": 2116 + }, + { + "epoch": 0.2693041597761099, + "ewc_loss": 0.024777851998806, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.90871015144512e-05, + "grad_norm": 4.057955741882324, + "learning_rate": 8.969902501059771e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.8638426065444946, + "num_tokens": 80956860.0, + "step": 2117 + }, + { + "epoch": 0.2694313700547004, + "ewc_loss": 0.024785611778497696, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.916469960240647e-05, + "grad_norm": 4.020355701446533, + "learning_rate": 8.97414158541755e-07, + "loss": 0.4052, + "mean_token_accuracy": 0.8720225095748901, + "num_tokens": 80994322.0, + "step": 2118 + }, + { + "epoch": 0.26955858033329094, + "ewc_loss": 0.024761557579040527, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.892417827155441e-05, + "grad_norm": 4.0171332359313965, + "learning_rate": 8.978380669775328e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8469846248626709, + "num_tokens": 81032792.0, + "step": 2119 + }, + { + "epoch": 0.2696857906118814, + "ewc_loss": 0.024773430079221725, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.904290007194504e-05, + "grad_norm": 4.019931793212891, + "learning_rate": 8.982619754133107e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.847478985786438, + "num_tokens": 81070011.0, + "step": 2120 + }, + { + "epoch": 0.26981300089047194, + "ewc_loss": 0.024763453751802444, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.894313214113936e-05, + "grad_norm": 3.977391481399536, + "learning_rate": 8.986858838490886e-07, + "loss": 0.49, + "mean_token_accuracy": 0.8460034132003784, + "num_tokens": 81113145.0, + "step": 2121 + }, + { + "epoch": 0.26994021116906247, + "ewc_loss": 0.02476339600980282, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.894255006453022e-05, + "grad_norm": 4.070592880249023, + "learning_rate": 8.991097922848663e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8567594289779663, + "num_tokens": 81154175.0, + "step": 2122 + }, + { + "epoch": 0.27006742144765294, + "ewc_loss": 0.024821244180202484, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.952103235060349e-05, + "grad_norm": 4.035951137542725, + "learning_rate": 8.995337007206443e-07, + "loss": 0.4825, + "mean_token_accuracy": 0.8496847152709961, + "num_tokens": 81193319.0, + "step": 2123 + }, + { + "epoch": 0.2701946317262435, + "ewc_loss": 0.024782203137874603, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.913061901694164e-05, + "grad_norm": 4.026253700256348, + "learning_rate": 8.999576091564221e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8591637015342712, + "num_tokens": 81229057.0, + "step": 2124 + }, + { + "epoch": 0.270321842004834, + "ewc_loss": 0.024797163903713226, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.928021998144686e-05, + "grad_norm": 4.045834064483643, + "learning_rate": 9.003815175922001e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8554043769836426, + "num_tokens": 81262380.0, + "step": 2125 + }, + { + "epoch": 0.2704490522834245, + "ewc_loss": 0.02479052171111107, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.921379776438698e-05, + "grad_norm": 3.961127519607544, + "learning_rate": 9.008054260279779e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8546333312988281, + "num_tokens": 81304080.0, + "step": 2126 + }, + { + "epoch": 0.270576262562015, + "ewc_loss": 0.024774491786956787, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.905350841814652e-05, + "grad_norm": 4.154628276824951, + "learning_rate": 9.012293344637558e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8456441164016724, + "num_tokens": 81340226.0, + "step": 2127 + }, + { + "epoch": 0.27070347284060553, + "ewc_loss": 0.02490496076643467, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 9.035820403369144e-05, + "grad_norm": 4.023259162902832, + "learning_rate": 9.016532428995337e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8599156737327576, + "num_tokens": 81378046.0, + "step": 2128 + }, + { + "epoch": 0.27083068311919606, + "ewc_loss": 0.024786237627267838, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 8.917096420191228e-05, + "grad_norm": 3.9597198963165283, + "learning_rate": 9.020771513353115e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.8473829030990601, + "num_tokens": 81419486.0, + "step": 2129 + }, + { + "epoch": 0.27095789339778653, + "ewc_loss": 0.024956200271844864, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 8.964988228399307e-05, + "grad_norm": 4.044775485992432, + "learning_rate": 9.025010597710894e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8475489020347595, + "num_tokens": 81457488.0, + "step": 2130 + }, + { + "epoch": 0.27108510367637706, + "ewc_loss": 0.024990247562527657, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 8.999036799650639e-05, + "grad_norm": 4.007822513580322, + "learning_rate": 9.029249682068673e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8575711846351624, + "num_tokens": 81494812.0, + "step": 2131 + }, + { + "epoch": 0.2712123139549676, + "ewc_loss": 0.02496274560689926, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 8.971533679869026e-05, + "grad_norm": 3.9916253089904785, + "learning_rate": 9.033488766426451e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8511992692947388, + "num_tokens": 81531421.0, + "step": 2132 + }, + { + "epoch": 0.27133952423355806, + "ewc_loss": 0.025010619312524796, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.01940802577883e-05, + "grad_norm": 4.059440612792969, + "learning_rate": 9.037727850784231e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8610207438468933, + "num_tokens": 81567221.0, + "step": 2133 + }, + { + "epoch": 0.2714667345121486, + "ewc_loss": 0.025024347007274628, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.033134847413749e-05, + "grad_norm": 4.051670551300049, + "learning_rate": 9.041966935142009e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8522769212722778, + "num_tokens": 81605832.0, + "step": 2134 + }, + { + "epoch": 0.2715939447907391, + "ewc_loss": 0.02499908022582531, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.007869084598497e-05, + "grad_norm": 3.990736246109009, + "learning_rate": 9.046206019499788e-07, + "loss": 0.4237, + "mean_token_accuracy": 0.8683329820632935, + "num_tokens": 81646919.0, + "step": 2135 + }, + { + "epoch": 0.2717211550693296, + "ewc_loss": 0.02501028962433338, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.019079152494669e-05, + "grad_norm": 4.077711582183838, + "learning_rate": 9.050445103857567e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8463979959487915, + "num_tokens": 81687214.0, + "step": 2136 + }, + { + "epoch": 0.2718483653479201, + "ewc_loss": 0.024931376799941063, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 9.062236495083198e-05, + "grad_norm": 4.008387088775635, + "learning_rate": 9.054684188215344e-07, + "loss": 0.4068, + "mean_token_accuracy": 0.8729814887046814, + "num_tokens": 81721437.0, + "step": 2137 + }, + { + "epoch": 0.27197557562651065, + "ewc_loss": 0.025016719475388527, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.025508188642561e-05, + "grad_norm": 4.143277168273926, + "learning_rate": 9.058923272573124e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8481439352035522, + "num_tokens": 81756595.0, + "step": 2138 + }, + { + "epoch": 0.2721027859051011, + "ewc_loss": 0.02511689066886902, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.125678479904309e-05, + "grad_norm": 4.0111565589904785, + "learning_rate": 9.063162356930902e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8609253764152527, + "num_tokens": 81796198.0, + "step": 2139 + }, + { + "epoch": 0.27222999618369165, + "ewc_loss": 0.025035643950104713, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.044432954397053e-05, + "grad_norm": 4.0627899169921875, + "learning_rate": 9.067401441288681e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8532002568244934, + "num_tokens": 81833002.0, + "step": 2140 + }, + { + "epoch": 0.2723572064622822, + "ewc_loss": 0.02512393146753311, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.132720151683316e-05, + "grad_norm": 4.054340362548828, + "learning_rate": 9.07164052564646e-07, + "loss": 0.4488, + "mean_token_accuracy": 0.8539806008338928, + "num_tokens": 81872089.0, + "step": 2141 + }, + { + "epoch": 0.27248441674087265, + "ewc_loss": 0.024957755580544472, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 9.088614751817659e-05, + "grad_norm": 4.017802715301514, + "learning_rate": 9.075879610004239e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8463015556335449, + "num_tokens": 81908356.0, + "step": 2142 + }, + { + "epoch": 0.2726116270194632, + "ewc_loss": 0.025102706626057625, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.111495455726981e-05, + "grad_norm": 4.0327558517456055, + "learning_rate": 9.080118694362017e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8446784019470215, + "num_tokens": 81950251.0, + "step": 2143 + }, + { + "epoch": 0.2727388372980537, + "ewc_loss": 0.02501022443175316, + "ewc_loss_diag": 1.5854835510253906e-05, + "ewc_loss_parallel": 9.141084592556581e-05, + "grad_norm": 4.1028971672058105, + "learning_rate": 9.084357778719796e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8472341299057007, + "num_tokens": 81987157.0, + "step": 2144 + }, + { + "epoch": 0.2728660475766442, + "ewc_loss": 0.02514563500881195, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.154423605650663e-05, + "grad_norm": 4.058154582977295, + "learning_rate": 9.088596863077574e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8571516275405884, + "num_tokens": 82023633.0, + "step": 2145 + }, + { + "epoch": 0.2729932578552347, + "ewc_loss": 0.025130189955234528, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.138978202827275e-05, + "grad_norm": 4.032240390777588, + "learning_rate": 9.092835947435354e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8489999771118164, + "num_tokens": 82061311.0, + "step": 2146 + }, + { + "epoch": 0.27312046813382523, + "ewc_loss": 0.025146059691905975, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.154848521575332e-05, + "grad_norm": 4.0831618309021, + "learning_rate": 9.097075031793132e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8420464992523193, + "num_tokens": 82098370.0, + "step": 2147 + }, + { + "epoch": 0.2732476784124157, + "ewc_loss": 0.025157878175377846, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.16666685952805e-05, + "grad_norm": 4.066999435424805, + "learning_rate": 9.101314116150911e-07, + "loss": 0.4304, + "mean_token_accuracy": 0.8685296773910522, + "num_tokens": 82134931.0, + "step": 2148 + }, + { + "epoch": 0.27337488869100623, + "ewc_loss": 0.025155436247587204, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.164226503344253e-05, + "grad_norm": 4.041425704956055, + "learning_rate": 9.10555320050869e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8619687557220459, + "num_tokens": 82171391.0, + "step": 2149 + }, + { + "epoch": 0.27350209896959676, + "ewc_loss": 0.025140222162008286, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.149011748377234e-05, + "grad_norm": 3.9805779457092285, + "learning_rate": 9.109792284866469e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8588405847549438, + "num_tokens": 82214064.0, + "step": 2150 + }, + { + "epoch": 0.27362930924818724, + "ewc_loss": 0.025126997381448746, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.135787695413455e-05, + "grad_norm": 4.0416178703308105, + "learning_rate": 9.114031369224247e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.8508257865905762, + "num_tokens": 82252774.0, + "step": 2151 + }, + { + "epoch": 0.27375651952677776, + "ewc_loss": 0.025189511477947235, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.198299812851474e-05, + "grad_norm": 4.07246732711792, + "learning_rate": 9.118270453582026e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.8379460573196411, + "num_tokens": 82291645.0, + "step": 2152 + }, + { + "epoch": 0.2738837298053683, + "ewc_loss": 0.0251825712621212, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.191359276883304e-05, + "grad_norm": 4.034857749938965, + "learning_rate": 9.122509537939804e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8495402336120605, + "num_tokens": 82333643.0, + "step": 2153 + }, + { + "epoch": 0.27401094008395877, + "ewc_loss": 0.025155629962682724, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.164417861029506e-05, + "grad_norm": 4.082327365875244, + "learning_rate": 9.126748622297584e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.840208888053894, + "num_tokens": 82370792.0, + "step": 2154 + }, + { + "epoch": 0.2741381503625493, + "ewc_loss": 0.02522287517786026, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.23166298889555e-05, + "grad_norm": 4.092610836029053, + "learning_rate": 9.130987706655362e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8456377983093262, + "num_tokens": 82408626.0, + "step": 2155 + }, + { + "epoch": 0.2742653606411398, + "ewc_loss": 0.02517302893102169, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.181818313663825e-05, + "grad_norm": 4.046362400054932, + "learning_rate": 9.135226791013141e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8455889225006104, + "num_tokens": 82448401.0, + "step": 2156 + }, + { + "epoch": 0.2743925709197303, + "ewc_loss": 0.025180820375680923, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.189608681481332e-05, + "grad_norm": 4.211634635925293, + "learning_rate": 9.13946587537092e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8483569025993347, + "num_tokens": 82476644.0, + "step": 2157 + }, + { + "epoch": 0.2745197811983208, + "ewc_loss": 0.025254180654883385, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.262969251722097e-05, + "grad_norm": 4.100785732269287, + "learning_rate": 9.143704959728699e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8449678421020508, + "num_tokens": 82512722.0, + "step": 2158 + }, + { + "epoch": 0.27464699147691135, + "ewc_loss": 0.02515280619263649, + "ewc_loss_diag": 1.5974044799804688e-05, + "ewc_loss_parallel": 9.161594789475203e-05, + "grad_norm": 4.099592208862305, + "learning_rate": 9.147944044086476e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.8437669277191162, + "num_tokens": 82545393.0, + "step": 2159 + }, + { + "epoch": 0.2747742017555018, + "ewc_loss": 0.0253327414393425, + "ewc_loss_diag": 1.609325408935547e-05, + "ewc_loss_parallel": 9.219460480380803e-05, + "grad_norm": 4.021677017211914, + "learning_rate": 9.152183128444255e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8482239246368408, + "num_tokens": 82586329.0, + "step": 2160 + }, + { + "epoch": 0.27490141203409235, + "ewc_loss": 0.025298096239566803, + "ewc_loss_diag": 1.609325408935547e-05, + "ewc_loss_parallel": 9.184814553009346e-05, + "grad_norm": 4.104532718658447, + "learning_rate": 9.156422212802034e-07, + "loss": 0.5054, + "mean_token_accuracy": 0.8378643989562988, + "num_tokens": 82625563.0, + "step": 2161 + }, + { + "epoch": 0.2750286223126829, + "ewc_loss": 0.025367828086018562, + "ewc_loss_diag": 1.609325408935547e-05, + "ewc_loss_parallel": 9.254546603187919e-05, + "grad_norm": 4.0730977058410645, + "learning_rate": 9.160661297159813e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8606765866279602, + "num_tokens": 82660030.0, + "step": 2162 + }, + { + "epoch": 0.27515583259127335, + "ewc_loss": 0.025332804769277573, + "ewc_loss_diag": 1.609325408935547e-05, + "ewc_loss_parallel": 9.219524508807808e-05, + "grad_norm": 4.173587322235107, + "learning_rate": 9.164900381517592e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8531803488731384, + "num_tokens": 82698794.0, + "step": 2163 + }, + { + "epoch": 0.2752830428698639, + "ewc_loss": 0.025424856692552567, + "ewc_loss_diag": 1.609325408935547e-05, + "ewc_loss_parallel": 9.311575558967888e-05, + "grad_norm": 4.023078441619873, + "learning_rate": 9.16913946587537e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8513486385345459, + "num_tokens": 82740015.0, + "step": 2164 + }, + { + "epoch": 0.2754102531484544, + "ewc_loss": 0.025415021926164627, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.17967117857188e-05, + "grad_norm": 4.121151447296143, + "learning_rate": 9.17337855023315e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.849360466003418, + "num_tokens": 82775765.0, + "step": 2165 + }, + { + "epoch": 0.2755374634270449, + "ewc_loss": 0.025531373918056488, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.296021744376048e-05, + "grad_norm": 4.10935115814209, + "learning_rate": 9.177617634590928e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8646900653839111, + "num_tokens": 82814324.0, + "step": 2166 + }, + { + "epoch": 0.2756646737056354, + "ewc_loss": 0.02546221762895584, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.226865222444758e-05, + "grad_norm": 4.0710906982421875, + "learning_rate": 9.181856718948706e-07, + "loss": 0.4158, + "mean_token_accuracy": 0.8666560053825378, + "num_tokens": 82851007.0, + "step": 2167 + }, + { + "epoch": 0.27579188398422594, + "ewc_loss": 0.025468800216913223, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.233449964085594e-05, + "grad_norm": 4.064878940582275, + "learning_rate": 9.186095803306485e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8544189929962158, + "num_tokens": 82886846.0, + "step": 2168 + }, + { + "epoch": 0.2759190942628164, + "ewc_loss": 0.025487536564469337, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.252184827346355e-05, + "grad_norm": 4.079580307006836, + "learning_rate": 9.190334887664264e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.843756914138794, + "num_tokens": 82929607.0, + "step": 2169 + }, + { + "epoch": 0.27604630454140694, + "ewc_loss": 0.02549848146736622, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.263130050385371e-05, + "grad_norm": 4.1510772705078125, + "learning_rate": 9.194573972022043e-07, + "loss": 0.5542, + "mean_token_accuracy": 0.8328649997711182, + "num_tokens": 82965089.0, + "step": 2170 + }, + { + "epoch": 0.27617351481999747, + "ewc_loss": 0.025644365698099136, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.286943532060832e-05, + "grad_norm": 4.514978885650635, + "learning_rate": 9.198813056379822e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8331537246704102, + "num_tokens": 83008345.0, + "step": 2171 + }, + { + "epoch": 0.27630072509858794, + "ewc_loss": 0.025659091770648956, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.423739538760856e-05, + "grad_norm": 4.048409461975098, + "learning_rate": 9.2030521407376e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8444852828979492, + "num_tokens": 83043763.0, + "step": 2172 + }, + { + "epoch": 0.27642793537717847, + "ewc_loss": 0.025345947593450546, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.110594692174345e-05, + "grad_norm": 4.061063289642334, + "learning_rate": 9.20729122509538e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.864687442779541, + "num_tokens": 83081389.0, + "step": 2173 + }, + { + "epoch": 0.276555145655769, + "ewc_loss": 0.025519361719489098, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.284010593546554e-05, + "grad_norm": 4.065215110778809, + "learning_rate": 9.211530309453158e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8512582778930664, + "num_tokens": 83120547.0, + "step": 2174 + }, + { + "epoch": 0.27668235593435947, + "ewc_loss": 0.025448322296142578, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.212970326188952e-05, + "grad_norm": 4.1280622482299805, + "learning_rate": 9.215769393810936e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8524249196052551, + "num_tokens": 83157229.0, + "step": 2175 + }, + { + "epoch": 0.27680956621295, + "ewc_loss": 0.025533070787787437, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.297719225287437e-05, + "grad_norm": 4.183784484863281, + "learning_rate": 9.220008478168715e-07, + "loss": 0.5529, + "mean_token_accuracy": 0.8269847631454468, + "num_tokens": 83198217.0, + "step": 2176 + }, + { + "epoch": 0.2769367764915405, + "ewc_loss": 0.025506656616926193, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.271306043956429e-05, + "grad_norm": 4.027490615844727, + "learning_rate": 9.224247562526494e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8482503890991211, + "num_tokens": 83238750.0, + "step": 2177 + }, + { + "epoch": 0.277063986770131, + "ewc_loss": 0.025458035990595818, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.222684457199648e-05, + "grad_norm": 4.065436363220215, + "learning_rate": 9.228486646884273e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8526885509490967, + "num_tokens": 83280138.0, + "step": 2178 + }, + { + "epoch": 0.2771911970487215, + "ewc_loss": 0.02554456517100334, + "ewc_loss_diag": 1.621246337890625e-05, + "ewc_loss_parallel": 9.309214510722086e-05, + "grad_norm": 4.105803489685059, + "learning_rate": 9.232725731242052e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8365439176559448, + "num_tokens": 83320633.0, + "step": 2179 + }, + { + "epoch": 0.27731840732731206, + "ewc_loss": 0.025655124336481094, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.297703218180686e-05, + "grad_norm": 4.069265365600586, + "learning_rate": 9.23696481559983e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.842725396156311, + "num_tokens": 83358787.0, + "step": 2180 + }, + { + "epoch": 0.2774456176059026, + "ewc_loss": 0.025675425305962563, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.318003139924258e-05, + "grad_norm": 4.057376861572266, + "learning_rate": 9.24120389995761e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8606773614883423, + "num_tokens": 83400609.0, + "step": 2181 + }, + { + "epoch": 0.27757282788449306, + "ewc_loss": 0.025670116767287254, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.31269460124895e-05, + "grad_norm": 4.109143257141113, + "learning_rate": 9.245442984315387e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8551247119903564, + "num_tokens": 83436058.0, + "step": 2182 + }, + { + "epoch": 0.2777000381630836, + "ewc_loss": 0.02572476863861084, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.367347956867889e-05, + "grad_norm": 4.06182861328125, + "learning_rate": 9.249682068673165e-07, + "loss": 0.49, + "mean_token_accuracy": 0.8441517949104309, + "num_tokens": 83474542.0, + "step": 2183 + }, + { + "epoch": 0.2778272484416741, + "ewc_loss": 0.02568100392818451, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.323580889031291e-05, + "grad_norm": 4.1161651611328125, + "learning_rate": 9.253921153030945e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8345503211021423, + "num_tokens": 83515551.0, + "step": 2184 + }, + { + "epoch": 0.2779544587202646, + "ewc_loss": 0.025739170610904694, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.381747804582119e-05, + "grad_norm": 4.126275062561035, + "learning_rate": 9.258160237388723e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8621374368667603, + "num_tokens": 83551826.0, + "step": 2185 + }, + { + "epoch": 0.2780816689988551, + "ewc_loss": 0.02572556398808956, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.368142491439357e-05, + "grad_norm": 4.145994186401367, + "learning_rate": 9.262399321746503e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8546240329742432, + "num_tokens": 83586598.0, + "step": 2186 + }, + { + "epoch": 0.27820887927744564, + "ewc_loss": 0.025727393105626106, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.369970939587802e-05, + "grad_norm": 4.130952835083008, + "learning_rate": 9.266638406104281e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8402940034866333, + "num_tokens": 83625996.0, + "step": 2187 + }, + { + "epoch": 0.2783360895560361, + "ewc_loss": 0.02571273222565651, + "ewc_loss_diag": 1.633167266845703e-05, + "ewc_loss_parallel": 9.355310612590984e-05, + "grad_norm": 4.253757476806641, + "learning_rate": 9.27087749046206e-07, + "loss": 0.477, + "mean_token_accuracy": 0.851067304611206, + "num_tokens": 83657322.0, + "step": 2188 + }, + { + "epoch": 0.27846329983462664, + "ewc_loss": 0.02587622031569481, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.3967275461182e-05, + "grad_norm": 4.116793632507324, + "learning_rate": 9.275116574819839e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8504486083984375, + "num_tokens": 83693578.0, + "step": 2189 + }, + { + "epoch": 0.27859051011321717, + "ewc_loss": 0.02577723003923893, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.297737415181473e-05, + "grad_norm": 4.107388019561768, + "learning_rate": 9.279355659177617e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8517041206359863, + "num_tokens": 83727672.0, + "step": 2190 + }, + { + "epoch": 0.27871772039180764, + "ewc_loss": 0.025848668068647385, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.36917494982481e-05, + "grad_norm": 4.170169830322266, + "learning_rate": 9.283594743535395e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.8379654884338379, + "num_tokens": 83761405.0, + "step": 2191 + }, + { + "epoch": 0.2788449306703982, + "ewc_loss": 0.0258647408336401, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.385248267790303e-05, + "grad_norm": 4.099048614501953, + "learning_rate": 9.287833827893175e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8503862619400024, + "num_tokens": 83801564.0, + "step": 2192 + }, + { + "epoch": 0.2789721409489887, + "ewc_loss": 0.025826983153820038, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.347491868538782e-05, + "grad_norm": 4.129059314727783, + "learning_rate": 9.292072912250953e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8444119095802307, + "num_tokens": 83834260.0, + "step": 2193 + }, + { + "epoch": 0.2790993512275792, + "ewc_loss": 0.02587973326444626, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.400241106050089e-05, + "grad_norm": 4.040192127227783, + "learning_rate": 9.296311996608733e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.8665944337844849, + "num_tokens": 83873518.0, + "step": 2194 + }, + { + "epoch": 0.2792265615061697, + "ewc_loss": 0.025816138833761215, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.336647053714842e-05, + "grad_norm": 4.09385871887207, + "learning_rate": 9.300551080966511e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8332916498184204, + "num_tokens": 83913902.0, + "step": 2195 + }, + { + "epoch": 0.27935377178476023, + "ewc_loss": 0.02591954916715622, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.440058056497946e-05, + "grad_norm": 4.15161657333374, + "learning_rate": 9.30479016532429e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8515628576278687, + "num_tokens": 83947952.0, + "step": 2196 + }, + { + "epoch": 0.2794809820633507, + "ewc_loss": 0.025885533541440964, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.406040771864355e-05, + "grad_norm": 4.085747718811035, + "learning_rate": 9.309029249682068e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8527936339378357, + "num_tokens": 83986057.0, + "step": 2197 + }, + { + "epoch": 0.27960819234194123, + "ewc_loss": 0.025993729010224342, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.392166248289868e-05, + "grad_norm": 4.101596355438232, + "learning_rate": 9.313268334039847e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8487496376037598, + "num_tokens": 84023881.0, + "step": 2198 + }, + { + "epoch": 0.27973540262053176, + "ewc_loss": 0.025895725935697556, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.416234388481826e-05, + "grad_norm": 4.0723395347595215, + "learning_rate": 9.317507418397625e-07, + "loss": 0.4134, + "mean_token_accuracy": 0.8704825639724731, + "num_tokens": 84061480.0, + "step": 2199 + }, + { + "epoch": 0.27986261289912223, + "ewc_loss": 0.0261200200766325, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.396387031301856e-05, + "grad_norm": 4.252795696258545, + "learning_rate": 9.321746502755404e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8402766585350037, + "num_tokens": 84101782.0, + "step": 2200 + }, + { + "epoch": 0.27998982317771276, + "ewc_loss": 0.025973478332161903, + "ewc_loss_diag": 1.6450881958007812e-05, + "ewc_loss_parallel": 9.493985999142751e-05, + "grad_norm": 4.132047653198242, + "learning_rate": 9.325985587113183e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8509199619293213, + "num_tokens": 84136125.0, + "step": 2201 + }, + { + "epoch": 0.2801170334563033, + "ewc_loss": 0.02597331441938877, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.371752094011754e-05, + "grad_norm": 4.164450645446777, + "learning_rate": 9.330224671470962e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8524019122123718, + "num_tokens": 84169748.0, + "step": 2202 + }, + { + "epoch": 0.28024424373489376, + "ewc_loss": 0.02605501562356949, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.453452366869897e-05, + "grad_norm": 4.188042640686035, + "learning_rate": 9.334463755828741e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8385069370269775, + "num_tokens": 84201292.0, + "step": 2203 + }, + { + "epoch": 0.2803714540134843, + "ewc_loss": 0.02605406753718853, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.45250503718853e-05, + "grad_norm": 4.068596363067627, + "learning_rate": 9.338702840186519e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8401712775230408, + "num_tokens": 84240652.0, + "step": 2204 + }, + { + "epoch": 0.2804986642920748, + "ewc_loss": 0.026020938530564308, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.41937614697963e-05, + "grad_norm": 4.111421585083008, + "learning_rate": 9.342941924544298e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.8453805446624756, + "num_tokens": 84280252.0, + "step": 2205 + }, + { + "epoch": 0.2806258745706653, + "ewc_loss": 0.026094481348991394, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.492920071352273e-05, + "grad_norm": 4.127556324005127, + "learning_rate": 9.347181008902076e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8667404651641846, + "num_tokens": 84316505.0, + "step": 2206 + }, + { + "epoch": 0.2807530848492558, + "ewc_loss": 0.02609783038496971, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.49626846704632e-05, + "grad_norm": 4.136807441711426, + "learning_rate": 9.351420093259855e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8635326623916626, + "num_tokens": 84349990.0, + "step": 2207 + }, + { + "epoch": 0.28088029512784635, + "ewc_loss": 0.026103200390934944, + "ewc_loss_diag": 1.6570091247558594e-05, + "ewc_loss_parallel": 9.501638123765588e-05, + "grad_norm": 4.1532721519470215, + "learning_rate": 9.355659177617634e-07, + "loss": 0.4243, + "mean_token_accuracy": 0.8640298843383789, + "num_tokens": 84381066.0, + "step": 2208 + }, + { + "epoch": 0.2810075054064368, + "ewc_loss": 0.02622433751821518, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.500706073595211e-05, + "grad_norm": 4.072327613830566, + "learning_rate": 9.359898261975413e-07, + "loss": 0.4173, + "mean_token_accuracy": 0.8681130409240723, + "num_tokens": 84422540.0, + "step": 2209 + }, + { + "epoch": 0.28113471568502735, + "ewc_loss": 0.026216302067041397, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.492669050814584e-05, + "grad_norm": 4.189682960510254, + "learning_rate": 9.364137346333192e-07, + "loss": 0.5434, + "mean_token_accuracy": 0.8301210999488831, + "num_tokens": 84462158.0, + "step": 2210 + }, + { + "epoch": 0.2812619259636179, + "ewc_loss": 0.0262790247797966, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.555390715831891e-05, + "grad_norm": 4.1317830085754395, + "learning_rate": 9.368376430690971e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8612220287322998, + "num_tokens": 84495887.0, + "step": 2211 + }, + { + "epoch": 0.28138913624220835, + "ewc_loss": 0.026210153475403786, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.486520866630599e-05, + "grad_norm": 4.189173698425293, + "learning_rate": 9.372615515048749e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8418375253677368, + "num_tokens": 84530176.0, + "step": 2212 + }, + { + "epoch": 0.2815163465207989, + "ewc_loss": 0.02627246081829071, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.548827802063897e-05, + "grad_norm": 4.084674835205078, + "learning_rate": 9.376854599406528e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8429107666015625, + "num_tokens": 84574450.0, + "step": 2213 + }, + { + "epoch": 0.2816435567993894, + "ewc_loss": 0.026190780103206635, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.467147174291313e-05, + "grad_norm": 4.168747425079346, + "learning_rate": 9.381093683764306e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8497989773750305, + "num_tokens": 84610661.0, + "step": 2214 + }, + { + "epoch": 0.2817707670779799, + "ewc_loss": 0.026303943246603012, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.580310870660469e-05, + "grad_norm": 4.061114311218262, + "learning_rate": 9.385332768122085e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8540500402450562, + "num_tokens": 84651259.0, + "step": 2215 + }, + { + "epoch": 0.2818979773565704, + "ewc_loss": 0.02619950845837593, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.475875413045287e-05, + "grad_norm": 4.115030288696289, + "learning_rate": 9.389571852479864e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8557707667350769, + "num_tokens": 84692389.0, + "step": 2216 + }, + { + "epoch": 0.28202518763516093, + "ewc_loss": 0.026298783719539642, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.575152216712013e-05, + "grad_norm": 4.142871856689453, + "learning_rate": 9.393810936837643e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8540003299713135, + "num_tokens": 84731905.0, + "step": 2217 + }, + { + "epoch": 0.2821523979137514, + "ewc_loss": 0.02638794481754303, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.542243060423061e-05, + "grad_norm": 4.61943244934082, + "learning_rate": 9.398050021195422e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.848594069480896, + "num_tokens": 84773727.0, + "step": 2218 + }, + { + "epoch": 0.28227960819234194, + "ewc_loss": 0.02647361531853676, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.74998256424442e-05, + "grad_norm": 4.075934410095215, + "learning_rate": 9.402289105553201e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8602292537689209, + "num_tokens": 84816771.0, + "step": 2219 + }, + { + "epoch": 0.28240681847093246, + "ewc_loss": 0.026069380342960358, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.345748549094424e-05, + "grad_norm": 4.23057222366333, + "learning_rate": 9.406528189910978e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8543310165405273, + "num_tokens": 84853172.0, + "step": 2220 + }, + { + "epoch": 0.28253402874952294, + "ewc_loss": 0.026359546929597855, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.635913011152297e-05, + "grad_norm": 4.123548984527588, + "learning_rate": 9.410767274268757e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8574428558349609, + "num_tokens": 84891508.0, + "step": 2221 + }, + { + "epoch": 0.28266123902811346, + "ewc_loss": 0.02616775967180729, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.444126771995798e-05, + "grad_norm": 4.163276195526123, + "learning_rate": 9.415006358626536e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.828920304775238, + "num_tokens": 84931204.0, + "step": 2222 + }, + { + "epoch": 0.282788449306704, + "ewc_loss": 0.026245301589369774, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.521668835077435e-05, + "grad_norm": 4.1792192459106445, + "learning_rate": 9.419245442984314e-07, + "loss": 0.5218, + "mean_token_accuracy": 0.8361709713935852, + "num_tokens": 84966782.0, + "step": 2223 + }, + { + "epoch": 0.28291565958529447, + "ewc_loss": 0.026344306766986847, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.498604049440473e-05, + "grad_norm": 4.2056403160095215, + "learning_rate": 9.423484527342094e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8503984212875366, + "num_tokens": 85001512.0, + "step": 2224 + }, + { + "epoch": 0.283042869863885, + "ewc_loss": 0.026233596727252007, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.509964002063498e-05, + "grad_norm": 4.151293754577637, + "learning_rate": 9.427723611699872e-07, + "loss": 0.477, + "mean_token_accuracy": 0.8511449694633484, + "num_tokens": 85036082.0, + "step": 2225 + }, + { + "epoch": 0.2831700801424755, + "ewc_loss": 0.02622358500957489, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.49995155679062e-05, + "grad_norm": 4.125857830047607, + "learning_rate": 9.431962696057652e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8493943214416504, + "num_tokens": 85079433.0, + "step": 2226 + }, + { + "epoch": 0.283297290421066, + "ewc_loss": 0.026231199502944946, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.507568029221147e-05, + "grad_norm": 4.168773174285889, + "learning_rate": 9.43620178041543e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8647733926773071, + "num_tokens": 85116940.0, + "step": 2227 + }, + { + "epoch": 0.2834245006996565, + "ewc_loss": 0.02625889517366886, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.535262506688014e-05, + "grad_norm": 4.134949207305908, + "learning_rate": 9.440440864773208e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8483549356460571, + "num_tokens": 85153328.0, + "step": 2228 + }, + { + "epoch": 0.28355171097824705, + "ewc_loss": 0.0262472964823246, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.523663175059482e-05, + "grad_norm": 4.176656723022461, + "learning_rate": 9.444679949130987e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.844368577003479, + "num_tokens": 85194059.0, + "step": 2229 + }, + { + "epoch": 0.2836789212568376, + "ewc_loss": 0.02628706395626068, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.563429921399802e-05, + "grad_norm": 4.218287467956543, + "learning_rate": 9.448919033488766e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.8425631523132324, + "num_tokens": 85230820.0, + "step": 2230 + }, + { + "epoch": 0.28380613153542805, + "ewc_loss": 0.0262928307056427, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.569199028192088e-05, + "grad_norm": 4.136651039123535, + "learning_rate": 9.453158117846544e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.8461446762084961, + "num_tokens": 85268864.0, + "step": 2231 + }, + { + "epoch": 0.2839333418140186, + "ewc_loss": 0.0262562595307827, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.532625699648634e-05, + "grad_norm": 4.143144130706787, + "learning_rate": 9.457397202204324e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8390135765075684, + "num_tokens": 85311377.0, + "step": 2232 + }, + { + "epoch": 0.2840605520926091, + "ewc_loss": 0.026291225105524063, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.567591041559353e-05, + "grad_norm": 4.153056621551514, + "learning_rate": 9.461636286562102e-07, + "loss": 0.4321, + "mean_token_accuracy": 0.8661283254623413, + "num_tokens": 85347082.0, + "step": 2233 + }, + { + "epoch": 0.2841877623711996, + "ewc_loss": 0.026448693126440048, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.60299075813964e-05, + "grad_norm": 4.264771938323975, + "learning_rate": 9.465875370919882e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8408169746398926, + "num_tokens": 85384696.0, + "step": 2234 + }, + { + "epoch": 0.2843149726497901, + "ewc_loss": 0.02646728977560997, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.62158665060997e-05, + "grad_norm": 4.551894187927246, + "learning_rate": 9.470114455277659e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8407872319221497, + "num_tokens": 85420192.0, + "step": 2235 + }, + { + "epoch": 0.28444218292838064, + "ewc_loss": 0.026443056762218475, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.719424997456372e-05, + "grad_norm": 4.088284492492676, + "learning_rate": 9.474353539635438e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8542760610580444, + "num_tokens": 85462281.0, + "step": 2236 + }, + { + "epoch": 0.2845693932069711, + "ewc_loss": 0.026143556460738182, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.419923298992217e-05, + "grad_norm": 4.172422885894775, + "learning_rate": 9.478592623993217e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8526868224143982, + "num_tokens": 85495191.0, + "step": 2237 + }, + { + "epoch": 0.28469660348556164, + "ewc_loss": 0.02637978456914425, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.65615181485191e-05, + "grad_norm": 4.101868629455566, + "learning_rate": 9.482831708350996e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.8558515906333923, + "num_tokens": 85533803.0, + "step": 2238 + }, + { + "epoch": 0.28482381376415217, + "ewc_loss": 0.026286080479621887, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.562447667121887e-05, + "grad_norm": 4.20268440246582, + "learning_rate": 9.487070792708775e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8432157635688782, + "num_tokens": 85573424.0, + "step": 2239 + }, + { + "epoch": 0.28495102404274264, + "ewc_loss": 0.026384582743048668, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.660950308898464e-05, + "grad_norm": 4.0790114402771, + "learning_rate": 9.491309877066554e-07, + "loss": 0.49, + "mean_token_accuracy": 0.8463947772979736, + "num_tokens": 85616423.0, + "step": 2240 + }, + { + "epoch": 0.28507823432133317, + "ewc_loss": 0.026425577700138092, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.579873585607857e-05, + "grad_norm": 4.1482977867126465, + "learning_rate": 9.495548961424332e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.83887779712677, + "num_tokens": 85658410.0, + "step": 2241 + }, + { + "epoch": 0.2852054445999237, + "ewc_loss": 0.026403166353702545, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.679534559836611e-05, + "grad_norm": 4.130434036254883, + "learning_rate": 9.499788045782111e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8508254885673523, + "num_tokens": 85695767.0, + "step": 2242 + }, + { + "epoch": 0.28533265487851417, + "ewc_loss": 0.026467537507414818, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.621834760764614e-05, + "grad_norm": 4.110254764556885, + "learning_rate": 9.504027130139889e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.8459985256195068, + "num_tokens": 85737771.0, + "step": 2243 + }, + { + "epoch": 0.2854598651571047, + "ewc_loss": 0.02638022042810917, + "ewc_loss_diag": 1.6689300537109375e-05, + "ewc_loss_parallel": 9.656587644713e-05, + "grad_norm": 4.117467880249023, + "learning_rate": 9.508266214497667e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8659684062004089, + "num_tokens": 85774789.0, + "step": 2244 + }, + { + "epoch": 0.2855870754356952, + "ewc_loss": 0.026534412056207657, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.688708814792335e-05, + "grad_norm": 4.161757469177246, + "learning_rate": 9.512505298855447e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.853596031665802, + "num_tokens": 85817833.0, + "step": 2245 + }, + { + "epoch": 0.2857142857142857, + "ewc_loss": 0.026532869786024094, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.687167766969651e-05, + "grad_norm": 4.129350185394287, + "learning_rate": 9.516744383213225e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.865548849105835, + "num_tokens": 85854488.0, + "step": 2246 + }, + { + "epoch": 0.2858414959928762, + "ewc_loss": 0.02651885524392128, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.673151362221688e-05, + "grad_norm": 4.186058521270752, + "learning_rate": 9.520983467571005e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8443549871444702, + "num_tokens": 85893746.0, + "step": 2247 + }, + { + "epoch": 0.28596870627146675, + "ewc_loss": 0.026553485542535782, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.707782737677917e-05, + "grad_norm": 4.13242769241333, + "learning_rate": 9.525222551928783e-07, + "loss": 0.5357, + "mean_token_accuracy": 0.8382383584976196, + "num_tokens": 85932798.0, + "step": 2248 + }, + { + "epoch": 0.2860959165500572, + "ewc_loss": 0.02651333436369896, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.667631093179807e-05, + "grad_norm": 4.210715293884277, + "learning_rate": 9.529461636286562e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.858182430267334, + "num_tokens": 85964446.0, + "step": 2249 + }, + { + "epoch": 0.28622312682864776, + "ewc_loss": 0.026593856513500214, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.748154116095975e-05, + "grad_norm": 4.219738483428955, + "learning_rate": 9.533700720644341e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8657587766647339, + "num_tokens": 86000587.0, + "step": 2250 + }, + { + "epoch": 0.2863503371072383, + "ewc_loss": 0.026531772688031197, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.686069097369909e-05, + "grad_norm": 4.345856189727783, + "learning_rate": 9.537939805002118e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8489696979522705, + "num_tokens": 86042891.0, + "step": 2251 + }, + { + "epoch": 0.28647754738582876, + "ewc_loss": 0.026615051552653313, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.76934825303033e-05, + "grad_norm": 4.155991554260254, + "learning_rate": 9.542178889359898e-07, + "loss": 0.4589, + "mean_token_accuracy": 0.8556077480316162, + "num_tokens": 86079973.0, + "step": 2252 + }, + { + "epoch": 0.2866047576644193, + "ewc_loss": 0.026496557518839836, + "ewc_loss_diag": 1.6808509826660156e-05, + "ewc_loss_parallel": 9.650854190113023e-05, + "grad_norm": 4.125939846038818, + "learning_rate": 9.546417973717677e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.850741982460022, + "num_tokens": 86119767.0, + "step": 2253 + }, + { + "epoch": 0.2867319679430098, + "ewc_loss": 0.02670050412416458, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.732730541145429e-05, + "grad_norm": 4.121651649475098, + "learning_rate": 9.550657058075455e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8445258736610413, + "num_tokens": 86163176.0, + "step": 2254 + }, + { + "epoch": 0.2868591782216003, + "ewc_loss": 0.026660028845071793, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.692256571725011e-05, + "grad_norm": 4.141068935394287, + "learning_rate": 9.554896142433234e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.8442244529724121, + "num_tokens": 86201523.0, + "step": 2255 + }, + { + "epoch": 0.2869863885001908, + "ewc_loss": 0.026718920096755028, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.751146717462689e-05, + "grad_norm": 4.181577682495117, + "learning_rate": 9.559135226791012e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.835105836391449, + "num_tokens": 86240277.0, + "step": 2256 + }, + { + "epoch": 0.28711359877878134, + "ewc_loss": 0.026708170771598816, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.740397217683494e-05, + "grad_norm": 4.02284049987793, + "learning_rate": 9.563374311148793e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8423340320587158, + "num_tokens": 86289714.0, + "step": 2257 + }, + { + "epoch": 0.2872408090573718, + "ewc_loss": 0.026718419045209885, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.750645403983071e-05, + "grad_norm": 4.2521820068359375, + "learning_rate": 9.56761339550657e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8577107191085815, + "num_tokens": 86320786.0, + "step": 2258 + }, + { + "epoch": 0.28736801933596234, + "ewc_loss": 0.02684769779443741, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.87992316368036e-05, + "grad_norm": 4.1164984703063965, + "learning_rate": 9.57185247986435e-07, + "loss": 0.4463, + "mean_token_accuracy": 0.8586559295654297, + "num_tokens": 86358002.0, + "step": 2259 + }, + { + "epoch": 0.28749522961455287, + "ewc_loss": 0.026716725900769234, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.748952288646251e-05, + "grad_norm": 4.099699974060059, + "learning_rate": 9.576091564222128e-07, + "loss": 0.4146, + "mean_token_accuracy": 0.8681315183639526, + "num_tokens": 86395334.0, + "step": 2260 + }, + { + "epoch": 0.28762243989314334, + "ewc_loss": 0.026798874139785767, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.831101488089189e-05, + "grad_norm": 4.167509078979492, + "learning_rate": 9.580330648579906e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8426116704940796, + "num_tokens": 86432616.0, + "step": 2261 + }, + { + "epoch": 0.2877496501717339, + "ewc_loss": 0.02694743312895298, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.857589611783624e-05, + "grad_norm": 4.199118614196777, + "learning_rate": 9.584569732937685e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8461466431617737, + "num_tokens": 86472293.0, + "step": 2262 + }, + { + "epoch": 0.2878768604503244, + "ewc_loss": 0.026812199503183365, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.844425949268043e-05, + "grad_norm": 4.1791486740112305, + "learning_rate": 9.588808817295463e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8471472263336182, + "num_tokens": 86506304.0, + "step": 2263 + }, + { + "epoch": 0.2880040707289149, + "ewc_loss": 0.02694125473499298, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.851412323769182e-05, + "grad_norm": 4.201639652252197, + "learning_rate": 9.593047901653242e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8491184115409851, + "num_tokens": 86540520.0, + "step": 2264 + }, + { + "epoch": 0.2881312810075054, + "ewc_loss": 0.02682148478925228, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.853711526375264e-05, + "grad_norm": 4.149148941040039, + "learning_rate": 9.597286986011022e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8619268536567688, + "num_tokens": 86576956.0, + "step": 2265 + }, + { + "epoch": 0.28825849128609593, + "ewc_loss": 0.026939276605844498, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.849432535702363e-05, + "grad_norm": 4.231262683868408, + "learning_rate": 9.601526070368799e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8383843302726746, + "num_tokens": 86613799.0, + "step": 2266 + }, + { + "epoch": 0.2883857015646864, + "ewc_loss": 0.02698655053973198, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.896707342704758e-05, + "grad_norm": 4.214898109436035, + "learning_rate": 9.60576515472658e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8544884920120239, + "num_tokens": 86650338.0, + "step": 2267 + }, + { + "epoch": 0.28851291184327693, + "ewc_loss": 0.026931436732411385, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.84159269137308e-05, + "grad_norm": 4.205441474914551, + "learning_rate": 9.610004239084358e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8500301241874695, + "num_tokens": 86691605.0, + "step": 2268 + }, + { + "epoch": 0.28864012212186746, + "ewc_loss": 0.02693963423371315, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.849789785221219e-05, + "grad_norm": 4.179964542388916, + "learning_rate": 9.614243323442136e-07, + "loss": 0.5413, + "mean_token_accuracy": 0.8285342454910278, + "num_tokens": 86728767.0, + "step": 2269 + }, + { + "epoch": 0.28876733240045793, + "ewc_loss": 0.02680409699678421, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.83632417046465e-05, + "grad_norm": 4.248076438903809, + "learning_rate": 9.618482407799915e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.855837345123291, + "num_tokens": 86765240.0, + "step": 2270 + }, + { + "epoch": 0.28889454267904846, + "ewc_loss": 0.02685260772705078, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.884834435069934e-05, + "grad_norm": 4.103967189788818, + "learning_rate": 9.622721492157693e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.8683998584747314, + "num_tokens": 86804042.0, + "step": 2271 + }, + { + "epoch": 0.289021752957639, + "ewc_loss": 0.026762284338474274, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.794511424843222e-05, + "grad_norm": 4.280829429626465, + "learning_rate": 9.626960576515472e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8523327708244324, + "num_tokens": 86835220.0, + "step": 2272 + }, + { + "epoch": 0.28914896323622946, + "ewc_loss": 0.026909075677394867, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.941301686922088e-05, + "grad_norm": 4.176416397094727, + "learning_rate": 9.63119966087325e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8525665402412415, + "num_tokens": 86873414.0, + "step": 2273 + }, + { + "epoch": 0.28927617351482, + "ewc_loss": 0.026782220229506493, + "ewc_loss_diag": 1.6927719116210938e-05, + "ewc_loss_parallel": 9.814446821110323e-05, + "grad_norm": 4.161553382873535, + "learning_rate": 9.635438745231029e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.860542893409729, + "num_tokens": 86914272.0, + "step": 2274 + }, + { + "epoch": 0.2894033837934105, + "ewc_loss": 0.02697567641735077, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.8858326964546e-05, + "grad_norm": 4.195605754852295, + "learning_rate": 9.63967782958881e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8439854383468628, + "num_tokens": 86952425.0, + "step": 2275 + }, + { + "epoch": 0.289530594072001, + "ewc_loss": 0.026990432292222977, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.900589066091925e-05, + "grad_norm": 4.204131603240967, + "learning_rate": 9.643916913946588e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8589168787002563, + "num_tokens": 86987842.0, + "step": 2276 + }, + { + "epoch": 0.2896578043505915, + "ewc_loss": 0.026987574994564056, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.897732525132596e-05, + "grad_norm": 4.119686603546143, + "learning_rate": 9.648155998304366e-07, + "loss": 0.4213, + "mean_token_accuracy": 0.8675729036331177, + "num_tokens": 87026750.0, + "step": 2277 + }, + { + "epoch": 0.28978501462918205, + "ewc_loss": 0.026954106986522675, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.864262392511591e-05, + "grad_norm": 4.180832862854004, + "learning_rate": 9.652395082662145e-07, + "loss": 0.5247, + "mean_token_accuracy": 0.8333524465560913, + "num_tokens": 87069685.0, + "step": 2278 + }, + { + "epoch": 0.2899122249077726, + "ewc_loss": 0.02702215500175953, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.932310786098242e-05, + "grad_norm": 4.1882100105285645, + "learning_rate": 9.656634167019923e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8454652428627014, + "num_tokens": 87107054.0, + "step": 2279 + }, + { + "epoch": 0.29003943518636305, + "ewc_loss": 0.02698090672492981, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.891062654787675e-05, + "grad_norm": 4.168423175811768, + "learning_rate": 9.660873251377701e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8466260433197021, + "num_tokens": 87146254.0, + "step": 2280 + }, + { + "epoch": 0.2901666454649536, + "ewc_loss": 0.027095384895801544, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.883470193017274e-05, + "grad_norm": 4.256120204925537, + "learning_rate": 9.66511233573548e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8423689007759094, + "num_tokens": 87182404.0, + "step": 2281 + }, + { + "epoch": 0.2902938557435441, + "ewc_loss": 0.027144530788064003, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.932617103913799e-05, + "grad_norm": 4.192575931549072, + "learning_rate": 9.669351420093258e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8607295155525208, + "num_tokens": 87216551.0, + "step": 2282 + }, + { + "epoch": 0.2904210660221346, + "ewc_loss": 0.027057847008109093, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.84593280008994e-05, + "grad_norm": 4.237518787384033, + "learning_rate": 9.67359050445104e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.8509136438369751, + "num_tokens": 87249402.0, + "step": 2283 + }, + { + "epoch": 0.2905482763007251, + "ewc_loss": 0.027034441009163857, + "ewc_loss_diag": 1.704692840576172e-05, + "ewc_loss_parallel": 9.944596968125552e-05, + "grad_norm": 4.1959333419799805, + "learning_rate": 9.677829588808817e-07, + "loss": 0.4535, + "mean_token_accuracy": 0.8542639017105103, + "num_tokens": 87281846.0, + "step": 2284 + }, + { + "epoch": 0.29067548657931563, + "ewc_loss": 0.027111146599054337, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.89923209999688e-05, + "grad_norm": 4.262866973876953, + "learning_rate": 9.682068673166596e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8402563333511353, + "num_tokens": 87315765.0, + "step": 2285 + }, + { + "epoch": 0.2908026968579061, + "ewc_loss": 0.027169395238161087, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.957481233868748e-05, + "grad_norm": 4.201237678527832, + "learning_rate": 9.686307757524374e-07, + "loss": 0.467, + "mean_token_accuracy": 0.851524829864502, + "num_tokens": 87353668.0, + "step": 2286 + }, + { + "epoch": 0.29092990713649663, + "ewc_loss": 0.027120286598801613, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.908372157951817e-05, + "grad_norm": 4.212857246398926, + "learning_rate": 9.690546841882153e-07, + "loss": 0.3999, + "mean_token_accuracy": 0.8731219172477722, + "num_tokens": 87390747.0, + "step": 2287 + }, + { + "epoch": 0.29105711741508716, + "ewc_loss": 0.02716284617781639, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.950932872015983e-05, + "grad_norm": 4.196846961975098, + "learning_rate": 9.694785926239931e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8511006236076355, + "num_tokens": 87430686.0, + "step": 2288 + }, + { + "epoch": 0.29118432769367764, + "ewc_loss": 0.027138400822877884, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.926487837219611e-05, + "grad_norm": 4.260837078094482, + "learning_rate": 9.69902501059771e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8375680446624756, + "num_tokens": 87465290.0, + "step": 2289 + }, + { + "epoch": 0.29131153797226816, + "ewc_loss": 0.027181655168533325, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.969739767257124e-05, + "grad_norm": 4.18019437789917, + "learning_rate": 9.703264094955488e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8457477688789368, + "num_tokens": 87504553.0, + "step": 2290 + }, + { + "epoch": 0.2914387482508587, + "ewc_loss": 0.027126066386699677, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.914153633872047e-05, + "grad_norm": 4.159209728240967, + "learning_rate": 9.707503179313269e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8571441173553467, + "num_tokens": 87544896.0, + "step": 2291 + }, + { + "epoch": 0.29156595852944917, + "ewc_loss": 0.027156751602888107, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.944837802322581e-05, + "grad_norm": 4.216180801391602, + "learning_rate": 9.711742263671047e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8555909395217896, + "num_tokens": 87582075.0, + "step": 2292 + }, + { + "epoch": 0.2916931688080397, + "ewc_loss": 0.027190499007701874, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.978584421332926e-05, + "grad_norm": 4.177594184875488, + "learning_rate": 9.715981348028826e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8568882942199707, + "num_tokens": 87623117.0, + "step": 2293 + }, + { + "epoch": 0.2918203790866302, + "ewc_loss": 0.027155356481671333, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.943442273652181e-05, + "grad_norm": 4.276576519012451, + "learning_rate": 9.720220432386604e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8548682928085327, + "num_tokens": 87656263.0, + "step": 2294 + }, + { + "epoch": 0.2919475893652207, + "ewc_loss": 0.027191031724214554, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.979116293834522e-05, + "grad_norm": 4.1478729248046875, + "learning_rate": 9.724459516744383e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.8617826700210571, + "num_tokens": 87695852.0, + "step": 2295 + }, + { + "epoch": 0.2920747996438112, + "ewc_loss": 0.02715224400162697, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.940328891389072e-05, + "grad_norm": 4.325174331665039, + "learning_rate": 9.728698601102161e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.861864447593689, + "num_tokens": 87734331.0, + "step": 2296 + }, + { + "epoch": 0.29220200992240175, + "ewc_loss": 0.02726837620139122, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010056462633656338, + "grad_norm": 4.2443742752075195, + "learning_rate": 9.73293768545994e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8475891351699829, + "num_tokens": 87768447.0, + "step": 2297 + }, + { + "epoch": 0.2923292202009922, + "ewc_loss": 0.027142155915498734, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.93024223134853e-05, + "grad_norm": 4.169999599456787, + "learning_rate": 9.737176769817718e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8465529680252075, + "num_tokens": 87813213.0, + "step": 2298 + }, + { + "epoch": 0.29245643047958275, + "ewc_loss": 0.02718373015522957, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.971817053155974e-05, + "grad_norm": 4.217676639556885, + "learning_rate": 9.741415854175499e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8366884589195251, + "num_tokens": 87851336.0, + "step": 2299 + }, + { + "epoch": 0.2925836407581733, + "ewc_loss": 0.027227336540818214, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010015422594733536, + "grad_norm": 4.299381256103516, + "learning_rate": 9.745654938533277e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8468004465103149, + "num_tokens": 87888474.0, + "step": 2300 + }, + { + "epoch": 0.29271085103676375, + "ewc_loss": 0.027232438325881958, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.0001002052376861684, + "grad_norm": 4.202293395996094, + "learning_rate": 9.749894022891056e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8414973616600037, + "num_tokens": 87928232.0, + "step": 2301 + }, + { + "epoch": 0.2928380613153543, + "ewc_loss": 0.027166282758116722, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.9543685792014e-05, + "grad_norm": 4.208535671234131, + "learning_rate": 9.754133107248834e-07, + "loss": 0.4304, + "mean_token_accuracy": 0.8643025755882263, + "num_tokens": 87966982.0, + "step": 2302 + }, + { + "epoch": 0.2929652715939448, + "ewc_loss": 0.027229908853769302, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010017993918154389, + "grad_norm": 4.340771675109863, + "learning_rate": 9.758372191606612e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8355975151062012, + "num_tokens": 88003550.0, + "step": 2303 + }, + { + "epoch": 0.2930924818725353, + "ewc_loss": 0.027260392904281616, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010048478725366294, + "grad_norm": 4.182411193847656, + "learning_rate": 9.76261127596439e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8659090995788574, + "num_tokens": 88038038.0, + "step": 2304 + }, + { + "epoch": 0.2932196921511258, + "ewc_loss": 0.027155816555023193, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.94390356936492e-05, + "grad_norm": 4.245681285858154, + "learning_rate": 9.76685036032217e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.8398764133453369, + "num_tokens": 88074627.0, + "step": 2305 + }, + { + "epoch": 0.29334690242971634, + "ewc_loss": 0.027374975383281708, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.000100409917649813, + "grad_norm": 4.13823127746582, + "learning_rate": 9.771089444679948e-07, + "loss": 0.444, + "mean_token_accuracy": 0.859970211982727, + "num_tokens": 88116772.0, + "step": 2306 + }, + { + "epoch": 0.2934741127083068, + "ewc_loss": 0.027182672172784805, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 9.970757673727348e-05, + "grad_norm": 4.283344745635986, + "learning_rate": 9.775328529037728e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8458787202835083, + "num_tokens": 88152888.0, + "step": 2307 + }, + { + "epoch": 0.29360132298689734, + "ewc_loss": 0.02727871760725975, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010066804679809138, + "grad_norm": 4.280008316040039, + "learning_rate": 9.779567613395507e-07, + "loss": 0.5174, + "mean_token_accuracy": 0.8404483199119568, + "num_tokens": 88184544.0, + "step": 2308 + }, + { + "epoch": 0.29372853326548787, + "ewc_loss": 0.0272417850792408, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010029871191363782, + "grad_norm": 4.231814384460449, + "learning_rate": 9.783806697753285e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.856468677520752, + "num_tokens": 88221603.0, + "step": 2309 + }, + { + "epoch": 0.29385574354407834, + "ewc_loss": 0.02724194899201393, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010030035628005862, + "grad_norm": 4.16232442855835, + "learning_rate": 9.788045782111064e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.8622917532920837, + "num_tokens": 88265109.0, + "step": 2310 + }, + { + "epoch": 0.29398295382266887, + "ewc_loss": 0.02735278382897377, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.0001001880009425804, + "grad_norm": 4.190937519073486, + "learning_rate": 9.792284866468842e-07, + "loss": 0.4261, + "mean_token_accuracy": 0.8627902865409851, + "num_tokens": 88302756.0, + "step": 2311 + }, + { + "epoch": 0.2941101641012594, + "ewc_loss": 0.027276385575532913, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.0001006447128020227, + "grad_norm": 4.247440814971924, + "learning_rate": 9.79652395082662e-07, + "loss": 0.426, + "mean_token_accuracy": 0.8612978458404541, + "num_tokens": 88335605.0, + "step": 2312 + }, + { + "epoch": 0.29423737437984987, + "ewc_loss": 0.027378913015127182, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010044927330454811, + "grad_norm": 4.250777244567871, + "learning_rate": 9.8007630351844e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8452107906341553, + "num_tokens": 88369447.0, + "step": 2313 + }, + { + "epoch": 0.2943645846584404, + "ewc_loss": 0.02728387899696827, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010071964788949117, + "grad_norm": 4.289389133453369, + "learning_rate": 9.805002119542178e-07, + "loss": 0.5258, + "mean_token_accuracy": 0.8357870578765869, + "num_tokens": 88404442.0, + "step": 2314 + }, + { + "epoch": 0.2944917949370309, + "ewc_loss": 0.027305344119668007, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.0001009342959150672, + "grad_norm": 4.192586421966553, + "learning_rate": 9.809241203899958e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.85390305519104, + "num_tokens": 88445870.0, + "step": 2315 + }, + { + "epoch": 0.2946190052156214, + "ewc_loss": 0.027380555868148804, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010046571696875617, + "grad_norm": 4.270421504974365, + "learning_rate": 9.813480288257737e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8441653251647949, + "num_tokens": 88480994.0, + "step": 2316 + }, + { + "epoch": 0.2947462154942119, + "ewc_loss": 0.02746891975402832, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010134935291716829, + "grad_norm": 4.225152492523193, + "learning_rate": 9.817719372615515e-07, + "loss": 0.4423, + "mean_token_accuracy": 0.8614344000816345, + "num_tokens": 88519666.0, + "step": 2317 + }, + { + "epoch": 0.29487342577280246, + "ewc_loss": 0.027415066957473755, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010081081563839689, + "grad_norm": 4.286289691925049, + "learning_rate": 9.821958456973294e-07, + "loss": 0.4158, + "mean_token_accuracy": 0.867334246635437, + "num_tokens": 88553855.0, + "step": 2318 + }, + { + "epoch": 0.29500063605139293, + "ewc_loss": 0.027456218376755714, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010122233652509749, + "grad_norm": 4.284371852874756, + "learning_rate": 9.826197541331072e-07, + "loss": 0.4317, + "mean_token_accuracy": 0.8653585910797119, + "num_tokens": 88589053.0, + "step": 2319 + }, + { + "epoch": 0.29512784632998346, + "ewc_loss": 0.027413014322519302, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.0001007903047138825, + "grad_norm": 4.246089935302734, + "learning_rate": 9.83043662568885e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8473811745643616, + "num_tokens": 88627067.0, + "step": 2320 + }, + { + "epoch": 0.295255056608574, + "ewc_loss": 0.027526523917913437, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010070469579659402, + "grad_norm": 4.188137531280518, + "learning_rate": 9.83467571004663e-07, + "loss": 0.4344, + "mean_token_accuracy": 0.8623553514480591, + "num_tokens": 88668030.0, + "step": 2321 + }, + { + "epoch": 0.29538226688716446, + "ewc_loss": 0.02739904820919037, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010065064270747826, + "grad_norm": 4.224392890930176, + "learning_rate": 9.838914794404407e-07, + "loss": 0.4332, + "mean_token_accuracy": 0.8644828200340271, + "num_tokens": 88708889.0, + "step": 2322 + }, + { + "epoch": 0.295509477165755, + "ewc_loss": 0.027447979897260666, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010113994358107448, + "grad_norm": 4.349998950958252, + "learning_rate": 9.843153878762188e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8450647592544556, + "num_tokens": 88745689.0, + "step": 2323 + }, + { + "epoch": 0.2956366874443455, + "ewc_loss": 0.027468787506222725, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010134803596884012, + "grad_norm": 4.228705883026123, + "learning_rate": 9.847392963119966e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8585615158081055, + "num_tokens": 88784667.0, + "step": 2324 + }, + { + "epoch": 0.295763897722936, + "ewc_loss": 0.027263227850198746, + "ewc_loss_diag": 1.71661376953125e-05, + "ewc_loss_parallel": 0.00010051314893644303, + "grad_norm": 4.210781097412109, + "learning_rate": 9.851632047477745e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8526395559310913, + "num_tokens": 88827334.0, + "step": 2325 + }, + { + "epoch": 0.2958911080015265, + "ewc_loss": 0.027447864413261414, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010113878670381382, + "grad_norm": 4.225049018859863, + "learning_rate": 9.855871131835523e-07, + "loss": 0.3978, + "mean_token_accuracy": 0.87325519323349, + "num_tokens": 88863744.0, + "step": 2326 + }, + { + "epoch": 0.29601831828011704, + "ewc_loss": 0.02742002345621586, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010086038673762232, + "grad_norm": 4.213756084442139, + "learning_rate": 9.860110216193302e-07, + "loss": 0.4301, + "mean_token_accuracy": 0.8654673099517822, + "num_tokens": 88903693.0, + "step": 2327 + }, + { + "epoch": 0.2961455285587075, + "ewc_loss": 0.027448540553450584, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010114556062035263, + "grad_norm": 4.275243282318115, + "learning_rate": 9.86434930055108e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8651678562164307, + "num_tokens": 88936668.0, + "step": 2328 + }, + { + "epoch": 0.29627273883729804, + "ewc_loss": 0.0274581927806139, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010124208347406238, + "grad_norm": 4.319642066955566, + "learning_rate": 9.868588384908859e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8598885536193848, + "num_tokens": 88970920.0, + "step": 2329 + }, + { + "epoch": 0.2963999491158886, + "ewc_loss": 0.027459492906928062, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010125508561031893, + "grad_norm": 4.227762222290039, + "learning_rate": 9.872827469266637e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8582397103309631, + "num_tokens": 89010856.0, + "step": 2330 + }, + { + "epoch": 0.2965271593944791, + "ewc_loss": 0.02741052582859993, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010076542821479961, + "grad_norm": 4.288216590881348, + "learning_rate": 9.877066553624418e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.858847975730896, + "num_tokens": 89047109.0, + "step": 2331 + }, + { + "epoch": 0.2966543696730696, + "ewc_loss": 0.027449127286672592, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010115141776623204, + "grad_norm": 4.171248912811279, + "learning_rate": 9.881305637982196e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8580116033554077, + "num_tokens": 89087859.0, + "step": 2332 + }, + { + "epoch": 0.2967815799516601, + "ewc_loss": 0.0274238009005785, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010089816350955516, + "grad_norm": 4.27445650100708, + "learning_rate": 9.885544722339975e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.8607070446014404, + "num_tokens": 89123601.0, + "step": 2333 + }, + { + "epoch": 0.29690879023025063, + "ewc_loss": 0.02749379351735115, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010159808880416676, + "grad_norm": 4.216673374176025, + "learning_rate": 9.889783806697753e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8683667182922363, + "num_tokens": 89160133.0, + "step": 2334 + }, + { + "epoch": 0.2970360005088411, + "ewc_loss": 0.027428220957517624, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010094235767610371, + "grad_norm": 4.255285739898682, + "learning_rate": 9.894022891055532e-07, + "loss": 0.4145, + "mean_token_accuracy": 0.8696882724761963, + "num_tokens": 89196731.0, + "step": 2335 + }, + { + "epoch": 0.29716321078743163, + "ewc_loss": 0.027492189779877663, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.0001015820525935851, + "grad_norm": 4.249632358551025, + "learning_rate": 9.89826197541331e-07, + "loss": 0.4035, + "mean_token_accuracy": 0.8723037242889404, + "num_tokens": 89230159.0, + "step": 2336 + }, + { + "epoch": 0.29729042106602216, + "ewc_loss": 0.027477426454424858, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010143441613763571, + "grad_norm": 4.306885719299316, + "learning_rate": 9.902501059771089e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8510830402374268, + "num_tokens": 89264648.0, + "step": 2337 + }, + { + "epoch": 0.29741763134461263, + "ewc_loss": 0.0275326669216156, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010198681411566213, + "grad_norm": 4.240396022796631, + "learning_rate": 9.906740144128867e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8516309857368469, + "num_tokens": 89302505.0, + "step": 2338 + }, + { + "epoch": 0.29754484162320316, + "ewc_loss": 0.02749025821685791, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010156274220207706, + "grad_norm": 4.31798791885376, + "learning_rate": 9.910979228486648e-07, + "loss": 0.4255, + "mean_token_accuracy": 0.8646640777587891, + "num_tokens": 89338079.0, + "step": 2339 + }, + { + "epoch": 0.2976720519017937, + "ewc_loss": 0.027535373345017433, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.0001020138879539445, + "grad_norm": 4.257331848144531, + "learning_rate": 9.915218312844426e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8504661321640015, + "num_tokens": 89376717.0, + "step": 2340 + }, + { + "epoch": 0.29779926218038416, + "ewc_loss": 0.027517307549715042, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010183324047829956, + "grad_norm": 4.297347068786621, + "learning_rate": 9.919457397202205e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8484818935394287, + "num_tokens": 89416164.0, + "step": 2341 + }, + { + "epoch": 0.2979264724589747, + "ewc_loss": 0.02755754441022873, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.0001022356009343639, + "grad_norm": 4.307487964630127, + "learning_rate": 9.923696481559983e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.863448977470398, + "num_tokens": 89446491.0, + "step": 2342 + }, + { + "epoch": 0.2980536827375652, + "ewc_loss": 0.027522489428520203, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010188505257247016, + "grad_norm": 4.283823013305664, + "learning_rate": 9.927935565917761e-07, + "loss": 0.4823, + "mean_token_accuracy": 0.845282793045044, + "num_tokens": 89483637.0, + "step": 2343 + }, + { + "epoch": 0.2981808930161557, + "ewc_loss": 0.027538640424609184, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010204655700363219, + "grad_norm": 4.221536636352539, + "learning_rate": 9.93217465027554e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8548386693000793, + "num_tokens": 89526532.0, + "step": 2344 + }, + { + "epoch": 0.2983081032947462, + "ewc_loss": 0.027523979544639587, + "ewc_loss_diag": 1.728534698486328e-05, + "ewc_loss_parallel": 0.00010189994645770639, + "grad_norm": 4.397269248962402, + "learning_rate": 9.936413734633318e-07, + "loss": 0.5371, + "mean_token_accuracy": 0.8368799686431885, + "num_tokens": 89558353.0, + "step": 2345 + }, + { + "epoch": 0.29843531357333675, + "ewc_loss": 0.027738573029637337, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.0001028251790557988, + "grad_norm": 4.235995292663574, + "learning_rate": 9.940652818991097e-07, + "loss": 0.4289, + "mean_token_accuracy": 0.8643639087677002, + "num_tokens": 89598727.0, + "step": 2346 + }, + { + "epoch": 0.2985625238519272, + "ewc_loss": 0.027639836072921753, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010183781705563888, + "grad_norm": 4.261401176452637, + "learning_rate": 9.944891903348877e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8448375463485718, + "num_tokens": 89640093.0, + "step": 2347 + }, + { + "epoch": 0.29868973413051775, + "ewc_loss": 0.027722129598259926, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010266074968967587, + "grad_norm": 4.169066429138184, + "learning_rate": 9.949130987706656e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8594968318939209, + "num_tokens": 89681401.0, + "step": 2348 + }, + { + "epoch": 0.2988169444091083, + "ewc_loss": 0.02767988294363022, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010223827848676592, + "grad_norm": 4.275615692138672, + "learning_rate": 9.953370072064432e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8614009022712708, + "num_tokens": 89720366.0, + "step": 2349 + }, + { + "epoch": 0.29894415468769875, + "ewc_loss": 0.027798080816864967, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010342025780119002, + "grad_norm": 4.231077671051025, + "learning_rate": 9.957609156422213e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.839535117149353, + "num_tokens": 89762234.0, + "step": 2350 + }, + { + "epoch": 0.2990713649662893, + "ewc_loss": 0.027708666399121284, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010252611536998302, + "grad_norm": 4.350854396820068, + "learning_rate": 9.961848240779991e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8562678098678589, + "num_tokens": 89792208.0, + "step": 2351 + }, + { + "epoch": 0.2991985752448798, + "ewc_loss": 0.027818119153380394, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010362064494984224, + "grad_norm": 4.203037261962891, + "learning_rate": 9.96608732513777e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8654755353927612, + "num_tokens": 89833246.0, + "step": 2352 + }, + { + "epoch": 0.2993257855234703, + "ewc_loss": 0.027694493532180786, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010238440154353157, + "grad_norm": 4.224205017089844, + "learning_rate": 9.970326409495548e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8433676958084106, + "num_tokens": 89873271.0, + "step": 2353 + }, + { + "epoch": 0.2994529958020608, + "ewc_loss": 0.027794983237981796, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010338928404962644, + "grad_norm": 4.2637786865234375, + "learning_rate": 9.974565493853327e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8396103382110596, + "num_tokens": 89913069.0, + "step": 2354 + }, + { + "epoch": 0.29958020608065133, + "ewc_loss": 0.027791827917099, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010335772094549611, + "grad_norm": 4.3336100578308105, + "learning_rate": 9.978804578211107e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.845016598701477, + "num_tokens": 89948915.0, + "step": 2355 + }, + { + "epoch": 0.2997074163592418, + "ewc_loss": 0.027827316895127296, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010371262033004314, + "grad_norm": 4.357710838317871, + "learning_rate": 9.983043662568886e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.8481001853942871, + "num_tokens": 89982123.0, + "step": 2356 + }, + { + "epoch": 0.29983462663783234, + "ewc_loss": 0.02779717929661274, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010341124288970605, + "grad_norm": 4.2623982429504395, + "learning_rate": 9.987282746926662e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8612804412841797, + "num_tokens": 90018259.0, + "step": 2357 + }, + { + "epoch": 0.29996183691642286, + "ewc_loss": 0.027793671935796738, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.0001033761800499633, + "grad_norm": 4.308460712432861, + "learning_rate": 9.991521831284443e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.8561813831329346, + "num_tokens": 90059041.0, + "step": 2358 + }, + { + "epoch": 0.30008904719501334, + "ewc_loss": 0.02783159725368023, + "ewc_loss_diag": 1.7404556274414062e-05, + "ewc_loss_parallel": 0.00010375542478868738, + "grad_norm": 4.302350044250488, + "learning_rate": 9.995760915642221e-07, + "loss": 0.4089, + "mean_token_accuracy": 0.8707852363586426, + "num_tokens": 90089463.0, + "step": 2359 + }, + { + "epoch": 0.30021625747360386, + "ewc_loss": 0.027938880026340485, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010360756277805194, + "grad_norm": 4.285674095153809, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8624886870384216, + "num_tokens": 90125691.0, + "step": 2360 + }, + { + "epoch": 0.3003434677521944, + "ewc_loss": 0.027933478355407715, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010355353151680902, + "grad_norm": 4.289369106292725, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8532774448394775, + "num_tokens": 90161778.0, + "step": 2361 + }, + { + "epoch": 0.30047067803078487, + "ewc_loss": 0.027933355420827866, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010355230187997222, + "grad_norm": 4.256510257720947, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8393881320953369, + "num_tokens": 90202243.0, + "step": 2362 + }, + { + "epoch": 0.3005978883093754, + "ewc_loss": 0.02792850323021412, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010350377851864323, + "grad_norm": 4.241493225097656, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8454740047454834, + "num_tokens": 90248924.0, + "step": 2363 + }, + { + "epoch": 0.3007250985879659, + "ewc_loss": 0.02793586626648903, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010357740393374115, + "grad_norm": 4.3929829597473145, + "learning_rate": 1e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.8255935311317444, + "num_tokens": 90281988.0, + "step": 2364 + }, + { + "epoch": 0.3008523088665564, + "ewc_loss": 0.02800554782152176, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.0001042742223944515, + "grad_norm": 4.267470359802246, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8491145968437195, + "num_tokens": 90317050.0, + "step": 2365 + }, + { + "epoch": 0.3009795191451469, + "ewc_loss": 0.02789997309446335, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010321847366867587, + "grad_norm": 4.265848159790039, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8616742491722107, + "num_tokens": 90358064.0, + "step": 2366 + }, + { + "epoch": 0.30110672942373745, + "ewc_loss": 0.027974463999271393, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010396337893325835, + "grad_norm": 4.246417999267578, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8424558639526367, + "num_tokens": 90400692.0, + "step": 2367 + }, + { + "epoch": 0.3012339397023279, + "ewc_loss": 0.028219811618328094, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010397547157481313, + "grad_norm": 12.1197509765625, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8454865217208862, + "num_tokens": 90432207.0, + "step": 2368 + }, + { + "epoch": 0.30136114998091845, + "ewc_loss": 0.03394773602485657, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.0001612547057447955, + "grad_norm": 5.661562919616699, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8509881496429443, + "num_tokens": 90466081.0, + "step": 2369 + }, + { + "epoch": 0.301488360259509, + "ewc_loss": 0.028132406994700432, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010310141078662127, + "grad_norm": 3.8884308338165283, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.860472559928894, + "num_tokens": 90501963.0, + "step": 2370 + }, + { + "epoch": 0.30161557053809945, + "ewc_loss": 0.029332289472222328, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00011510023614391685, + "grad_norm": 5.050380229949951, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8647112250328064, + "num_tokens": 90540651.0, + "step": 2371 + }, + { + "epoch": 0.30174278081669, + "ewc_loss": 0.030406862497329712, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00012584598152898252, + "grad_norm": 4.586719512939453, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8452837467193604, + "num_tokens": 90585105.0, + "step": 2372 + }, + { + "epoch": 0.3018699910952805, + "ewc_loss": 0.02860986813902855, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010787602514028549, + "grad_norm": 4.450161457061768, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.849768877029419, + "num_tokens": 90626773.0, + "step": 2373 + }, + { + "epoch": 0.301997201373871, + "ewc_loss": 0.028869275003671646, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00011291150440229103, + "grad_norm": 4.4965691566467285, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8691372871398926, + "num_tokens": 90665974.0, + "step": 2374 + }, + { + "epoch": 0.3021244116524615, + "ewc_loss": 0.028568552806973457, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010990427836077288, + "grad_norm": 4.449127674102783, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.847713828086853, + "num_tokens": 90706801.0, + "step": 2375 + }, + { + "epoch": 0.30225162193105204, + "ewc_loss": 0.028451494872570038, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010873370774788782, + "grad_norm": 4.503627777099609, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8494429588317871, + "num_tokens": 90743635.0, + "step": 2376 + }, + { + "epoch": 0.3023788322096425, + "ewc_loss": 0.028370417654514313, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010792293323902413, + "grad_norm": 4.4263811111450195, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8535075187683105, + "num_tokens": 90782797.0, + "step": 2377 + }, + { + "epoch": 0.30250604248823304, + "ewc_loss": 0.02823694795370102, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010658823885023594, + "grad_norm": 4.393780708312988, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8457379937171936, + "num_tokens": 90822661.0, + "step": 2378 + }, + { + "epoch": 0.30263325276682357, + "ewc_loss": 0.028189249336719513, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010611124889692292, + "grad_norm": 4.4337053298950195, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.83512282371521, + "num_tokens": 90857804.0, + "step": 2379 + }, + { + "epoch": 0.3027604630454141, + "ewc_loss": 0.028141740709543228, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010563616524450481, + "grad_norm": 4.353191375732422, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8517422080039978, + "num_tokens": 90895778.0, + "step": 2380 + }, + { + "epoch": 0.30288767332400457, + "ewc_loss": 0.028090544044971466, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010512417793506756, + "grad_norm": 4.363171577453613, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8596929907798767, + "num_tokens": 90934463.0, + "step": 2381 + }, + { + "epoch": 0.3030148836025951, + "ewc_loss": 0.028061585500836372, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00010483460209798068, + "grad_norm": 12.181310653686523, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8563084602355957, + "num_tokens": 90967888.0, + "step": 2382 + }, + { + "epoch": 0.3031420938811856, + "ewc_loss": 0.03394012898206711, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00016117864288389683, + "grad_norm": 5.645901203155518, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.860361635684967, + "num_tokens": 91005200.0, + "step": 2383 + }, + { + "epoch": 0.3032693041597761, + "ewc_loss": 0.028049182146787643, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010226915765088052, + "grad_norm": 3.8475863933563232, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8372432589530945, + "num_tokens": 91037840.0, + "step": 2384 + }, + { + "epoch": 0.3033965144383666, + "ewc_loss": 0.02911493182182312, + "ewc_loss_diag": 1.7523765563964844e-05, + "ewc_loss_parallel": 0.00011536807141965255, + "grad_norm": 5.032445430755615, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8400671482086182, + "num_tokens": 91073336.0, + "step": 2385 + }, + { + "epoch": 0.30352372471695716, + "ewc_loss": 0.03056510165333748, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00012742837134283036, + "grad_norm": 4.62130880355835, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8548043966293335, + "num_tokens": 91107001.0, + "step": 2386 + }, + { + "epoch": 0.30365093499554763, + "ewc_loss": 0.028639845550060272, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010817580186994746, + "grad_norm": 4.463340759277344, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8608253598213196, + "num_tokens": 91143937.0, + "step": 2387 + }, + { + "epoch": 0.30377814527413816, + "ewc_loss": 0.02914409711956978, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00011321832425892353, + "grad_norm": 4.5170979499816895, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8441624045372009, + "num_tokens": 91185391.0, + "step": 2388 + }, + { + "epoch": 0.3039053555527287, + "ewc_loss": 0.028940612450242043, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00011118346446892247, + "grad_norm": 4.414237022399902, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8519721031188965, + "num_tokens": 91227653.0, + "step": 2389 + }, + { + "epoch": 0.30403256583131916, + "ewc_loss": 0.028681905940175056, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010859640315175056, + "grad_norm": 4.430506706237793, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8730390667915344, + "num_tokens": 91263610.0, + "step": 2390 + }, + { + "epoch": 0.3041597761099097, + "ewc_loss": 0.028661776334047318, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010839510650839657, + "grad_norm": 4.527878761291504, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8480331301689148, + "num_tokens": 91296880.0, + "step": 2391 + }, + { + "epoch": 0.3042869863885002, + "ewc_loss": 0.028624601662158966, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010802334873005748, + "grad_norm": 4.421393394470215, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8413747549057007, + "num_tokens": 91336242.0, + "step": 2392 + }, + { + "epoch": 0.3044141966670907, + "ewc_loss": 0.028437837958335876, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010615573410177603, + "grad_norm": 4.408194065093994, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8616995811462402, + "num_tokens": 91376527.0, + "step": 2393 + }, + { + "epoch": 0.3045414069456812, + "ewc_loss": 0.028483325615525246, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010661059786798432, + "grad_norm": 4.4677252769470215, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8441041111946106, + "num_tokens": 91408415.0, + "step": 2394 + }, + { + "epoch": 0.30466861722427174, + "ewc_loss": 0.02840161696076393, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010579352237982675, + "grad_norm": 4.288204193115234, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8543301820755005, + "num_tokens": 91449189.0, + "step": 2395 + }, + { + "epoch": 0.3047958275028622, + "ewc_loss": 0.028292037546634674, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010469771950738505, + "grad_norm": 4.300026893615723, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8496347665786743, + "num_tokens": 91488599.0, + "step": 2396 + }, + { + "epoch": 0.30492303778145274, + "ewc_loss": 0.028365982696413994, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010543716780375689, + "grad_norm": 4.334359645843506, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8578707575798035, + "num_tokens": 91525138.0, + "step": 2397 + }, + { + "epoch": 0.30505024806004327, + "ewc_loss": 0.02833997644484043, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010517711052671075, + "grad_norm": 4.309127330780029, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8548210859298706, + "num_tokens": 91567444.0, + "step": 2398 + }, + { + "epoch": 0.30517745833863374, + "ewc_loss": 0.028289243578910828, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010466977255418897, + "grad_norm": 4.374990463256836, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8507800102233887, + "num_tokens": 91602926.0, + "step": 2399 + }, + { + "epoch": 0.3053046686172243, + "ewc_loss": 0.028336476534605026, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010514210589462891, + "grad_norm": 4.358964920043945, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8646317720413208, + "num_tokens": 91635901.0, + "step": 2400 + }, + { + "epoch": 0.3054318788958148, + "ewc_loss": 0.02827981486916542, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010457548341946676, + "grad_norm": 4.258702278137207, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8561526536941528, + "num_tokens": 91679542.0, + "step": 2401 + }, + { + "epoch": 0.3055590891744053, + "ewc_loss": 0.028277700766921043, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010455435403855518, + "grad_norm": 4.291393756866455, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.847679853439331, + "num_tokens": 91723022.0, + "step": 2402 + }, + { + "epoch": 0.3056862994529958, + "ewc_loss": 0.028300166130065918, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010477901378180832, + "grad_norm": 4.358826160430908, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8493587970733643, + "num_tokens": 91760664.0, + "step": 2403 + }, + { + "epoch": 0.30581350973158633, + "ewc_loss": 0.028315873816609383, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.0001049360798788257, + "grad_norm": 4.3571906089782715, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8490623235702515, + "num_tokens": 91805570.0, + "step": 2404 + }, + { + "epoch": 0.3059407200101768, + "ewc_loss": 0.02825370617210865, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010431440750835463, + "grad_norm": 4.293857574462891, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8455175161361694, + "num_tokens": 91843342.0, + "step": 2405 + }, + { + "epoch": 0.30606793028876733, + "ewc_loss": 0.028294861316680908, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010472595022292808, + "grad_norm": 4.282535552978516, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8444368243217468, + "num_tokens": 91883872.0, + "step": 2406 + }, + { + "epoch": 0.30619514056735786, + "ewc_loss": 0.028275588527321815, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010453322465764359, + "grad_norm": 4.290838718414307, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8523546457290649, + "num_tokens": 91923174.0, + "step": 2407 + }, + { + "epoch": 0.30632235084594833, + "ewc_loss": 0.028303751721978188, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.0001048148624249734, + "grad_norm": 4.322144985198975, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8520405292510986, + "num_tokens": 91963462.0, + "step": 2408 + }, + { + "epoch": 0.30644956112453886, + "ewc_loss": 0.028298061341047287, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010475795716047287, + "grad_norm": 4.28761100769043, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8554712533950806, + "num_tokens": 92000907.0, + "step": 2409 + }, + { + "epoch": 0.3065767714031294, + "ewc_loss": 0.02831019088625908, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010487926192581654, + "grad_norm": 4.35547399520874, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8556354641914368, + "num_tokens": 92036110.0, + "step": 2410 + }, + { + "epoch": 0.30670398168171986, + "ewc_loss": 0.02831512689590454, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010492861474631354, + "grad_norm": 4.238378047943115, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.849359929561615, + "num_tokens": 92080647.0, + "step": 2411 + }, + { + "epoch": 0.3068311919603104, + "ewc_loss": 0.028272287920117378, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010450022091390565, + "grad_norm": 4.373661041259766, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8572030663490295, + "num_tokens": 92115407.0, + "step": 2412 + }, + { + "epoch": 0.3069584022389009, + "ewc_loss": 0.02836696431040764, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010544697579462081, + "grad_norm": 4.296538829803467, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.853539764881134, + "num_tokens": 92149660.0, + "step": 2413 + }, + { + "epoch": 0.3070856125174914, + "ewc_loss": 0.028275037184357643, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010452771675772965, + "grad_norm": 4.257997035980225, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8562538623809814, + "num_tokens": 92191581.0, + "step": 2414 + }, + { + "epoch": 0.3072128227960819, + "ewc_loss": 0.02834751456975937, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010525248217163607, + "grad_norm": 4.276525497436523, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8480753302574158, + "num_tokens": 92230435.0, + "step": 2415 + }, + { + "epoch": 0.30734003307467245, + "ewc_loss": 0.028364382684230804, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010542116069700569, + "grad_norm": 4.313566207885742, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8412315845489502, + "num_tokens": 92269545.0, + "step": 2416 + }, + { + "epoch": 0.3074672433532629, + "ewc_loss": 0.028372595086693764, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.0001055032989825122, + "grad_norm": 4.255221366882324, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8605040311813354, + "num_tokens": 92307447.0, + "step": 2417 + }, + { + "epoch": 0.30759445363185345, + "ewc_loss": 0.028333401307463646, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010511135769775137, + "grad_norm": 4.275082111358643, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.852053165435791, + "num_tokens": 92349441.0, + "step": 2418 + }, + { + "epoch": 0.307721663910444, + "ewc_loss": 0.02838158793747425, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.0001055932225426659, + "grad_norm": 4.290119171142578, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8582661151885986, + "num_tokens": 92380405.0, + "step": 2419 + }, + { + "epoch": 0.30784887418903445, + "ewc_loss": 0.02839023619890213, + "ewc_loss_diag": 1.7762184143066406e-05, + "ewc_loss_parallel": 0.00010567969729891047, + "grad_norm": 4.300289630889893, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.851268470287323, + "num_tokens": 92419596.0, + "step": 2420 + }, + { + "epoch": 0.307976084467625, + "ewc_loss": 0.028398141264915466, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010575875057838857, + "grad_norm": 4.29728364944458, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8454242944717407, + "num_tokens": 92454590.0, + "step": 2421 + }, + { + "epoch": 0.3081032947462155, + "ewc_loss": 0.028406735509634018, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010584468691376969, + "grad_norm": 4.303189277648926, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8503336906433105, + "num_tokens": 92491640.0, + "step": 2422 + }, + { + "epoch": 0.308230505024806, + "ewc_loss": 0.028413813561201096, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010591548925731331, + "grad_norm": 4.287080764770508, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8450096845626831, + "num_tokens": 92528682.0, + "step": 2423 + }, + { + "epoch": 0.3083577153033965, + "ewc_loss": 0.02842537686228752, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010603110422380269, + "grad_norm": 4.278451442718506, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8603826761245728, + "num_tokens": 92566368.0, + "step": 2424 + }, + { + "epoch": 0.30848492558198704, + "ewc_loss": 0.02842850238084793, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.0001060623544617556, + "grad_norm": 4.396979331970215, + "learning_rate": 1e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8318948745727539, + "num_tokens": 92601380.0, + "step": 2425 + }, + { + "epoch": 0.3086121358605775, + "ewc_loss": 0.02850838005542755, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010686114546842873, + "grad_norm": 4.306018829345703, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8695645332336426, + "num_tokens": 92639724.0, + "step": 2426 + }, + { + "epoch": 0.30873934613916804, + "ewc_loss": 0.028407201170921326, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010584934352664277, + "grad_norm": 4.261478900909424, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8583963513374329, + "num_tokens": 92677450.0, + "step": 2427 + }, + { + "epoch": 0.30886655641775856, + "ewc_loss": 0.028476038947701454, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010653772915247828, + "grad_norm": 4.292888164520264, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8619695901870728, + "num_tokens": 92714681.0, + "step": 2428 + }, + { + "epoch": 0.3089937666963491, + "ewc_loss": 0.028435517102479935, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010613250924507156, + "grad_norm": 4.294760227203369, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8489428758621216, + "num_tokens": 92749925.0, + "step": 2429 + }, + { + "epoch": 0.30912097697493957, + "ewc_loss": 0.02846849150955677, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010646226292010397, + "grad_norm": 4.264496326446533, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8579987287521362, + "num_tokens": 92791720.0, + "step": 2430 + }, + { + "epoch": 0.3092481872535301, + "ewc_loss": 0.02845454216003418, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010632276098476723, + "grad_norm": 4.32230281829834, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8457961082458496, + "num_tokens": 92826179.0, + "step": 2431 + }, + { + "epoch": 0.3093753975321206, + "ewc_loss": 0.028523650020360947, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010701385326683521, + "grad_norm": 4.236672401428223, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8523149490356445, + "num_tokens": 92872751.0, + "step": 2432 + }, + { + "epoch": 0.3095026078107111, + "ewc_loss": 0.028441302478313446, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010619036038406193, + "grad_norm": 4.345587253570557, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8475791215896606, + "num_tokens": 92907470.0, + "step": 2433 + }, + { + "epoch": 0.3096298180893016, + "ewc_loss": 0.0285247340798378, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.0001070246726158075, + "grad_norm": 4.320988178253174, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8586864471435547, + "num_tokens": 92943321.0, + "step": 2434 + }, + { + "epoch": 0.30975702836789215, + "ewc_loss": 0.028456557542085648, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010634291538735852, + "grad_norm": 4.318572044372559, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8550567626953125, + "num_tokens": 92982136.0, + "step": 2435 + }, + { + "epoch": 0.3098842386464826, + "ewc_loss": 0.028705798089504242, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010639390529831871, + "grad_norm": 4.444903373718262, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8435555696487427, + "num_tokens": 93015974.0, + "step": 2436 + }, + { + "epoch": 0.31001144892507315, + "ewc_loss": 0.028527207672595978, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010704941087169573, + "grad_norm": 4.265373229980469, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8559736013412476, + "num_tokens": 93054010.0, + "step": 2437 + }, + { + "epoch": 0.3101386592036637, + "ewc_loss": 0.028401263058185577, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010578998626442626, + "grad_norm": 4.307315349578857, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8611938953399658, + "num_tokens": 93090226.0, + "step": 2438 + }, + { + "epoch": 0.31026586948225415, + "ewc_loss": 0.028498750180006027, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010676484816940501, + "grad_norm": 4.3261189460754395, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8499069213867188, + "num_tokens": 93130242.0, + "step": 2439 + }, + { + "epoch": 0.3103930797608447, + "ewc_loss": 0.028714295476675034, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.0001064789030351676, + "grad_norm": 12.158836364746094, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8489752411842346, + "num_tokens": 93170990.0, + "step": 2440 + }, + { + "epoch": 0.3105202900394352, + "ewc_loss": 0.034478239715099335, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.0001641183189349249, + "grad_norm": 5.679583549499512, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8456755876541138, + "num_tokens": 93205478.0, + "step": 2441 + }, + { + "epoch": 0.3106475003180257, + "ewc_loss": 0.028581442311406136, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.0001051503568305634, + "grad_norm": 3.9409472942352295, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8438512682914734, + "num_tokens": 93240952.0, + "step": 2442 + }, + { + "epoch": 0.3107747105966162, + "ewc_loss": 0.02982746809720993, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00011761062341975048, + "grad_norm": 4.988267421722412, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8429520726203918, + "num_tokens": 93278882.0, + "step": 2443 + }, + { + "epoch": 0.31090192087520674, + "ewc_loss": 0.0307148564606905, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00012648450501728803, + "grad_norm": 4.868147850036621, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8448939323425293, + "num_tokens": 93324573.0, + "step": 2444 + }, + { + "epoch": 0.3110291311537972, + "ewc_loss": 0.03074968233704567, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.0001109636141336523, + "grad_norm": 53.432456970214844, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8664801716804504, + "num_tokens": 93361049.0, + "step": 2445 + }, + { + "epoch": 0.31115634143238774, + "ewc_loss": 0.04009115695953369, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00021902679873164743, + "grad_norm": 7.0178937911987305, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8495567440986633, + "num_tokens": 93401748.0, + "step": 2446 + }, + { + "epoch": 0.31128355171097827, + "ewc_loss": 0.03281623125076294, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00014505682338494807, + "grad_norm": 4.705357551574707, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8638253808021545, + "num_tokens": 93434857.0, + "step": 2447 + }, + { + "epoch": 0.31141076198956874, + "ewc_loss": 0.03067662939429283, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00012610222620423883, + "grad_norm": 5.418334484100342, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8520520925521851, + "num_tokens": 93474681.0, + "step": 2448 + }, + { + "epoch": 0.31153797226815927, + "ewc_loss": 0.03395813703536987, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00015891730436123908, + "grad_norm": 5.813765048980713, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8512757420539856, + "num_tokens": 93510627.0, + "step": 2449 + }, + { + "epoch": 0.3116651825467498, + "ewc_loss": 0.0307452529668808, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00012678847997449338, + "grad_norm": 4.695575714111328, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8440735936164856, + "num_tokens": 93551598.0, + "step": 2450 + }, + { + "epoch": 0.31179239282534027, + "ewc_loss": 0.03031783178448677, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.0001225142477778718, + "grad_norm": 5.096822738647461, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8455801606178284, + "num_tokens": 93586622.0, + "step": 2451 + }, + { + "epoch": 0.3119196031039308, + "ewc_loss": 0.03118155151605606, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.0001311514643020928, + "grad_norm": 5.004807472229004, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8423286080360413, + "num_tokens": 93626496.0, + "step": 2452 + }, + { + "epoch": 0.3120468133825213, + "ewc_loss": 0.029632607474923134, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00011566201283130795, + "grad_norm": 4.66215705871582, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8532990217208862, + "num_tokens": 93665641.0, + "step": 2453 + }, + { + "epoch": 0.3121740236611118, + "ewc_loss": 0.029528401792049408, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00011706135410349816, + "grad_norm": 4.854896545410156, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8577783703804016, + "num_tokens": 93702135.0, + "step": 2454 + }, + { + "epoch": 0.3123012339397023, + "ewc_loss": 0.029526926577091217, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.0001170465984614566, + "grad_norm": 4.690188407897949, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8589062690734863, + "num_tokens": 93738226.0, + "step": 2455 + }, + { + "epoch": 0.31242844421829286, + "ewc_loss": 0.02895047888159752, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00011128211917821318, + "grad_norm": 4.59621524810791, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8546868562698364, + "num_tokens": 93780273.0, + "step": 2456 + }, + { + "epoch": 0.31255565449688333, + "ewc_loss": 0.029000455513596535, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00011178189743077382, + "grad_norm": 4.589001655578613, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8400566577911377, + "num_tokens": 93821977.0, + "step": 2457 + }, + { + "epoch": 0.31268286477547386, + "ewc_loss": 0.02881215512752533, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010989889415213838, + "grad_norm": 4.562876224517822, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8620525002479553, + "num_tokens": 93856084.0, + "step": 2458 + }, + { + "epoch": 0.3128100750540644, + "ewc_loss": 0.02872183918952942, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010899574408540502, + "grad_norm": 4.525556564331055, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8557122945785522, + "num_tokens": 93893491.0, + "step": 2459 + }, + { + "epoch": 0.31293728533265486, + "ewc_loss": 0.028651833534240723, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010829566599568352, + "grad_norm": 4.388208389282227, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8406909108161926, + "num_tokens": 93940240.0, + "step": 2460 + }, + { + "epoch": 0.3130644956112454, + "ewc_loss": 0.02853737398982048, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010715109237935394, + "grad_norm": 4.4915032386779785, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8461605310440063, + "num_tokens": 93974504.0, + "step": 2461 + }, + { + "epoch": 0.3131917058898359, + "ewc_loss": 0.028632577508687973, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010810311505338177, + "grad_norm": 4.464475154876709, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8552420735359192, + "num_tokens": 94009208.0, + "step": 2462 + }, + { + "epoch": 0.3133189161684264, + "ewc_loss": 0.02846779115498066, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010645525617292151, + "grad_norm": 4.361344814300537, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8524491786956787, + "num_tokens": 94049249.0, + "step": 2463 + }, + { + "epoch": 0.3134461264470169, + "ewc_loss": 0.028510749340057373, + "ewc_loss_diag": 1.7881393432617188e-05, + "ewc_loss_parallel": 0.00010688483598642051, + "grad_norm": 4.485713481903076, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.853814959526062, + "num_tokens": 94085553.0, + "step": 2464 + }, + { + "epoch": 0.31357333672560744, + "ewc_loss": 0.02866494469344616, + "ewc_loss_diag": 1.800060272216797e-05, + "ewc_loss_parallel": 0.00010720608406700194, + "grad_norm": 4.368747234344482, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8325944542884827, + "num_tokens": 94127531.0, + "step": 2465 + }, + { + "epoch": 0.3137005470041979, + "ewc_loss": 0.02867826074361801, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010611853940645233, + "grad_norm": 4.333440780639648, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8642514944076538, + "num_tokens": 94171191.0, + "step": 2466 + }, + { + "epoch": 0.31382775728278844, + "ewc_loss": 0.028615601360797882, + "ewc_loss_diag": 1.800060272216797e-05, + "ewc_loss_parallel": 0.00010671265044948086, + "grad_norm": 4.374977111816406, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8572618365287781, + "num_tokens": 94212090.0, + "step": 2467 + }, + { + "epoch": 0.313954967561379, + "ewc_loss": 0.028744341805577278, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.0001067793564288877, + "grad_norm": 4.374472618103027, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8714485764503479, + "num_tokens": 94246835.0, + "step": 2468 + }, + { + "epoch": 0.31408217783996945, + "ewc_loss": 0.028739895671606064, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010673490760382265, + "grad_norm": 4.362752437591553, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8513149619102478, + "num_tokens": 94284581.0, + "step": 2469 + }, + { + "epoch": 0.31420938811856, + "ewc_loss": 0.02875729277729988, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010690886119846255, + "grad_norm": 4.408708572387695, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8630061745643616, + "num_tokens": 94323050.0, + "step": 2470 + }, + { + "epoch": 0.3143365983971505, + "ewc_loss": 0.0287503432482481, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010683936852728948, + "grad_norm": 4.308066368103027, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8480837345123291, + "num_tokens": 94364702.0, + "step": 2471 + }, + { + "epoch": 0.314463808675741, + "ewc_loss": 0.028740983456373215, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010674576333258301, + "grad_norm": 4.35140323638916, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8594115376472473, + "num_tokens": 94404735.0, + "step": 2472 + }, + { + "epoch": 0.3145910189543315, + "ewc_loss": 0.028681756928563118, + "ewc_loss_diag": 1.800060272216797e-05, + "ewc_loss_parallel": 0.00010737420961959288, + "grad_norm": 4.373010635375977, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8390364646911621, + "num_tokens": 94444683.0, + "step": 2473 + }, + { + "epoch": 0.31471822923292203, + "ewc_loss": 0.028777029365301132, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010710624337662011, + "grad_norm": 4.34817361831665, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8545686602592468, + "num_tokens": 94484392.0, + "step": 2474 + }, + { + "epoch": 0.3148454395115125, + "ewc_loss": 0.028788436204195023, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010722029401222244, + "grad_norm": 4.384687900543213, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8598951101303101, + "num_tokens": 94524860.0, + "step": 2475 + }, + { + "epoch": 0.31497264979010303, + "ewc_loss": 0.02865845337510109, + "ewc_loss_diag": 1.800060272216797e-05, + "ewc_loss_parallel": 0.00010714116069721058, + "grad_norm": 4.352260589599609, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8493480682373047, + "num_tokens": 94565076.0, + "step": 2476 + }, + { + "epoch": 0.31509986006869356, + "ewc_loss": 0.028774484992027283, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010708079935284331, + "grad_norm": 4.367453098297119, + "learning_rate": 1e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8295627236366272, + "num_tokens": 94606191.0, + "step": 2477 + }, + { + "epoch": 0.31522707034728403, + "ewc_loss": 0.02881592884659767, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010749523062258959, + "grad_norm": 4.356843948364258, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8469330072402954, + "num_tokens": 94648021.0, + "step": 2478 + }, + { + "epoch": 0.31535428062587456, + "ewc_loss": 0.028813043609261513, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010746637417469174, + "grad_norm": 4.3990302085876465, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8641362190246582, + "num_tokens": 94686305.0, + "step": 2479 + }, + { + "epoch": 0.3154814909044651, + "ewc_loss": 0.02882455289363861, + "ewc_loss_diag": 1.811981201171875e-05, + "ewc_loss_parallel": 0.00010758147982414812, + "grad_norm": 4.347671031951904, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8501525521278381, + "num_tokens": 94728606.0, + "step": 2480 + }, + { + "epoch": 0.3156087011830556, + "ewc_loss": 0.028943605720996857, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010755128460004926, + "grad_norm": 4.317615509033203, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8558335304260254, + "num_tokens": 94770162.0, + "step": 2481 + }, + { + "epoch": 0.3157359114616461, + "ewc_loss": 0.02905520796775818, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010744661267381161, + "grad_norm": 4.3631391525268555, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.859138011932373, + "num_tokens": 94807735.0, + "step": 2482 + }, + { + "epoch": 0.3158631217402366, + "ewc_loss": 0.02900840900838375, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010819932504091412, + "grad_norm": 4.445078372955322, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8472292423248291, + "num_tokens": 94840366.0, + "step": 2483 + }, + { + "epoch": 0.31599033201882715, + "ewc_loss": 0.028976131230592728, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010787654900923371, + "grad_norm": 4.349735736846924, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8497475385665894, + "num_tokens": 94874293.0, + "step": 2484 + }, + { + "epoch": 0.3161175422974176, + "ewc_loss": 0.029064998030662537, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010754451795946807, + "grad_norm": 4.427299499511719, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8518153429031372, + "num_tokens": 94909347.0, + "step": 2485 + }, + { + "epoch": 0.31624475257600815, + "ewc_loss": 0.02904359996318817, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010855124128283933, + "grad_norm": 4.305581569671631, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8622870445251465, + "num_tokens": 94955120.0, + "step": 2486 + }, + { + "epoch": 0.3163719628545987, + "ewc_loss": 0.0289546400308609, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010766163904918358, + "grad_norm": 4.360332012176514, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8668835163116455, + "num_tokens": 94992319.0, + "step": 2487 + }, + { + "epoch": 0.31649917313318915, + "ewc_loss": 0.02905261144042015, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010864133946597576, + "grad_norm": 4.426122665405273, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8635028600692749, + "num_tokens": 95023464.0, + "step": 2488 + }, + { + "epoch": 0.3166263834117797, + "ewc_loss": 0.029053445905447006, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010864969226531684, + "grad_norm": 4.378419876098633, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8663454055786133, + "num_tokens": 95055963.0, + "step": 2489 + }, + { + "epoch": 0.3167535936903702, + "ewc_loss": 0.0290067121386528, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010818234295584261, + "grad_norm": 4.423771381378174, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8346914649009705, + "num_tokens": 95089628.0, + "step": 2490 + }, + { + "epoch": 0.3168808039689607, + "ewc_loss": 0.02905980311334133, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010871326958294958, + "grad_norm": 4.283535003662109, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8658623695373535, + "num_tokens": 95128557.0, + "step": 2491 + }, + { + "epoch": 0.3170080142475512, + "ewc_loss": 0.029001640155911446, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010813163680722937, + "grad_norm": 4.370730876922607, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8418450951576233, + "num_tokens": 95168324.0, + "step": 2492 + }, + { + "epoch": 0.31713522452614173, + "ewc_loss": 0.029083900153636932, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010895424202317372, + "grad_norm": 4.332759857177734, + "learning_rate": 1e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.833564817905426, + "num_tokens": 95208139.0, + "step": 2493 + }, + { + "epoch": 0.3172624348047322, + "ewc_loss": 0.029031068086624146, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010842592018889263, + "grad_norm": 4.380824089050293, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8473383188247681, + "num_tokens": 95247337.0, + "step": 2494 + }, + { + "epoch": 0.31738964508332274, + "ewc_loss": 0.0290873721241951, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.0001089889628929086, + "grad_norm": 4.3558549880981445, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8458147644996643, + "num_tokens": 95289208.0, + "step": 2495 + }, + { + "epoch": 0.31751685536191326, + "ewc_loss": 0.029017264023423195, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010828787344507873, + "grad_norm": 4.4072394371032715, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8513198494911194, + "num_tokens": 95329717.0, + "step": 2496 + }, + { + "epoch": 0.31764406564050374, + "ewc_loss": 0.029067829251289368, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010879351611947641, + "grad_norm": 4.345705986022949, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8343653678894043, + "num_tokens": 95371052.0, + "step": 2497 + }, + { + "epoch": 0.31777127591909426, + "ewc_loss": 0.02914208546280861, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010831539839273319, + "grad_norm": 4.313403606414795, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8723080158233643, + "num_tokens": 95412486.0, + "step": 2498 + }, + { + "epoch": 0.3178984861976848, + "ewc_loss": 0.029153790324926376, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010843243217095733, + "grad_norm": 4.386286735534668, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8689603805541992, + "num_tokens": 95446167.0, + "step": 2499 + }, + { + "epoch": 0.31802569647627527, + "ewc_loss": 0.029190240427851677, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010879693581955507, + "grad_norm": 4.348653316497803, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8473860025405884, + "num_tokens": 95485763.0, + "step": 2500 + }, + { + "epoch": 0.3181529067548658, + "ewc_loss": 0.02916276454925537, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.0001085221883840859, + "grad_norm": 4.363803863525391, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8426035642623901, + "num_tokens": 95526407.0, + "step": 2501 + }, + { + "epoch": 0.3182801170334563, + "ewc_loss": 0.02915230020880699, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010841753100976348, + "grad_norm": 4.380681037902832, + "learning_rate": 1e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8212915658950806, + "num_tokens": 95567479.0, + "step": 2502 + }, + { + "epoch": 0.3184073273120468, + "ewc_loss": 0.029160335659980774, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010849790123756975, + "grad_norm": 4.320084571838379, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8593018054962158, + "num_tokens": 95606588.0, + "step": 2503 + }, + { + "epoch": 0.3185345375906373, + "ewc_loss": 0.029155593365430832, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010845046199392527, + "grad_norm": 4.287731647491455, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8567101955413818, + "num_tokens": 95653293.0, + "step": 2504 + }, + { + "epoch": 0.31866174786922785, + "ewc_loss": 0.029158353805541992, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.0001084780742530711, + "grad_norm": 4.367866039276123, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8445563912391663, + "num_tokens": 95692424.0, + "step": 2505 + }, + { + "epoch": 0.3187889581478183, + "ewc_loss": 0.02922292798757553, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010912380093941465, + "grad_norm": 4.331375598907471, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8610883355140686, + "num_tokens": 95733679.0, + "step": 2506 + }, + { + "epoch": 0.31891616842640885, + "ewc_loss": 0.0291767381131649, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010866192315006629, + "grad_norm": 4.48645544052124, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8484954833984375, + "num_tokens": 95768211.0, + "step": 2507 + }, + { + "epoch": 0.3190433787049994, + "ewc_loss": 0.029128991067409515, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010940514039248228, + "grad_norm": 4.354035377502441, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8511301279067993, + "num_tokens": 95805974.0, + "step": 2508 + }, + { + "epoch": 0.31917058898358985, + "ewc_loss": 0.02899281494319439, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010804337944136932, + "grad_norm": 4.356483459472656, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8521156311035156, + "num_tokens": 95845463.0, + "step": 2509 + }, + { + "epoch": 0.3192977992621804, + "ewc_loss": 0.02907501719892025, + "ewc_loss_diag": 1.823902130126953e-05, + "ewc_loss_parallel": 0.00010886540985666215, + "grad_norm": 4.394734859466553, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8480479717254639, + "num_tokens": 95888148.0, + "step": 2510 + }, + { + "epoch": 0.3194250095407709, + "ewc_loss": 0.0293026901781559, + "ewc_loss_diag": 1.8477439880371094e-05, + "ewc_loss_parallel": 0.00010870072583202273, + "grad_norm": 4.339820861816406, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8746411204338074, + "num_tokens": 95925432.0, + "step": 2511 + }, + { + "epoch": 0.3195522198193614, + "ewc_loss": 0.029285665601491928, + "ewc_loss_diag": 1.8477439880371094e-05, + "ewc_loss_parallel": 0.00010853047569980845, + "grad_norm": 4.310136795043945, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8584664463996887, + "num_tokens": 95970695.0, + "step": 2512 + }, + { + "epoch": 0.3196794300979519, + "ewc_loss": 0.02942769229412079, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010873004066525027, + "grad_norm": 4.384336948394775, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8536630868911743, + "num_tokens": 96006093.0, + "step": 2513 + }, + { + "epoch": 0.31980664037654244, + "ewc_loss": 0.029228614643216133, + "ewc_loss_diag": 1.8358230590820312e-05, + "ewc_loss_parallel": 0.00010918067710008472, + "grad_norm": 4.335302352905273, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8535943627357483, + "num_tokens": 96045945.0, + "step": 2514 + }, + { + "epoch": 0.3199338506551329, + "ewc_loss": 0.029467951506376266, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010913263395195827, + "grad_norm": 4.401598930358887, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8697900772094727, + "num_tokens": 96076748.0, + "step": 2515 + }, + { + "epoch": 0.32006106093372344, + "ewc_loss": 0.029509270563721657, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010954582830891013, + "grad_norm": 4.376619815826416, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8556168079376221, + "num_tokens": 96116964.0, + "step": 2516 + }, + { + "epoch": 0.32018827121231397, + "ewc_loss": 0.029496202245354652, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.0001094151521101594, + "grad_norm": 4.393736839294434, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8553915023803711, + "num_tokens": 96152073.0, + "step": 2517 + }, + { + "epoch": 0.32031548149090444, + "ewc_loss": 0.02953379787504673, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010979110084008425, + "grad_norm": 4.412052154541016, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8515433669090271, + "num_tokens": 96191608.0, + "step": 2518 + }, + { + "epoch": 0.32044269176949497, + "ewc_loss": 0.029509924352169037, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010955235484289005, + "grad_norm": 4.370033264160156, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8699823617935181, + "num_tokens": 96228124.0, + "step": 2519 + }, + { + "epoch": 0.3205699020480855, + "ewc_loss": 0.02950359135866165, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010948903945973143, + "grad_norm": 4.369236946105957, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.864676296710968, + "num_tokens": 96265870.0, + "step": 2520 + }, + { + "epoch": 0.32069711232667597, + "ewc_loss": 0.029515499249100685, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010960811778204516, + "grad_norm": 4.358170509338379, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8634468913078308, + "num_tokens": 96309623.0, + "step": 2521 + }, + { + "epoch": 0.3208243226052665, + "ewc_loss": 0.029488451778888702, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010933764860965312, + "grad_norm": 4.4002227783203125, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8468323945999146, + "num_tokens": 96344036.0, + "step": 2522 + }, + { + "epoch": 0.320951532883857, + "ewc_loss": 0.0294935442507267, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010938856576103717, + "grad_norm": 4.362229824066162, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8650014996528625, + "num_tokens": 96377627.0, + "step": 2523 + }, + { + "epoch": 0.3210787431624475, + "ewc_loss": 0.02947910875082016, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010924420348601416, + "grad_norm": 4.379800796508789, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8546518087387085, + "num_tokens": 96413568.0, + "step": 2524 + }, + { + "epoch": 0.32120595344103803, + "ewc_loss": 0.029514148831367493, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010959461360471323, + "grad_norm": 4.348046779632568, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8511519432067871, + "num_tokens": 96454682.0, + "step": 2525 + }, + { + "epoch": 0.32133316371962856, + "ewc_loss": 0.029523024335503578, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010968336573569104, + "grad_norm": 4.454278469085693, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8427791595458984, + "num_tokens": 96490456.0, + "step": 2526 + }, + { + "epoch": 0.32146037399821903, + "ewc_loss": 0.029553867876529694, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010999181540682912, + "grad_norm": 4.4801154136657715, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8466651439666748, + "num_tokens": 96526891.0, + "step": 2527 + }, + { + "epoch": 0.32158758427680956, + "ewc_loss": 0.029656030237674713, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00010979273065458983, + "grad_norm": 4.3206071853637695, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8482660055160522, + "num_tokens": 96575226.0, + "step": 2528 + }, + { + "epoch": 0.3217147945554001, + "ewc_loss": 0.029571054503321648, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00010894296428887174, + "grad_norm": 4.444736957550049, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8432011604309082, + "num_tokens": 96606383.0, + "step": 2529 + }, + { + "epoch": 0.3218420048339906, + "ewc_loss": 0.029559031128883362, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00011004343832610175, + "grad_norm": 4.393876075744629, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8564156889915466, + "num_tokens": 96639826.0, + "step": 2530 + }, + { + "epoch": 0.3219692151125811, + "ewc_loss": 0.029498714953660965, + "ewc_loss_diag": 1.8596649169921875e-05, + "ewc_loss_parallel": 0.00010944027599180117, + "grad_norm": 4.399867057800293, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8580065965652466, + "num_tokens": 96679151.0, + "step": 2531 + }, + { + "epoch": 0.3220964253911716, + "ewc_loss": 0.02967046946287155, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00010993712930940092, + "grad_norm": 4.3543500900268555, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8594108819961548, + "num_tokens": 96719365.0, + "step": 2532 + }, + { + "epoch": 0.32222363566976214, + "ewc_loss": 0.02962491661310196, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.0001094815888791345, + "grad_norm": 4.342652797698975, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.850557804107666, + "num_tokens": 96758318.0, + "step": 2533 + }, + { + "epoch": 0.3223508459483526, + "ewc_loss": 0.02965722046792507, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00010980462684528902, + "grad_norm": 4.498693943023682, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8484535217285156, + "num_tokens": 96791159.0, + "step": 2534 + }, + { + "epoch": 0.32247805622694314, + "ewc_loss": 0.029741402715444565, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011064645514125004, + "grad_norm": 4.369851589202881, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8577734231948853, + "num_tokens": 96825352.0, + "step": 2535 + }, + { + "epoch": 0.32260526650553367, + "ewc_loss": 0.02959161251783371, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00010914854647126049, + "grad_norm": 4.415087699890137, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8516521453857422, + "num_tokens": 96859700.0, + "step": 2536 + }, + { + "epoch": 0.32273247678412414, + "ewc_loss": 0.029732417315244675, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011055658251279965, + "grad_norm": 4.334316730499268, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8520160913467407, + "num_tokens": 96897077.0, + "step": 2537 + }, + { + "epoch": 0.3228596870627147, + "ewc_loss": 0.029650483280420303, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00010973724420182407, + "grad_norm": 4.356703758239746, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8491813540458679, + "num_tokens": 96936936.0, + "step": 2538 + }, + { + "epoch": 0.3229868973413052, + "ewc_loss": 0.029957536607980728, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011036638170480728, + "grad_norm": 4.3489580154418945, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8586357831954956, + "num_tokens": 96975702.0, + "step": 2539 + }, + { + "epoch": 0.3231141076198957, + "ewc_loss": 0.029704706743359566, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011027948494302109, + "grad_norm": 4.3598456382751465, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8526738286018372, + "num_tokens": 97010585.0, + "step": 2540 + }, + { + "epoch": 0.3232413178984862, + "ewc_loss": 0.02969975210726261, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011022994294762611, + "grad_norm": 4.314905166625977, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8770689964294434, + "num_tokens": 97047210.0, + "step": 2541 + }, + { + "epoch": 0.32336852817707673, + "ewc_loss": 0.02992800623178482, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.0001100710651371628, + "grad_norm": 4.462264537811279, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8485381603240967, + "num_tokens": 97084835.0, + "step": 2542 + }, + { + "epoch": 0.3234957384556672, + "ewc_loss": 0.030020084232091904, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011099185940111056, + "grad_norm": 4.322719573974609, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8506556749343872, + "num_tokens": 97125829.0, + "step": 2543 + }, + { + "epoch": 0.32362294873425773, + "ewc_loss": 0.02987835556268692, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00010957457561744377, + "grad_norm": 4.354477405548096, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8464657068252563, + "num_tokens": 97162874.0, + "step": 2544 + }, + { + "epoch": 0.32375015901284826, + "ewc_loss": 0.02995096519589424, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011030065797967836, + "grad_norm": 4.603555202484131, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.855904221534729, + "num_tokens": 97197559.0, + "step": 2545 + }, + { + "epoch": 0.32387736929143873, + "ewc_loss": 0.029779423028230667, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011102663847850636, + "grad_norm": 4.462989330291748, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8413950204849243, + "num_tokens": 97227028.0, + "step": 2546 + }, + { + "epoch": 0.32400457957002926, + "ewc_loss": 0.029592812061309814, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00010916054452536628, + "grad_norm": 4.3157243728637695, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8495467305183411, + "num_tokens": 97270209.0, + "step": 2547 + }, + { + "epoch": 0.3241317898486198, + "ewc_loss": 0.029851188883185387, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00010930290591204539, + "grad_norm": 4.319923400878906, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8565928936004639, + "num_tokens": 97309176.0, + "step": 2548 + }, + { + "epoch": 0.32425900012721026, + "ewc_loss": 0.029915498569607735, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00010994600597769022, + "grad_norm": 4.459106922149658, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8342452049255371, + "num_tokens": 97340464.0, + "step": 2549 + }, + { + "epoch": 0.3243862104058008, + "ewc_loss": 0.029955927282571793, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011035029456252232, + "grad_norm": 4.292118549346924, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8567045331001282, + "num_tokens": 97378435.0, + "step": 2550 + }, + { + "epoch": 0.3245134206843913, + "ewc_loss": 0.029862282797694206, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00010941384243778884, + "grad_norm": 4.331888198852539, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8482679128646851, + "num_tokens": 97417886.0, + "step": 2551 + }, + { + "epoch": 0.3246406309629818, + "ewc_loss": 0.029966071248054504, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011045172868762165, + "grad_norm": 4.396853446960449, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8568891286849976, + "num_tokens": 97450552.0, + "step": 2552 + }, + { + "epoch": 0.3247678412415723, + "ewc_loss": 0.029677212238311768, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011000453378073871, + "grad_norm": 4.324280738830566, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8596339821815491, + "num_tokens": 97488586.0, + "step": 2553 + }, + { + "epoch": 0.32489505152016285, + "ewc_loss": 0.02967953309416771, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011002774408552796, + "grad_norm": 4.342585563659668, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8485024571418762, + "num_tokens": 97528893.0, + "step": 2554 + }, + { + "epoch": 0.3250222617987533, + "ewc_loss": 0.029705369845032692, + "ewc_loss_diag": 1.8715858459472656e-05, + "ewc_loss_parallel": 0.00011028612061636522, + "grad_norm": 4.381721019744873, + "learning_rate": 1e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8298609852790833, + "num_tokens": 97568654.0, + "step": 2555 + }, + { + "epoch": 0.32514947207734385, + "ewc_loss": 0.02999163046479225, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011070731852669269, + "grad_norm": 4.417308807373047, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8579857349395752, + "num_tokens": 97607068.0, + "step": 2556 + }, + { + "epoch": 0.3252766823559344, + "ewc_loss": 0.029962021857500076, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011041123798349872, + "grad_norm": 4.4206414222717285, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8749871253967285, + "num_tokens": 97647505.0, + "step": 2557 + }, + { + "epoch": 0.32540389263452485, + "ewc_loss": 0.02997775375843048, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011056855146307498, + "grad_norm": 4.3412933349609375, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8473411202430725, + "num_tokens": 97690505.0, + "step": 2558 + }, + { + "epoch": 0.3255311029131154, + "ewc_loss": 0.029920727014541626, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00010999829828506336, + "grad_norm": 4.4875006675720215, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8407482504844666, + "num_tokens": 97723955.0, + "step": 2559 + }, + { + "epoch": 0.3256583131917059, + "ewc_loss": 0.030009863898158073, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011088965402450413, + "grad_norm": 4.392934322357178, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8473154306411743, + "num_tokens": 97763822.0, + "step": 2560 + }, + { + "epoch": 0.3257855234702964, + "ewc_loss": 0.029917526990175247, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00010996629862347618, + "grad_norm": 4.318905830383301, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8489245176315308, + "num_tokens": 97803827.0, + "step": 2561 + }, + { + "epoch": 0.3259127337488869, + "ewc_loss": 0.02994789555668831, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011026996071450412, + "grad_norm": 4.3099141120910645, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8522244095802307, + "num_tokens": 97841056.0, + "step": 2562 + }, + { + "epoch": 0.32603994402747744, + "ewc_loss": 0.02996714785695076, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011046250438084826, + "grad_norm": 4.3343095779418945, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8699067234992981, + "num_tokens": 97881714.0, + "step": 2563 + }, + { + "epoch": 0.3261671543060679, + "ewc_loss": 0.029964003711938858, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011043105041608214, + "grad_norm": 4.350871562957764, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.850430965423584, + "num_tokens": 97925797.0, + "step": 2564 + }, + { + "epoch": 0.32629436458465844, + "ewc_loss": 0.029995271936058998, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011074373469455168, + "grad_norm": 4.483851432800293, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8580900430679321, + "num_tokens": 97963638.0, + "step": 2565 + }, + { + "epoch": 0.32642157486324896, + "ewc_loss": 0.03000056743621826, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011079668183811009, + "grad_norm": 4.342362880706787, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8467824459075928, + "num_tokens": 98002043.0, + "step": 2566 + }, + { + "epoch": 0.32654878514183944, + "ewc_loss": 0.02994626760482788, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.0001102536843973212, + "grad_norm": 4.395841598510742, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8553930521011353, + "num_tokens": 98041031.0, + "step": 2567 + }, + { + "epoch": 0.32667599542042997, + "ewc_loss": 0.030002569779753685, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011081671254942194, + "grad_norm": 4.427490711212158, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8611457347869873, + "num_tokens": 98075510.0, + "step": 2568 + }, + { + "epoch": 0.3268032056990205, + "ewc_loss": 0.02996380627155304, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011042908590752631, + "grad_norm": 4.357256889343262, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8595025539398193, + "num_tokens": 98114162.0, + "step": 2569 + }, + { + "epoch": 0.32693041597761097, + "ewc_loss": 0.02994345873594284, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011022561375284567, + "grad_norm": 4.462526321411133, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8486511707305908, + "num_tokens": 98148722.0, + "step": 2570 + }, + { + "epoch": 0.3270576262562015, + "ewc_loss": 0.030031351372599602, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011110452760476619, + "grad_norm": 4.393660068511963, + "learning_rate": 1e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8329609632492065, + "num_tokens": 98188255.0, + "step": 2571 + }, + { + "epoch": 0.327184836534792, + "ewc_loss": 0.029938314110040665, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011017414362868294, + "grad_norm": 4.391425609588623, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8636721968650818, + "num_tokens": 98228597.0, + "step": 2572 + }, + { + "epoch": 0.3273120468133825, + "ewc_loss": 0.029995733872056007, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011074835492763668, + "grad_norm": 4.367584228515625, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8442325592041016, + "num_tokens": 98273405.0, + "step": 2573 + }, + { + "epoch": 0.327439257091973, + "ewc_loss": 0.02996029518544674, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011039396486012265, + "grad_norm": 4.380385875701904, + "learning_rate": 1e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8383253812789917, + "num_tokens": 98315091.0, + "step": 2574 + }, + { + "epoch": 0.32756646737056355, + "ewc_loss": 0.029956014826893806, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011035116040147841, + "grad_norm": 4.333092212677002, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8534115552902222, + "num_tokens": 98356967.0, + "step": 2575 + }, + { + "epoch": 0.327693677649154, + "ewc_loss": 0.029969412833452225, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011048513988498598, + "grad_norm": 4.488062381744385, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8437130451202393, + "num_tokens": 98394455.0, + "step": 2576 + }, + { + "epoch": 0.32782088792774455, + "ewc_loss": 0.030041608959436417, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011120711133116856, + "grad_norm": 4.374619007110596, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8620057106018066, + "num_tokens": 98430455.0, + "step": 2577 + }, + { + "epoch": 0.3279480982063351, + "ewc_loss": 0.029935099184513092, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011014199844794348, + "grad_norm": 4.355335712432861, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8386324048042297, + "num_tokens": 98472218.0, + "step": 2578 + }, + { + "epoch": 0.3280753084849256, + "ewc_loss": 0.03002336621284485, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011102467396995053, + "grad_norm": 4.396207809448242, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8606030941009521, + "num_tokens": 98509540.0, + "step": 2579 + }, + { + "epoch": 0.3282025187635161, + "ewc_loss": 0.02999209240078926, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011071193148382008, + "grad_norm": 4.381361484527588, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8524125218391418, + "num_tokens": 98544761.0, + "step": 2580 + }, + { + "epoch": 0.3283297290421066, + "ewc_loss": 0.030020974576473236, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011100076517323032, + "grad_norm": 4.416727066040039, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8657933473587036, + "num_tokens": 98580522.0, + "step": 2581 + }, + { + "epoch": 0.32845693932069714, + "ewc_loss": 0.029985930770635605, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011065032595070079, + "grad_norm": 4.413124084472656, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8612658977508545, + "num_tokens": 98612988.0, + "step": 2582 + }, + { + "epoch": 0.3285841495992876, + "ewc_loss": 0.030029363930225372, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011108464241260663, + "grad_norm": 4.394522190093994, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8416280746459961, + "num_tokens": 98654213.0, + "step": 2583 + }, + { + "epoch": 0.32871135987787814, + "ewc_loss": 0.029972471296787262, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011051573528675362, + "grad_norm": 4.34655237197876, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8561239242553711, + "num_tokens": 98697168.0, + "step": 2584 + }, + { + "epoch": 0.32883857015646867, + "ewc_loss": 0.03000825271010399, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011087353777838871, + "grad_norm": 4.415548801422119, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8585779666900635, + "num_tokens": 98735664.0, + "step": 2585 + }, + { + "epoch": 0.32896578043505914, + "ewc_loss": 0.030013540759682655, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011092642671428621, + "grad_norm": 4.35332727432251, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.848233699798584, + "num_tokens": 98777257.0, + "step": 2586 + }, + { + "epoch": 0.32909299071364967, + "ewc_loss": 0.02998160943388939, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011060712131438777, + "grad_norm": 4.411125183105469, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8579532504081726, + "num_tokens": 98812443.0, + "step": 2587 + }, + { + "epoch": 0.3292202009922402, + "ewc_loss": 0.030047815293073654, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011126916797365993, + "grad_norm": 4.526623249053955, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8610070943832397, + "num_tokens": 98848231.0, + "step": 2588 + }, + { + "epoch": 0.32934741127083067, + "ewc_loss": 0.030051954090595245, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011131056089652702, + "grad_norm": 4.3528971672058105, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8566074967384338, + "num_tokens": 98886487.0, + "step": 2589 + }, + { + "epoch": 0.3294746215494212, + "ewc_loss": 0.0299600251019001, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.0001103912727558054, + "grad_norm": 4.418515205383301, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8428305983543396, + "num_tokens": 98924750.0, + "step": 2590 + }, + { + "epoch": 0.3296018318280117, + "ewc_loss": 0.030044889077544212, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011123990407213569, + "grad_norm": 4.395595550537109, + "learning_rate": 1e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8315844535827637, + "num_tokens": 98964151.0, + "step": 2591 + }, + { + "epoch": 0.3297290421066022, + "ewc_loss": 0.029977284371852875, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011056386574637145, + "grad_norm": 4.416964054107666, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8641077280044556, + "num_tokens": 99000277.0, + "step": 2592 + }, + { + "epoch": 0.3298562523851927, + "ewc_loss": 0.030037838965654373, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011116940004285425, + "grad_norm": 4.323023319244385, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8440299034118652, + "num_tokens": 99042335.0, + "step": 2593 + }, + { + "epoch": 0.32998346266378326, + "ewc_loss": 0.029978446662425995, + "ewc_loss_diag": 1.895427703857422e-05, + "ewc_loss_parallel": 0.00011057547089876607, + "grad_norm": 4.332790374755859, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8517802357673645, + "num_tokens": 99082964.0, + "step": 2594 + }, + { + "epoch": 0.33011067294237373, + "ewc_loss": 0.0302901491522789, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011125108721898869, + "grad_norm": 4.289546966552734, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8767077922821045, + "num_tokens": 99123440.0, + "step": 2595 + }, + { + "epoch": 0.33023788322096426, + "ewc_loss": 0.03027593530714512, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011110896593891084, + "grad_norm": 4.34214973449707, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8514468669891357, + "num_tokens": 99167047.0, + "step": 2596 + }, + { + "epoch": 0.3303650934995548, + "ewc_loss": 0.03020690754055977, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011163939052494243, + "grad_norm": 4.360457897186279, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.850189208984375, + "num_tokens": 99202594.0, + "step": 2597 + }, + { + "epoch": 0.33049230377814526, + "ewc_loss": 0.03021416813135147, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011171199003001675, + "grad_norm": 4.3417229652404785, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8602507710456848, + "num_tokens": 99244732.0, + "step": 2598 + }, + { + "epoch": 0.3306195140567358, + "ewc_loss": 0.030185408890247345, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011142440052935854, + "grad_norm": 4.3277082443237305, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8635897636413574, + "num_tokens": 99287161.0, + "step": 2599 + }, + { + "epoch": 0.3307467243353263, + "ewc_loss": 0.030221644788980484, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.0001117867577704601, + "grad_norm": 4.481189250946045, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8455135822296143, + "num_tokens": 99322650.0, + "step": 2600 + }, + { + "epoch": 0.3308739346139168, + "ewc_loss": 0.030277956277132034, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011234988051000983, + "grad_norm": 4.451402187347412, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8539792895317078, + "num_tokens": 99358529.0, + "step": 2601 + }, + { + "epoch": 0.3310011448925073, + "ewc_loss": 0.030192697420716286, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011149728379677981, + "grad_norm": 4.378045558929443, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8620873093605042, + "num_tokens": 99394010.0, + "step": 2602 + }, + { + "epoch": 0.33112835517109784, + "ewc_loss": 0.03021962195634842, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.0001117665451602079, + "grad_norm": 4.3831329345703125, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8567080497741699, + "num_tokens": 99434764.0, + "step": 2603 + }, + { + "epoch": 0.3312555654496883, + "ewc_loss": 0.030230745673179626, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.0001118777654482983, + "grad_norm": 4.327850341796875, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8555785417556763, + "num_tokens": 99481322.0, + "step": 2604 + }, + { + "epoch": 0.33138277572827884, + "ewc_loss": 0.030210264027118683, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011167294724145904, + "grad_norm": 4.420301914215088, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8440394401550293, + "num_tokens": 99517719.0, + "step": 2605 + }, + { + "epoch": 0.3315099860068694, + "ewc_loss": 0.03026355430483818, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011220584565307945, + "grad_norm": 4.322159767150879, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8560130596160889, + "num_tokens": 99553662.0, + "step": 2606 + }, + { + "epoch": 0.33163719628545985, + "ewc_loss": 0.03022850677371025, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011185537005076185, + "grad_norm": 4.41223669052124, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8532184958457947, + "num_tokens": 99589195.0, + "step": 2607 + }, + { + "epoch": 0.3317644065640504, + "ewc_loss": 0.030429702252149582, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.0001126466304413043, + "grad_norm": 4.4106340408325195, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8560683727264404, + "num_tokens": 99626092.0, + "step": 2608 + }, + { + "epoch": 0.3318916168426409, + "ewc_loss": 0.030386444181203842, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011221403838135302, + "grad_norm": 4.388116836547852, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.828947901725769, + "num_tokens": 99667087.0, + "step": 2609 + }, + { + "epoch": 0.3320188271212314, + "ewc_loss": 0.030408531427383423, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011243493645451963, + "grad_norm": 4.391819477081299, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.849035382270813, + "num_tokens": 99705929.0, + "step": 2610 + }, + { + "epoch": 0.3321460373998219, + "ewc_loss": 0.030409175902605057, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011244136840105057, + "grad_norm": 4.429394721984863, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8506415486335754, + "num_tokens": 99742044.0, + "step": 2611 + }, + { + "epoch": 0.33227324767841243, + "ewc_loss": 0.0304427407681942, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011277703015366569, + "grad_norm": 4.505953788757324, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8561239242553711, + "num_tokens": 99772621.0, + "step": 2612 + }, + { + "epoch": 0.3324004579570029, + "ewc_loss": 0.030468635261058807, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011303597420919687, + "grad_norm": 4.311861515045166, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8594492673873901, + "num_tokens": 99814405.0, + "step": 2613 + }, + { + "epoch": 0.33252766823559343, + "ewc_loss": 0.03036683239042759, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011201793677173555, + "grad_norm": 4.342399597167969, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.864124059677124, + "num_tokens": 99852197.0, + "step": 2614 + }, + { + "epoch": 0.33265487851418396, + "ewc_loss": 0.030443314462900162, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011278274905635044, + "grad_norm": 4.379263877868652, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8482566475868225, + "num_tokens": 99894254.0, + "step": 2615 + }, + { + "epoch": 0.33278208879277443, + "ewc_loss": 0.03045988455414772, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011294845899101347, + "grad_norm": 4.386999130249023, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8483776450157166, + "num_tokens": 99936230.0, + "step": 2616 + }, + { + "epoch": 0.33290929907136496, + "ewc_loss": 0.030430328100919724, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011265289504081011, + "grad_norm": 4.368807315826416, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8552425503730774, + "num_tokens": 99974964.0, + "step": 2617 + }, + { + "epoch": 0.3330365093499555, + "ewc_loss": 0.03045688197016716, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011291842383798212, + "grad_norm": 4.373862266540527, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8447012901306152, + "num_tokens": 100018750.0, + "step": 2618 + }, + { + "epoch": 0.33316371962854596, + "ewc_loss": 0.030465304851531982, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011300266487523913, + "grad_norm": 4.485569953918457, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8479276299476624, + "num_tokens": 100054064.0, + "step": 2619 + }, + { + "epoch": 0.3332909299071365, + "ewc_loss": 0.0305331964045763, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011368156992830336, + "grad_norm": 4.370537281036377, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8588834404945374, + "num_tokens": 100092549.0, + "step": 2620 + }, + { + "epoch": 0.333418140185727, + "ewc_loss": 0.03031151369214058, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011268544767517596, + "grad_norm": 4.518089294433594, + "learning_rate": 1e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.827737033367157, + "num_tokens": 100123973.0, + "step": 2621 + }, + { + "epoch": 0.3335453504643175, + "ewc_loss": 0.030450664460659027, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011407696729293093, + "grad_norm": 4.394186019897461, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.852024495601654, + "num_tokens": 100162142.0, + "step": 2622 + }, + { + "epoch": 0.333672560742908, + "ewc_loss": 0.030315443873405457, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011272473784629256, + "grad_norm": 4.392242908477783, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8491801023483276, + "num_tokens": 100199352.0, + "step": 2623 + }, + { + "epoch": 0.33379977102149855, + "ewc_loss": 0.03052784875035286, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011362809163983911, + "grad_norm": 4.44208288192749, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8383982181549072, + "num_tokens": 100237446.0, + "step": 2624 + }, + { + "epoch": 0.333926981300089, + "ewc_loss": 0.03053443692624569, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011369397543603554, + "grad_norm": 12.194390296936035, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8583008050918579, + "num_tokens": 100272986.0, + "step": 2625 + }, + { + "epoch": 0.33405419157867955, + "ewc_loss": 0.03650598227977753, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.0001734094403218478, + "grad_norm": 5.7253642082214355, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8444474935531616, + "num_tokens": 100304998.0, + "step": 2626 + }, + { + "epoch": 0.3341814018572701, + "ewc_loss": 0.0302729532122612, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011107913451269269, + "grad_norm": 4.025364875793457, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8497061729431152, + "num_tokens": 100338473.0, + "step": 2627 + }, + { + "epoch": 0.33430861213586055, + "ewc_loss": 0.03162615746259689, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00012461119331419468, + "grad_norm": 5.061797618865967, + "learning_rate": 1e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.8372501134872437, + "num_tokens": 100372644.0, + "step": 2628 + }, + { + "epoch": 0.3344358224144511, + "ewc_loss": 0.032463617622852325, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00013298579142428935, + "grad_norm": 4.597299098968506, + "learning_rate": 1e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.832791805267334, + "num_tokens": 100413248.0, + "step": 2629 + }, + { + "epoch": 0.3345630326930416, + "ewc_loss": 0.030732885003089905, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011689917300827801, + "grad_norm": 4.684203624725342, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8474185466766357, + "num_tokens": 100448022.0, + "step": 2630 + }, + { + "epoch": 0.33469024297163213, + "ewc_loss": 0.03138595074415207, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00012342982518021017, + "grad_norm": 4.543872356414795, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8412366509437561, + "num_tokens": 100493222.0, + "step": 2631 + }, + { + "epoch": 0.3348174532502226, + "ewc_loss": 0.030870337039232254, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011827369598904625, + "grad_norm": 4.5333733558654785, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8591914176940918, + "num_tokens": 100534736.0, + "step": 2632 + }, + { + "epoch": 0.33494466352881314, + "ewc_loss": 0.0309058278799057, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011862858809763566, + "grad_norm": 4.541201591491699, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8604633808135986, + "num_tokens": 100576754.0, + "step": 2633 + }, + { + "epoch": 0.33507187380740366, + "ewc_loss": 0.030770786106586456, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.0001172781630884856, + "grad_norm": 4.549149513244629, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8456476330757141, + "num_tokens": 100611407.0, + "step": 2634 + }, + { + "epoch": 0.33519908408599414, + "ewc_loss": 0.030700290575623512, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011657321738312021, + "grad_norm": 4.610612869262695, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8598765134811401, + "num_tokens": 100643531.0, + "step": 2635 + }, + { + "epoch": 0.33532629436458466, + "ewc_loss": 0.030645448714494705, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011602479935390875, + "grad_norm": 4.452883720397949, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8549214601516724, + "num_tokens": 100678612.0, + "step": 2636 + }, + { + "epoch": 0.3354535046431752, + "ewc_loss": 0.030507909134030342, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011464940325822681, + "grad_norm": 4.503091812133789, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8577021360397339, + "num_tokens": 100718873.0, + "step": 2637 + }, + { + "epoch": 0.33558071492176567, + "ewc_loss": 0.030566291883587837, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.0001152332333731465, + "grad_norm": 4.453181743621826, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8521817326545715, + "num_tokens": 100759883.0, + "step": 2638 + }, + { + "epoch": 0.3357079252003562, + "ewc_loss": 0.030429519712924957, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011386550613678992, + "grad_norm": 4.471756458282471, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8326952457427979, + "num_tokens": 100800022.0, + "step": 2639 + }, + { + "epoch": 0.3358351354789467, + "ewc_loss": 0.030460696667432785, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011417727364460006, + "grad_norm": 4.4924139976501465, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8368548154830933, + "num_tokens": 100837299.0, + "step": 2640 + }, + { + "epoch": 0.3359623457575372, + "ewc_loss": 0.03043506294488907, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011392094165785238, + "grad_norm": 4.478339672088623, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8494592905044556, + "num_tokens": 100868433.0, + "step": 2641 + }, + { + "epoch": 0.3360895560361277, + "ewc_loss": 0.030410967767238617, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011367998376954347, + "grad_norm": 4.447854518890381, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8378588557243347, + "num_tokens": 100907768.0, + "step": 2642 + }, + { + "epoch": 0.33621676631471825, + "ewc_loss": 0.030393753200769424, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011350784916430712, + "grad_norm": 4.48680305480957, + "learning_rate": 1e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8298025727272034, + "num_tokens": 100942723.0, + "step": 2643 + }, + { + "epoch": 0.3363439765933087, + "ewc_loss": 0.03039098158478737, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011348013504175469, + "grad_norm": 4.385695457458496, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8582314252853394, + "num_tokens": 100984997.0, + "step": 2644 + }, + { + "epoch": 0.33647118687189925, + "ewc_loss": 0.030331309884786606, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011288341920590028, + "grad_norm": 4.373212814331055, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8673751354217529, + "num_tokens": 101026290.0, + "step": 2645 + }, + { + "epoch": 0.3365983971504898, + "ewc_loss": 0.03061118721961975, + "ewc_loss_diag": 1.9311904907226562e-05, + "ewc_loss_parallel": 0.00011324077058816329, + "grad_norm": 4.437303066253662, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8459032773971558, + "num_tokens": 101066766.0, + "step": 2646 + }, + { + "epoch": 0.33672560742908025, + "ewc_loss": 0.030633199959993362, + "ewc_loss_diag": 1.9311904907226562e-05, + "ewc_loss_parallel": 0.00011346091923769563, + "grad_norm": 4.565190315246582, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8571770191192627, + "num_tokens": 101100833.0, + "step": 2647 + }, + { + "epoch": 0.3368528177076708, + "ewc_loss": 0.03040344826877117, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.0001136047940235585, + "grad_norm": 4.378306865692139, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8632702827453613, + "num_tokens": 101145552.0, + "step": 2648 + }, + { + "epoch": 0.3369800279862613, + "ewc_loss": 0.030252795666456223, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011209825606783852, + "grad_norm": 4.440090656280518, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8542460203170776, + "num_tokens": 101186644.0, + "step": 2649 + }, + { + "epoch": 0.3371072382648518, + "ewc_loss": 0.030364401638507843, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011321432975819334, + "grad_norm": 4.423990249633789, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8508145809173584, + "num_tokens": 101223356.0, + "step": 2650 + }, + { + "epoch": 0.3372344485434423, + "ewc_loss": 0.030309518799185753, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011266550427535549, + "grad_norm": 4.422088623046875, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8649588227272034, + "num_tokens": 101260527.0, + "step": 2651 + }, + { + "epoch": 0.33736165882203284, + "ewc_loss": 0.030350781977176666, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011307812383165583, + "grad_norm": 4.356368064880371, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8516535758972168, + "num_tokens": 101305403.0, + "step": 2652 + }, + { + "epoch": 0.3374888691006233, + "ewc_loss": 0.030545122921466827, + "ewc_loss_diag": 1.9311904907226562e-05, + "ewc_loss_parallel": 0.00011258013546466827, + "grad_norm": 4.426222324371338, + "learning_rate": 1e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.838908314704895, + "num_tokens": 101345858.0, + "step": 2653 + }, + { + "epoch": 0.33761607937921384, + "ewc_loss": 0.030632026493549347, + "ewc_loss_diag": 1.9311904907226562e-05, + "ewc_loss_parallel": 0.00011344916129019111, + "grad_norm": 4.488892078399658, + "learning_rate": 1e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.830108106136322, + "num_tokens": 101380321.0, + "step": 2654 + }, + { + "epoch": 0.33774328965780437, + "ewc_loss": 0.03037247434258461, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011329506378388032, + "grad_norm": 4.373327255249023, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8612631559371948, + "num_tokens": 101419120.0, + "step": 2655 + }, + { + "epoch": 0.33787049993639484, + "ewc_loss": 0.030339209362864494, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011296240700175986, + "grad_norm": 4.468209266662598, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8539491295814514, + "num_tokens": 101453272.0, + "step": 2656 + }, + { + "epoch": 0.33799771021498537, + "ewc_loss": 0.030407458543777466, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011364490637788549, + "grad_norm": 4.478127479553223, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8410704135894775, + "num_tokens": 101486879.0, + "step": 2657 + }, + { + "epoch": 0.3381249204935759, + "ewc_loss": 0.030373558402061462, + "ewc_loss_diag": 1.9073486328125e-05, + "ewc_loss_parallel": 0.00011330588313285261, + "grad_norm": 4.454045295715332, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8460918664932251, + "num_tokens": 101523691.0, + "step": 2658 + }, + { + "epoch": 0.33825213077216637, + "ewc_loss": 0.030500106513500214, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011335068120388314, + "grad_norm": 4.459022521972656, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8532603979110718, + "num_tokens": 101555279.0, + "step": 2659 + }, + { + "epoch": 0.3383793410507569, + "ewc_loss": 0.0304938443005085, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011328804976074025, + "grad_norm": 4.405228137969971, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8694103956222534, + "num_tokens": 101591780.0, + "step": 2660 + }, + { + "epoch": 0.3385065513293474, + "ewc_loss": 0.030607067048549652, + "ewc_loss_diag": 1.9311904907226562e-05, + "ewc_loss_parallel": 0.0001131995813921094, + "grad_norm": 4.349667549133301, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8691514730453491, + "num_tokens": 101631041.0, + "step": 2661 + }, + { + "epoch": 0.3386337616079379, + "ewc_loss": 0.03049059584736824, + "ewc_loss_diag": 1.919269561767578e-05, + "ewc_loss_parallel": 0.00011325557716190815, + "grad_norm": 4.435238838195801, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8427561521530151, + "num_tokens": 101666315.0, + "step": 2662 + }, + { + "epoch": 0.33876097188652843, + "ewc_loss": 0.030795034021139145, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011385853576939553, + "grad_norm": 4.412296772003174, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8657567501068115, + "num_tokens": 101704046.0, + "step": 2663 + }, + { + "epoch": 0.33888818216511896, + "ewc_loss": 0.030772045254707336, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011362865916453302, + "grad_norm": 4.371406078338623, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8641887903213501, + "num_tokens": 101743076.0, + "step": 2664 + }, + { + "epoch": 0.33901539244370943, + "ewc_loss": 0.03069009818136692, + "ewc_loss_diag": 1.9311904907226562e-05, + "ewc_loss_parallel": 0.00011402988457120955, + "grad_norm": 4.42176628112793, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8654011487960815, + "num_tokens": 101784006.0, + "step": 2665 + }, + { + "epoch": 0.33914260272229996, + "ewc_loss": 0.0307894516736269, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011380272189853713, + "grad_norm": 4.38144588470459, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8599148988723755, + "num_tokens": 101821845.0, + "step": 2666 + }, + { + "epoch": 0.3392698130008905, + "ewc_loss": 0.030773062258958817, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011363883822923526, + "grad_norm": 4.4147257804870605, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.858741283416748, + "num_tokens": 101861121.0, + "step": 2667 + }, + { + "epoch": 0.33939702327948096, + "ewc_loss": 0.03079685941338539, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011387679114704952, + "grad_norm": 4.498487949371338, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8495153188705444, + "num_tokens": 101894330.0, + "step": 2668 + }, + { + "epoch": 0.3395242335580715, + "ewc_loss": 0.03084796480834484, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011438784713391215, + "grad_norm": 4.420866012573242, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8392623662948608, + "num_tokens": 101934835.0, + "step": 2669 + }, + { + "epoch": 0.339651443836662, + "ewc_loss": 0.030783027410507202, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011373848974471912, + "grad_norm": 4.485669136047363, + "learning_rate": 1e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8368269801139832, + "num_tokens": 101970943.0, + "step": 2670 + }, + { + "epoch": 0.3397786541152525, + "ewc_loss": 0.030846087262034416, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011436907516326755, + "grad_norm": 4.491783618927002, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8413679599761963, + "num_tokens": 102001211.0, + "step": 2671 + }, + { + "epoch": 0.339905864393843, + "ewc_loss": 0.03079790249466896, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011388722487026826, + "grad_norm": 4.402022361755371, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8601667881011963, + "num_tokens": 102039399.0, + "step": 2672 + }, + { + "epoch": 0.34003307467243354, + "ewc_loss": 0.030791817232966423, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011382637603674084, + "grad_norm": 4.522730827331543, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8422441482543945, + "num_tokens": 102070635.0, + "step": 2673 + }, + { + "epoch": 0.340160284951024, + "ewc_loss": 0.030855488032102585, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011446308781160042, + "grad_norm": 4.415921211242676, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8574211597442627, + "num_tokens": 102104590.0, + "step": 2674 + }, + { + "epoch": 0.34028749522961454, + "ewc_loss": 0.030774056911468506, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011364878446329385, + "grad_norm": 4.468536376953125, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8585426211357117, + "num_tokens": 102149267.0, + "step": 2675 + }, + { + "epoch": 0.3404147055082051, + "ewc_loss": 0.030862540006637573, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011453360639279708, + "grad_norm": 4.404482364654541, + "learning_rate": 1e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.8365675806999207, + "num_tokens": 102188089.0, + "step": 2676 + }, + { + "epoch": 0.34054191578679555, + "ewc_loss": 0.030918501317501068, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011387251288397238, + "grad_norm": 4.427613735198975, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.85999596118927, + "num_tokens": 102224335.0, + "step": 2677 + }, + { + "epoch": 0.3406691260653861, + "ewc_loss": 0.030945565551519394, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011414315667934716, + "grad_norm": 4.3755011558532715, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8569879531860352, + "num_tokens": 102263421.0, + "step": 2678 + }, + { + "epoch": 0.3407963363439766, + "ewc_loss": 0.030787404626607895, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011378226190572605, + "grad_norm": 4.476369380950928, + "learning_rate": 1e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8355535864830017, + "num_tokens": 102301133.0, + "step": 2679 + }, + { + "epoch": 0.34092354662256713, + "ewc_loss": 0.031006546691060066, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.0001147529692389071, + "grad_norm": 4.409771919250488, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8573065996170044, + "num_tokens": 102338516.0, + "step": 2680 + }, + { + "epoch": 0.3410507569011576, + "ewc_loss": 0.03079361468553543, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011384434765204787, + "grad_norm": 4.437061786651611, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8423334956169128, + "num_tokens": 102376010.0, + "step": 2681 + }, + { + "epoch": 0.34117796717974813, + "ewc_loss": 0.030975760892033577, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011444510892033577, + "grad_norm": 4.386760711669922, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8745564222335815, + "num_tokens": 102417080.0, + "step": 2682 + }, + { + "epoch": 0.34130517745833866, + "ewc_loss": 0.030832912772893906, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011423733667470515, + "grad_norm": 4.43009614944458, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8552106618881226, + "num_tokens": 102458042.0, + "step": 2683 + }, + { + "epoch": 0.34143238773692913, + "ewc_loss": 0.030986890196800232, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011455640924395993, + "grad_norm": 4.432628154754639, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8613999485969543, + "num_tokens": 102497152.0, + "step": 2684 + }, + { + "epoch": 0.34155959801551966, + "ewc_loss": 0.030967887490987778, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011436637578299269, + "grad_norm": 4.439272880554199, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8513767719268799, + "num_tokens": 102531486.0, + "step": 2685 + }, + { + "epoch": 0.3416868082941102, + "ewc_loss": 0.031001053750514984, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011469804303487763, + "grad_norm": 4.447269439697266, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8524597883224487, + "num_tokens": 102571552.0, + "step": 2686 + }, + { + "epoch": 0.34181401857270066, + "ewc_loss": 0.03097512386739254, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011443873518146574, + "grad_norm": 4.430622577667236, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8588463068008423, + "num_tokens": 102607338.0, + "step": 2687 + }, + { + "epoch": 0.3419412288512912, + "ewc_loss": 0.030966296792030334, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011435047053964809, + "grad_norm": 4.351742744445801, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8532887697219849, + "num_tokens": 102654445.0, + "step": 2688 + }, + { + "epoch": 0.3420684391298817, + "ewc_loss": 0.030958034098148346, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011426783021306619, + "grad_norm": 4.436974048614502, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8619956970214844, + "num_tokens": 102695265.0, + "step": 2689 + }, + { + "epoch": 0.3421956494084722, + "ewc_loss": 0.03101659193634987, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011485340655781329, + "grad_norm": 4.4295525550842285, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8585678339004517, + "num_tokens": 102728621.0, + "step": 2690 + }, + { + "epoch": 0.3423228596870627, + "ewc_loss": 0.030998632311820984, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011467382137198001, + "grad_norm": 4.508866310119629, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8476724624633789, + "num_tokens": 102766378.0, + "step": 2691 + }, + { + "epoch": 0.34245006996565325, + "ewc_loss": 0.031136123463511467, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011482802801765501, + "grad_norm": 4.39350700378418, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8525924682617188, + "num_tokens": 102806331.0, + "step": 2692 + }, + { + "epoch": 0.3425772802442437, + "ewc_loss": 0.03109303116798401, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.000114397109427955, + "grad_norm": 4.451059341430664, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8566620945930481, + "num_tokens": 102845314.0, + "step": 2693 + }, + { + "epoch": 0.34270449052283425, + "ewc_loss": 0.03127129375934601, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011495905346237123, + "grad_norm": 4.400824546813965, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8525308966636658, + "num_tokens": 102888614.0, + "step": 2694 + }, + { + "epoch": 0.3428317008014248, + "ewc_loss": 0.03108522668480873, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011431907478254288, + "grad_norm": 4.479286193847656, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8484225869178772, + "num_tokens": 102925988.0, + "step": 2695 + }, + { + "epoch": 0.34295891108001525, + "ewc_loss": 0.031147468835115433, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011494148202473298, + "grad_norm": 4.452973365783691, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8615838289260864, + "num_tokens": 102963150.0, + "step": 2696 + }, + { + "epoch": 0.3430861213586058, + "ewc_loss": 0.03109624795615673, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.0001144292764365673, + "grad_norm": 4.533799648284912, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8623030781745911, + "num_tokens": 102999164.0, + "step": 2697 + }, + { + "epoch": 0.3432133316371963, + "ewc_loss": 0.031118184328079224, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011464864655863494, + "grad_norm": 4.451986312866211, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8618600964546204, + "num_tokens": 103034337.0, + "step": 2698 + }, + { + "epoch": 0.3433405419157868, + "ewc_loss": 0.03107185661792755, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011418536450946704, + "grad_norm": 4.42988920211792, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8603237271308899, + "num_tokens": 103075062.0, + "step": 2699 + }, + { + "epoch": 0.3434677521943773, + "ewc_loss": 0.03110085055232048, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011447530414443463, + "grad_norm": 4.48688268661499, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.850037157535553, + "num_tokens": 103113209.0, + "step": 2700 + }, + { + "epoch": 0.34359496247296784, + "ewc_loss": 0.031088830903172493, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011435510532464832, + "grad_norm": 4.409646511077881, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8579229712486267, + "num_tokens": 103153117.0, + "step": 2701 + }, + { + "epoch": 0.3437221727515583, + "ewc_loss": 0.031077397987246513, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011424077820265666, + "grad_norm": 4.449710845947266, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.852826714515686, + "num_tokens": 103192234.0, + "step": 2702 + }, + { + "epoch": 0.34384938303014884, + "ewc_loss": 0.030895834788680077, + "ewc_loss_diag": 1.9431114196777344e-05, + "ewc_loss_parallel": 0.00011486655421322212, + "grad_norm": 4.4768900871276855, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8507353067398071, + "num_tokens": 103224958.0, + "step": 2703 + }, + { + "epoch": 0.34397659330873936, + "ewc_loss": 0.03098250925540924, + "ewc_loss_diag": 1.9550323486328125e-05, + "ewc_loss_parallel": 0.00011451257887529209, + "grad_norm": 4.438849449157715, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8561487793922424, + "num_tokens": 103259679.0, + "step": 2704 + }, + { + "epoch": 0.34410380358732984, + "ewc_loss": 0.031241945922374725, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.0001146655558841303, + "grad_norm": 4.429365158081055, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8461216688156128, + "num_tokens": 103299987.0, + "step": 2705 + }, + { + "epoch": 0.34423101386592037, + "ewc_loss": 0.031117377802729607, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011464057024568319, + "grad_norm": 4.411474227905273, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8404675722122192, + "num_tokens": 103344457.0, + "step": 2706 + }, + { + "epoch": 0.3443582241445109, + "ewc_loss": 0.031122535467147827, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011469215678516775, + "grad_norm": 4.395781517028809, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8591318130493164, + "num_tokens": 103385173.0, + "step": 2707 + }, + { + "epoch": 0.34448543442310137, + "ewc_loss": 0.031155822798609734, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011502502457005903, + "grad_norm": 4.5077972412109375, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8392153382301331, + "num_tokens": 103425972.0, + "step": 2708 + }, + { + "epoch": 0.3446126447016919, + "ewc_loss": 0.031211256980895996, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011557935795281082, + "grad_norm": 4.448210716247559, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8492216467857361, + "num_tokens": 103466819.0, + "step": 2709 + }, + { + "epoch": 0.3447398549802824, + "ewc_loss": 0.03111167624592781, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011458354856586084, + "grad_norm": 4.4012041091918945, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8417317271232605, + "num_tokens": 103509660.0, + "step": 2710 + }, + { + "epoch": 0.3448670652588729, + "ewc_loss": 0.031152263283729553, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011498942330945283, + "grad_norm": 4.504033088684082, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8473589420318604, + "num_tokens": 103544557.0, + "step": 2711 + }, + { + "epoch": 0.3449942755374634, + "ewc_loss": 0.03135937079787254, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011583980085561052, + "grad_norm": 4.423971652984619, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8636329174041748, + "num_tokens": 103585132.0, + "step": 2712 + }, + { + "epoch": 0.34512148581605395, + "ewc_loss": 0.03125566244125366, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011480271496111527, + "grad_norm": 4.420146465301514, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8615039587020874, + "num_tokens": 103626353.0, + "step": 2713 + }, + { + "epoch": 0.3452486960946444, + "ewc_loss": 0.03129767253994942, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.000115222814201843, + "grad_norm": 4.48714017868042, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.855683445930481, + "num_tokens": 103659035.0, + "step": 2714 + }, + { + "epoch": 0.34537590637323495, + "ewc_loss": 0.031297191977500916, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011521801206981763, + "grad_norm": 4.416872501373291, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8622435331344604, + "num_tokens": 103699497.0, + "step": 2715 + }, + { + "epoch": 0.3455031166518255, + "ewc_loss": 0.03125713765621185, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011481747787911445, + "grad_norm": 4.484987735748291, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8426947593688965, + "num_tokens": 103733685.0, + "step": 2716 + }, + { + "epoch": 0.34563032693041595, + "ewc_loss": 0.03132035583257675, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011544965673238039, + "grad_norm": 4.446032524108887, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8371942043304443, + "num_tokens": 103769399.0, + "step": 2717 + }, + { + "epoch": 0.3457575372090065, + "ewc_loss": 0.031386394053697586, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011488934251246974, + "grad_norm": 4.566901206970215, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.861132800579071, + "num_tokens": 103809078.0, + "step": 2718 + }, + { + "epoch": 0.345884747487597, + "ewc_loss": 0.03133856877684593, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011563178850337863, + "grad_norm": 4.406370162963867, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8710036277770996, + "num_tokens": 103846764.0, + "step": 2719 + }, + { + "epoch": 0.3460119577661875, + "ewc_loss": 0.031197015196084976, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011421624367358163, + "grad_norm": 4.462436676025391, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8628672361373901, + "num_tokens": 103883089.0, + "step": 2720 + }, + { + "epoch": 0.346139168044778, + "ewc_loss": 0.03133275359869003, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011557364632608369, + "grad_norm": 4.449230194091797, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8724058866500854, + "num_tokens": 103915775.0, + "step": 2721 + }, + { + "epoch": 0.34626637832336854, + "ewc_loss": 0.031242797151207924, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.0001146740687545389, + "grad_norm": 4.46389102935791, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8493461012840271, + "num_tokens": 103953703.0, + "step": 2722 + }, + { + "epoch": 0.346393588601959, + "ewc_loss": 0.03128751367330551, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011512122728163376, + "grad_norm": 4.486659526824951, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8440026044845581, + "num_tokens": 103990775.0, + "step": 2723 + }, + { + "epoch": 0.34652079888054954, + "ewc_loss": 0.03129500895738602, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011519618419697508, + "grad_norm": 4.419012546539307, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8573432564735413, + "num_tokens": 104025927.0, + "step": 2724 + }, + { + "epoch": 0.34664800915914007, + "ewc_loss": 0.03126337379217148, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.000114879832835868, + "grad_norm": 4.420363903045654, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8451459407806396, + "num_tokens": 104065860.0, + "step": 2725 + }, + { + "epoch": 0.34677521943773054, + "ewc_loss": 0.03141314908862114, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011515687219798565, + "grad_norm": 4.6335272789001465, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8558984994888306, + "num_tokens": 104102117.0, + "step": 2726 + }, + { + "epoch": 0.34690242971632107, + "ewc_loss": 0.03136250376701355, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011587110930122435, + "grad_norm": 4.427245616912842, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8509584069252014, + "num_tokens": 104140896.0, + "step": 2727 + }, + { + "epoch": 0.3470296399949116, + "ewc_loss": 0.031236305832862854, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011460915266070515, + "grad_norm": 4.501377582550049, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8527545928955078, + "num_tokens": 104181355.0, + "step": 2728 + }, + { + "epoch": 0.3471568502735021, + "ewc_loss": 0.03131600841879845, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011540616833372042, + "grad_norm": 4.436334133148193, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.855099081993103, + "num_tokens": 104218564.0, + "step": 2729 + }, + { + "epoch": 0.3472840605520926, + "ewc_loss": 0.03144177794456482, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011544316657818854, + "grad_norm": 4.526885986328125, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8596177101135254, + "num_tokens": 104256553.0, + "step": 2730 + }, + { + "epoch": 0.3474112708306831, + "ewc_loss": 0.03135443478822708, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011579043348319829, + "grad_norm": 4.480513572692871, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8682334423065186, + "num_tokens": 104291150.0, + "step": 2731 + }, + { + "epoch": 0.34753848110927366, + "ewc_loss": 0.03139397129416466, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011496510705910623, + "grad_norm": 4.480138301849365, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8467086553573608, + "num_tokens": 104327977.0, + "step": 2732 + }, + { + "epoch": 0.34766569138786413, + "ewc_loss": 0.03119213879108429, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011538818216649815, + "grad_norm": 4.481433868408203, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8511723875999451, + "num_tokens": 104365167.0, + "step": 2733 + }, + { + "epoch": 0.34779290166645466, + "ewc_loss": 0.03133312612771988, + "ewc_loss_diag": 1.9788742065429688e-05, + "ewc_loss_parallel": 0.00011557737161638215, + "grad_norm": 4.615164756774902, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8596124053001404, + "num_tokens": 104400304.0, + "step": 2734 + }, + { + "epoch": 0.3479201119450452, + "ewc_loss": 0.03125131502747536, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011597995035117492, + "grad_norm": 4.451963424682617, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8397154211997986, + "num_tokens": 104437974.0, + "step": 2735 + }, + { + "epoch": 0.34804732222363566, + "ewc_loss": 0.03139832243323326, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011500861000968143, + "grad_norm": 4.507138252258301, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8579747676849365, + "num_tokens": 104474474.0, + "step": 2736 + }, + { + "epoch": 0.3481745325022262, + "ewc_loss": 0.03149332106113434, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.0001159586026915349, + "grad_norm": 4.515653133392334, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8399391770362854, + "num_tokens": 104511004.0, + "step": 2737 + }, + { + "epoch": 0.3483017427808167, + "ewc_loss": 0.03147566691040993, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011578205158002675, + "grad_norm": 4.56588077545166, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8486183881759644, + "num_tokens": 104551889.0, + "step": 2738 + }, + { + "epoch": 0.3484289530594072, + "ewc_loss": 0.03147625923156738, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011578797420952469, + "grad_norm": 4.439661502838135, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8386832475662231, + "num_tokens": 104591962.0, + "step": 2739 + }, + { + "epoch": 0.3485561633379977, + "ewc_loss": 0.03143151104450226, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011534051736816764, + "grad_norm": 4.489711284637451, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8518836498260498, + "num_tokens": 104636201.0, + "step": 2740 + }, + { + "epoch": 0.34868337361658824, + "ewc_loss": 0.031504642218351364, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011607180204009637, + "grad_norm": 4.456352233886719, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8538159728050232, + "num_tokens": 104673448.0, + "step": 2741 + }, + { + "epoch": 0.3488105838951787, + "ewc_loss": 0.03143855184316635, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011541090498212725, + "grad_norm": 4.5397844314575195, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8620409965515137, + "num_tokens": 104708879.0, + "step": 2742 + }, + { + "epoch": 0.34893779417376924, + "ewc_loss": 0.03166965767741203, + "ewc_loss_diag": 2.002716064453125e-05, + "ewc_loss_parallel": 0.00011650125816231593, + "grad_norm": 4.474061012268066, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8644031286239624, + "num_tokens": 104747073.0, + "step": 2743 + }, + { + "epoch": 0.3490650044523598, + "ewc_loss": 0.03155780956149101, + "ewc_loss_diag": 2.002716064453125e-05, + "ewc_loss_parallel": 0.00011538279068190604, + "grad_norm": 12.138662338256836, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8537268042564392, + "num_tokens": 104786950.0, + "step": 2744 + }, + { + "epoch": 0.34919221473095025, + "ewc_loss": 0.037798527628183365, + "ewc_loss_diag": 2.002716064453125e-05, + "ewc_loss_parallel": 0.00017778995970729738, + "grad_norm": 5.758662700653076, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8589901924133301, + "num_tokens": 104828306.0, + "step": 2745 + }, + { + "epoch": 0.3493194250095408, + "ewc_loss": 0.03126191347837448, + "ewc_loss_diag": 2.002716064453125e-05, + "ewc_loss_parallel": 0.00011242381879128516, + "grad_norm": 3.966295003890991, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8464877009391785, + "num_tokens": 104867030.0, + "step": 2746 + }, + { + "epoch": 0.3494466352881313, + "ewc_loss": 0.032940782606601715, + "ewc_loss_diag": 2.002716064453125e-05, + "ewc_loss_parallel": 0.00012921250890940428, + "grad_norm": 5.206602573394775, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8591828346252441, + "num_tokens": 104909182.0, + "step": 2747 + }, + { + "epoch": 0.3495738455667218, + "ewc_loss": 0.03393755853176117, + "ewc_loss_diag": 2.002716064453125e-05, + "ewc_loss_parallel": 0.00013918026525061578, + "grad_norm": 4.739304065704346, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8504804372787476, + "num_tokens": 104950648.0, + "step": 2748 + }, + { + "epoch": 0.3497010558453123, + "ewc_loss": 0.0320308618247509, + "ewc_loss_diag": 2.002716064453125e-05, + "ewc_loss_parallel": 0.00012011329818051308, + "grad_norm": 4.611895561218262, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8595223426818848, + "num_tokens": 104994780.0, + "step": 2749 + }, + { + "epoch": 0.34982826612390283, + "ewc_loss": 0.03253757953643799, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00012640116619877517, + "grad_norm": 4.683279514312744, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.861138105392456, + "num_tokens": 105029469.0, + "step": 2750 + }, + { + "epoch": 0.3499554764024933, + "ewc_loss": 0.03213378041982651, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00012236319889780134, + "grad_norm": 4.63863468170166, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.847109854221344, + "num_tokens": 105066797.0, + "step": 2751 + }, + { + "epoch": 0.35008268668108383, + "ewc_loss": 0.03202398121356964, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00012126521323807538, + "grad_norm": 4.634352207183838, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8694891929626465, + "num_tokens": 105100869.0, + "step": 2752 + }, + { + "epoch": 0.35020989695967436, + "ewc_loss": 0.03194626420736313, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00012048805365338922, + "grad_norm": 4.53600549697876, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8671848177909851, + "num_tokens": 105135409.0, + "step": 2753 + }, + { + "epoch": 0.35033710723826483, + "ewc_loss": 0.03180282935500145, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011905367864528671, + "grad_norm": 4.610891342163086, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8614970445632935, + "num_tokens": 105172839.0, + "step": 2754 + }, + { + "epoch": 0.35046431751685536, + "ewc_loss": 0.031807973980903625, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011910513421753421, + "grad_norm": 4.488389492034912, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8606200814247131, + "num_tokens": 105216656.0, + "step": 2755 + }, + { + "epoch": 0.3505915277954459, + "ewc_loss": 0.03166870027780533, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011771239223890007, + "grad_norm": 4.552215576171875, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8484749794006348, + "num_tokens": 105253740.0, + "step": 2756 + }, + { + "epoch": 0.35071873807403636, + "ewc_loss": 0.03173277899622917, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011835319310193881, + "grad_norm": 4.519164562225342, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8574155569076538, + "num_tokens": 105290073.0, + "step": 2757 + }, + { + "epoch": 0.3508459483526269, + "ewc_loss": 0.0316275954246521, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011730135884135962, + "grad_norm": 4.504444599151611, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.861864447593689, + "num_tokens": 105325933.0, + "step": 2758 + }, + { + "epoch": 0.3509731586312174, + "ewc_loss": 0.03163507580757141, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011737614113371819, + "grad_norm": 4.49030065536499, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8586194515228271, + "num_tokens": 105362962.0, + "step": 2759 + }, + { + "epoch": 0.3511003689098079, + "ewc_loss": 0.03162898123264313, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011731521954061463, + "grad_norm": 4.458006858825684, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8553038239479065, + "num_tokens": 105410074.0, + "step": 2760 + }, + { + "epoch": 0.3512275791883984, + "ewc_loss": 0.03160874545574188, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011711285333149135, + "grad_norm": 4.500358581542969, + "learning_rate": 1e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8399794101715088, + "num_tokens": 105450264.0, + "step": 2761 + }, + { + "epoch": 0.35135478946698895, + "ewc_loss": 0.03163296729326248, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011735506996046752, + "grad_norm": 4.513068675994873, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8459696769714355, + "num_tokens": 105486566.0, + "step": 2762 + }, + { + "epoch": 0.3514819997455794, + "ewc_loss": 0.03159848228096962, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.0001170102259493433, + "grad_norm": 4.473388195037842, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8593190908432007, + "num_tokens": 105527185.0, + "step": 2763 + }, + { + "epoch": 0.35160921002416995, + "ewc_loss": 0.031320810317993164, + "ewc_loss_diag": 1.9669532775878906e-05, + "ewc_loss_parallel": 0.00011667490616673604, + "grad_norm": 4.558531761169434, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8535599708557129, + "num_tokens": 105557401.0, + "step": 2764 + }, + { + "epoch": 0.3517364203027605, + "ewc_loss": 0.03160709887742996, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011709636601153761, + "grad_norm": 4.532504558563232, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8627367615699768, + "num_tokens": 105586877.0, + "step": 2765 + }, + { + "epoch": 0.35186363058135095, + "ewc_loss": 0.03156086802482605, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.0001166340516647324, + "grad_norm": 4.455769062042236, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8600335121154785, + "num_tokens": 105626988.0, + "step": 2766 + }, + { + "epoch": 0.3519908408599415, + "ewc_loss": 0.031560979783535004, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011663518671412021, + "grad_norm": 4.49879789352417, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.846542477607727, + "num_tokens": 105665796.0, + "step": 2767 + }, + { + "epoch": 0.352118051138532, + "ewc_loss": 0.03161075338721275, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011713292042259127, + "grad_norm": 4.427379608154297, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8482558727264404, + "num_tokens": 105708892.0, + "step": 2768 + }, + { + "epoch": 0.3522452614171225, + "ewc_loss": 0.03179185837507248, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011650256055872887, + "grad_norm": 4.427478790283203, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8492898344993591, + "num_tokens": 105748221.0, + "step": 2769 + }, + { + "epoch": 0.352372471695713, + "ewc_loss": 0.031614065170288086, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011716603330569342, + "grad_norm": 4.5023417472839355, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8528079986572266, + "num_tokens": 105784568.0, + "step": 2770 + }, + { + "epoch": 0.35249968197430354, + "ewc_loss": 0.03162669762969017, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011729235848179087, + "grad_norm": 4.440670967102051, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8424463272094727, + "num_tokens": 105827223.0, + "step": 2771 + }, + { + "epoch": 0.352626892252894, + "ewc_loss": 0.03157271444797516, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.0001167525551863946, + "grad_norm": 4.426168918609619, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8730121850967407, + "num_tokens": 105871014.0, + "step": 2772 + }, + { + "epoch": 0.35275410253148454, + "ewc_loss": 0.031866803765296936, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011725201329682022, + "grad_norm": 4.56246280670166, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8504219055175781, + "num_tokens": 105908995.0, + "step": 2773 + }, + { + "epoch": 0.35288131281007507, + "ewc_loss": 0.031899355351924896, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.0001175775469164364, + "grad_norm": 4.47250509262085, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8488510847091675, + "num_tokens": 105948243.0, + "step": 2774 + }, + { + "epoch": 0.35300852308866554, + "ewc_loss": 0.03180074691772461, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011659145093290135, + "grad_norm": 4.502482891082764, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8574216365814209, + "num_tokens": 105983997.0, + "step": 2775 + }, + { + "epoch": 0.35313573336725607, + "ewc_loss": 0.03160609304904938, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.00011708632519003004, + "grad_norm": 4.527744770050049, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8417274951934814, + "num_tokens": 106020336.0, + "step": 2776 + }, + { + "epoch": 0.3532629436458466, + "ewc_loss": 0.03182254731655121, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011680947500281036, + "grad_norm": 4.4918975830078125, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8559777736663818, + "num_tokens": 106053300.0, + "step": 2777 + }, + { + "epoch": 0.35339015392443707, + "ewc_loss": 0.03182687982916832, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011685279605444521, + "grad_norm": 4.399829864501953, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8696885704994202, + "num_tokens": 106091790.0, + "step": 2778 + }, + { + "epoch": 0.3535173642030276, + "ewc_loss": 0.03182793781161308, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011686337529681623, + "grad_norm": 4.539758205413818, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8573175668716431, + "num_tokens": 106123768.0, + "step": 2779 + }, + { + "epoch": 0.3536445744816181, + "ewc_loss": 0.03167494386434555, + "ewc_loss_diag": 1.990795135498047e-05, + "ewc_loss_parallel": 0.0001177748417831026, + "grad_norm": 4.484500408172607, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8623443245887756, + "num_tokens": 106158219.0, + "step": 2780 + }, + { + "epoch": 0.35377178476020865, + "ewc_loss": 0.03185846656560898, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011716863082256168, + "grad_norm": 4.50435209274292, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8506540060043335, + "num_tokens": 106195795.0, + "step": 2781 + }, + { + "epoch": 0.3538989950387991, + "ewc_loss": 0.03190930560231209, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011767704563681036, + "grad_norm": 4.506014823913574, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8554922342300415, + "num_tokens": 106234973.0, + "step": 2782 + }, + { + "epoch": 0.35402620531738965, + "ewc_loss": 0.03190279379487038, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011761191126424819, + "grad_norm": 4.564794063568115, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8407459855079651, + "num_tokens": 106267297.0, + "step": 2783 + }, + { + "epoch": 0.3541534155959802, + "ewc_loss": 0.03195224702358246, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011810644355136901, + "grad_norm": 4.415686130523682, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8621900677680969, + "num_tokens": 106309070.0, + "step": 2784 + }, + { + "epoch": 0.35428062587457065, + "ewc_loss": 0.031875722110271454, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011734119470929727, + "grad_norm": 4.498042583465576, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8590997457504272, + "num_tokens": 106345784.0, + "step": 2785 + }, + { + "epoch": 0.3544078361531612, + "ewc_loss": 0.0319819301366806, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011840328079415485, + "grad_norm": 4.538356304168701, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.863303542137146, + "num_tokens": 106383172.0, + "step": 2786 + }, + { + "epoch": 0.3545350464317517, + "ewc_loss": 0.031921595335006714, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011779995111282915, + "grad_norm": 4.532247543334961, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.853117048740387, + "num_tokens": 106416916.0, + "step": 2787 + }, + { + "epoch": 0.3546622567103422, + "ewc_loss": 0.03189525753259659, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011753653961932287, + "grad_norm": 4.439741611480713, + "learning_rate": 1e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8400743007659912, + "num_tokens": 106456969.0, + "step": 2788 + }, + { + "epoch": 0.3547894669889327, + "ewc_loss": 0.03191604092717171, + "ewc_loss_diag": 2.014636993408203e-05, + "ewc_loss_parallel": 0.00011774439190048724, + "grad_norm": 4.568563938140869, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8525103330612183, + "num_tokens": 106490476.0, + "step": 2789 + }, + { + "epoch": 0.35491667726752324, + "ewc_loss": 0.032114848494529724, + "ewc_loss_diag": 2.0265579223632812e-05, + "ewc_loss_parallel": 0.00011851176532218233, + "grad_norm": 4.535286903381348, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8545773029327393, + "num_tokens": 106525990.0, + "step": 2790 + }, + { + "epoch": 0.3550438875461137, + "ewc_loss": 0.032018933445215225, + "ewc_loss_diag": 2.0265579223632812e-05, + "ewc_loss_parallel": 0.00011755261948565021, + "grad_norm": 4.543514251708984, + "learning_rate": 1e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8297436237335205, + "num_tokens": 106562953.0, + "step": 2791 + }, + { + "epoch": 0.35517109782470424, + "ewc_loss": 0.03208381310105324, + "ewc_loss_diag": 2.0265579223632812e-05, + "ewc_loss_parallel": 0.00011820142390206456, + "grad_norm": 4.480993270874023, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8573140501976013, + "num_tokens": 106599142.0, + "step": 2792 + }, + { + "epoch": 0.35529830810329477, + "ewc_loss": 0.03206653147935867, + "ewc_loss_diag": 2.0265579223632812e-05, + "ewc_loss_parallel": 0.00011802860535681248, + "grad_norm": 4.420134544372559, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8669686317443848, + "num_tokens": 106636743.0, + "step": 2793 + }, + { + "epoch": 0.35542551838188524, + "ewc_loss": 0.0320504829287529, + "ewc_loss_diag": 2.0265579223632812e-05, + "ewc_loss_parallel": 0.00011786809045588598, + "grad_norm": 4.407003402709961, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8593563437461853, + "num_tokens": 106678992.0, + "step": 2794 + }, + { + "epoch": 0.35555272866047577, + "ewc_loss": 0.032105423510074615, + "ewc_loss_diag": 2.0265579223632812e-05, + "ewc_loss_parallel": 0.00011841749801533297, + "grad_norm": 4.510638236999512, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8453950881958008, + "num_tokens": 106719884.0, + "step": 2795 + }, + { + "epoch": 0.3556799389390663, + "ewc_loss": 0.03213544189929962, + "ewc_loss_diag": 2.0265579223632812e-05, + "ewc_loss_parallel": 0.00011871770402649418, + "grad_norm": 4.454854488372803, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8510711789131165, + "num_tokens": 106755224.0, + "step": 2796 + }, + { + "epoch": 0.35580714921765677, + "ewc_loss": 0.032225675880908966, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011839932267321274, + "grad_norm": 4.480379104614258, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8665260076522827, + "num_tokens": 106789366.0, + "step": 2797 + }, + { + "epoch": 0.3559343594962473, + "ewc_loss": 0.03238455951213837, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011876748612849042, + "grad_norm": 4.520606517791748, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.846410870552063, + "num_tokens": 106826382.0, + "step": 2798 + }, + { + "epoch": 0.3560615697748378, + "ewc_loss": 0.032237708568573, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.0001185196524602361, + "grad_norm": 4.489120006561279, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8474773168563843, + "num_tokens": 106860027.0, + "step": 2799 + }, + { + "epoch": 0.3561887800534283, + "ewc_loss": 0.032207950949668884, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011822207306977361, + "grad_norm": 4.480446815490723, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8634436130523682, + "num_tokens": 106898684.0, + "step": 2800 + }, + { + "epoch": 0.35631599033201883, + "ewc_loss": 0.0322340652346611, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.0001184832290164195, + "grad_norm": 4.52288818359375, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8551497459411621, + "num_tokens": 106934496.0, + "step": 2801 + }, + { + "epoch": 0.35644320061060936, + "ewc_loss": 0.03238019719719887, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011872384493472055, + "grad_norm": 4.4181294441223145, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.86860191822052, + "num_tokens": 106974335.0, + "step": 2802 + }, + { + "epoch": 0.35657041088919983, + "ewc_loss": 0.03232456371188164, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011816751793958247, + "grad_norm": 4.550364017486572, + "learning_rate": 1e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8304107189178467, + "num_tokens": 107013914.0, + "step": 2803 + }, + { + "epoch": 0.35669762116779036, + "ewc_loss": 0.03243347257375717, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011925659055123106, + "grad_norm": 4.425130367279053, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8588297367095947, + "num_tokens": 107053747.0, + "step": 2804 + }, + { + "epoch": 0.3568248314463809, + "ewc_loss": 0.03227600082755089, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001176818841486238, + "grad_norm": 4.449012279510498, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.845011293888092, + "num_tokens": 107095245.0, + "step": 2805 + }, + { + "epoch": 0.35695204172497136, + "ewc_loss": 0.03238840773701668, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011880594684043899, + "grad_norm": 4.475048065185547, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8545545339584351, + "num_tokens": 107133627.0, + "step": 2806 + }, + { + "epoch": 0.3570792520035619, + "ewc_loss": 0.03236308693885803, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011855274351546541, + "grad_norm": 4.479322910308838, + "learning_rate": 1e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8298377990722656, + "num_tokens": 107170128.0, + "step": 2807 + }, + { + "epoch": 0.3572064622821524, + "ewc_loss": 0.03234446793794632, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011836653720820323, + "grad_norm": 4.422417640686035, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8650714159011841, + "num_tokens": 107209137.0, + "step": 2808 + }, + { + "epoch": 0.3573336725607429, + "ewc_loss": 0.03235336393117905, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011845550761790946, + "grad_norm": 4.518686294555664, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8497861623764038, + "num_tokens": 107244866.0, + "step": 2809 + }, + { + "epoch": 0.3574608828393334, + "ewc_loss": 0.032404571771621704, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011896757496288046, + "grad_norm": 4.452451229095459, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8553962707519531, + "num_tokens": 107287399.0, + "step": 2810 + }, + { + "epoch": 0.35758809311792394, + "ewc_loss": 0.03224257752299309, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011856835772050545, + "grad_norm": 4.496545791625977, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8454651832580566, + "num_tokens": 107325627.0, + "step": 2811 + }, + { + "epoch": 0.3577153033965144, + "ewc_loss": 0.032399795949459076, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001189198374049738, + "grad_norm": 4.498733997344971, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8696063756942749, + "num_tokens": 107359375.0, + "step": 2812 + }, + { + "epoch": 0.35784251367510495, + "ewc_loss": 0.032368045300245285, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011860232189064845, + "grad_norm": 4.465023994445801, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8421874642372131, + "num_tokens": 107400286.0, + "step": 2813 + }, + { + "epoch": 0.3579697239536955, + "ewc_loss": 0.032513029873371124, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00011883149272762239, + "grad_norm": 4.578798770904541, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8455970883369446, + "num_tokens": 107431768.0, + "step": 2814 + }, + { + "epoch": 0.35809693423228595, + "ewc_loss": 0.032428424805402756, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011920610995730385, + "grad_norm": 4.465486526489258, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.864262044429779, + "num_tokens": 107470875.0, + "step": 2815 + }, + { + "epoch": 0.3582241445108765, + "ewc_loss": 0.03233099356293678, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.000118231815577019, + "grad_norm": 4.5022196769714355, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8458263278007507, + "num_tokens": 107510988.0, + "step": 2816 + }, + { + "epoch": 0.358351354789467, + "ewc_loss": 0.032399579882621765, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011891769099747762, + "grad_norm": 4.440584659576416, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8610333204269409, + "num_tokens": 107552423.0, + "step": 2817 + }, + { + "epoch": 0.3584785650680575, + "ewc_loss": 0.03235704451799393, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011849232396343723, + "grad_norm": 4.528276443481445, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8473031520843506, + "num_tokens": 107591693.0, + "step": 2818 + }, + { + "epoch": 0.358605775346648, + "ewc_loss": 0.0323847234249115, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011876909411512315, + "grad_norm": 4.485602855682373, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.858838677406311, + "num_tokens": 107626017.0, + "step": 2819 + }, + { + "epoch": 0.35873298562523853, + "ewc_loss": 0.0323634147644043, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001185560249723494, + "grad_norm": 4.5003275871276855, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8708018064498901, + "num_tokens": 107664927.0, + "step": 2820 + }, + { + "epoch": 0.358860195903829, + "ewc_loss": 0.032393839210271835, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001188602764159441, + "grad_norm": 4.565160751342773, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8464815616607666, + "num_tokens": 107702487.0, + "step": 2821 + }, + { + "epoch": 0.35898740618241953, + "ewc_loss": 0.032389990985393524, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011882177932420745, + "grad_norm": 4.425118446350098, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8676655292510986, + "num_tokens": 107741263.0, + "step": 2822 + }, + { + "epoch": 0.35911461646101006, + "ewc_loss": 0.03233486786484718, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011827056005131453, + "grad_norm": 4.525446891784668, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8570536375045776, + "num_tokens": 107781079.0, + "step": 2823 + }, + { + "epoch": 0.35924182673960053, + "ewc_loss": 0.03239873796701431, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011890923633472994, + "grad_norm": 4.429815292358398, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8732399940490723, + "num_tokens": 107819309.0, + "step": 2824 + }, + { + "epoch": 0.35936903701819106, + "ewc_loss": 0.032317154109478, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001180934050353244, + "grad_norm": 4.514785289764404, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8567061424255371, + "num_tokens": 107853251.0, + "step": 2825 + }, + { + "epoch": 0.3594962472967816, + "ewc_loss": 0.03229016810655594, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011904425628017634, + "grad_norm": 4.504292011260986, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8748109340667725, + "num_tokens": 107893476.0, + "step": 2826 + }, + { + "epoch": 0.35962345757537206, + "ewc_loss": 0.03232689946889877, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011819086648756638, + "grad_norm": 4.442140579223633, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8643800020217896, + "num_tokens": 107931204.0, + "step": 2827 + }, + { + "epoch": 0.3597506678539626, + "ewc_loss": 0.03234238177537918, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011834569158963859, + "grad_norm": 4.535868167877197, + "learning_rate": 1e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8372562527656555, + "num_tokens": 107966471.0, + "step": 2828 + }, + { + "epoch": 0.3598778781325531, + "ewc_loss": 0.032399311661720276, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011891499161720276, + "grad_norm": 4.4305806159973145, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8561209440231323, + "num_tokens": 108005328.0, + "step": 2829 + }, + { + "epoch": 0.36000508841114365, + "ewc_loss": 0.032196082174777985, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011810340220108628, + "grad_norm": 4.506998538970947, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8434078097343445, + "num_tokens": 108046295.0, + "step": 2830 + }, + { + "epoch": 0.3601322986897341, + "ewc_loss": 0.03233017772436142, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011944437574129552, + "grad_norm": 4.564627170562744, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8565264940261841, + "num_tokens": 108078164.0, + "step": 2831 + }, + { + "epoch": 0.36025950896832465, + "ewc_loss": 0.03236885368824005, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011861042730743065, + "grad_norm": 4.443682670593262, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.849962055683136, + "num_tokens": 108117105.0, + "step": 2832 + }, + { + "epoch": 0.3603867192469152, + "ewc_loss": 0.03232520818710327, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011817397171398625, + "grad_norm": 4.4923248291015625, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8530175685882568, + "num_tokens": 108155301.0, + "step": 2833 + }, + { + "epoch": 0.36051392952550565, + "ewc_loss": 0.03241770714521408, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011909892782568932, + "grad_norm": 4.4890899658203125, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8647409081459045, + "num_tokens": 108193091.0, + "step": 2834 + }, + { + "epoch": 0.3606411398040962, + "ewc_loss": 0.03235035017132759, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001184253633255139, + "grad_norm": 4.475572109222412, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8527352809906006, + "num_tokens": 108233719.0, + "step": 2835 + }, + { + "epoch": 0.3607683500826867, + "ewc_loss": 0.032215096056461334, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011829353752546012, + "grad_norm": 4.509063243865967, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8618583679199219, + "num_tokens": 108265775.0, + "step": 2836 + }, + { + "epoch": 0.3608955603612772, + "ewc_loss": 0.03227749839425087, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011891756730619818, + "grad_norm": 4.440449237823486, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8646461963653564, + "num_tokens": 108307953.0, + "step": 2837 + }, + { + "epoch": 0.3610227706398677, + "ewc_loss": 0.03224416449666023, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011858421203214675, + "grad_norm": 4.490443706512451, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8511465787887573, + "num_tokens": 108351998.0, + "step": 2838 + }, + { + "epoch": 0.36114998091845824, + "ewc_loss": 0.03233438357710838, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011948642350034788, + "grad_norm": 4.47390079498291, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8416369557380676, + "num_tokens": 108395968.0, + "step": 2839 + }, + { + "epoch": 0.3612771911970487, + "ewc_loss": 0.032273147255182266, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011887404252775013, + "grad_norm": 4.503609657287598, + "learning_rate": 1e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8352190852165222, + "num_tokens": 108433045.0, + "step": 2840 + }, + { + "epoch": 0.36140440147563924, + "ewc_loss": 0.0323527567088604, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.0001196701341541484, + "grad_norm": 4.490944862365723, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8497560024261475, + "num_tokens": 108470964.0, + "step": 2841 + }, + { + "epoch": 0.36153161175422976, + "ewc_loss": 0.03229066729545593, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.0001190492621390149, + "grad_norm": 4.474792003631592, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8553537130355835, + "num_tokens": 108512572.0, + "step": 2842 + }, + { + "epoch": 0.36165882203282024, + "ewc_loss": 0.03230449557304382, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011918752716155723, + "grad_norm": 4.503281593322754, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8403956890106201, + "num_tokens": 108548526.0, + "step": 2843 + }, + { + "epoch": 0.36178603231141077, + "ewc_loss": 0.032362040132284164, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011976298992522061, + "grad_norm": 4.484989166259766, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8518916368484497, + "num_tokens": 108588447.0, + "step": 2844 + }, + { + "epoch": 0.3619132425900013, + "ewc_loss": 0.03230462223291397, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011918880773009732, + "grad_norm": 4.471407890319824, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8444161415100098, + "num_tokens": 108628426.0, + "step": 2845 + }, + { + "epoch": 0.36204045286859177, + "ewc_loss": 0.03235359117388725, + "ewc_loss_diag": 2.0384788513183594e-05, + "ewc_loss_parallel": 0.00011967850150540471, + "grad_norm": 4.568778038024902, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8498018383979797, + "num_tokens": 108661355.0, + "step": 2846 + }, + { + "epoch": 0.3621676631471823, + "ewc_loss": 0.03251264989376068, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001200483602588065, + "grad_norm": 4.4704155921936035, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8714555501937866, + "num_tokens": 108698505.0, + "step": 2847 + }, + { + "epoch": 0.3622948734257728, + "ewc_loss": 0.032483406364917755, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011975593224633485, + "grad_norm": 4.504419326782227, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8600754737854004, + "num_tokens": 108737409.0, + "step": 2848 + }, + { + "epoch": 0.3624220837043633, + "ewc_loss": 0.03250995650887489, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012002145376754925, + "grad_norm": 4.47627592086792, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8554027080535889, + "num_tokens": 108772969.0, + "step": 2849 + }, + { + "epoch": 0.3625492939829538, + "ewc_loss": 0.0325046181678772, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.0001199680773424916, + "grad_norm": 4.555517196655273, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8610013723373413, + "num_tokens": 108805129.0, + "step": 2850 + }, + { + "epoch": 0.36267650426154435, + "ewc_loss": 0.03256066143512726, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012052847887389362, + "grad_norm": 4.512784481048584, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8397698998451233, + "num_tokens": 108841197.0, + "step": 2851 + }, + { + "epoch": 0.3628037145401348, + "ewc_loss": 0.032528169453144073, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012020357098663226, + "grad_norm": 4.528002738952637, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8685681819915771, + "num_tokens": 108876442.0, + "step": 2852 + }, + { + "epoch": 0.36293092481872535, + "ewc_loss": 0.03255375847220421, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012045946641592309, + "grad_norm": 4.555116653442383, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8415539264678955, + "num_tokens": 108911432.0, + "step": 2853 + }, + { + "epoch": 0.3630581350973159, + "ewc_loss": 0.03255784511566162, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012050032091792673, + "grad_norm": 4.487028121948242, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8412946462631226, + "num_tokens": 108952977.0, + "step": 2854 + }, + { + "epoch": 0.36318534537590635, + "ewc_loss": 0.032533563673496246, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012025749310851097, + "grad_norm": 4.505187511444092, + "learning_rate": 1e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8323028087615967, + "num_tokens": 108995602.0, + "step": 2855 + }, + { + "epoch": 0.3633125556544969, + "ewc_loss": 0.032583266496658325, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012075452832505107, + "grad_norm": 4.452639102935791, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8568338751792908, + "num_tokens": 109040226.0, + "step": 2856 + }, + { + "epoch": 0.3634397659330874, + "ewc_loss": 0.032650839537382126, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012020956637570634, + "grad_norm": 4.528188705444336, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8565165996551514, + "num_tokens": 109082898.0, + "step": 2857 + }, + { + "epoch": 0.3635669762116779, + "ewc_loss": 0.03260943666100502, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012101623724447563, + "grad_norm": 4.504730701446533, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8457919955253601, + "num_tokens": 109122460.0, + "step": 2858 + }, + { + "epoch": 0.3636941864902684, + "ewc_loss": 0.03256801515817642, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012060202425345778, + "grad_norm": 4.522087097167969, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8542493581771851, + "num_tokens": 109161680.0, + "step": 2859 + }, + { + "epoch": 0.36382139676885894, + "ewc_loss": 0.03257960081100464, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012071788660250604, + "grad_norm": 4.546059608459473, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8732777833938599, + "num_tokens": 109198943.0, + "step": 2860 + }, + { + "epoch": 0.3639486070474494, + "ewc_loss": 0.03257005661725998, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012062245514243841, + "grad_norm": 4.500910758972168, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8615014553070068, + "num_tokens": 109235719.0, + "step": 2861 + }, + { + "epoch": 0.36407581732603994, + "ewc_loss": 0.032542139291763306, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012034325482090935, + "grad_norm": 4.538468837738037, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8475538492202759, + "num_tokens": 109272681.0, + "step": 2862 + }, + { + "epoch": 0.36420302760463047, + "ewc_loss": 0.03257656842470169, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012068757496308535, + "grad_norm": 4.5700273513793945, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8420664072036743, + "num_tokens": 109306114.0, + "step": 2863 + }, + { + "epoch": 0.36433023788322094, + "ewc_loss": 0.03257495164871216, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012067137868143618, + "grad_norm": 4.55492639541626, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8462352752685547, + "num_tokens": 109337728.0, + "step": 2864 + }, + { + "epoch": 0.36445744816181147, + "ewc_loss": 0.032564856112003326, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012057041749358177, + "grad_norm": 4.488185882568359, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8459516167640686, + "num_tokens": 109377809.0, + "step": 2865 + }, + { + "epoch": 0.364584658440402, + "ewc_loss": 0.03256971389055252, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012061899906257167, + "grad_norm": 4.490189552307129, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8679647445678711, + "num_tokens": 109413499.0, + "step": 2866 + }, + { + "epoch": 0.36471186871899247, + "ewc_loss": 0.03258552774786949, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012077714200131595, + "grad_norm": 4.484163761138916, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8440770506858826, + "num_tokens": 109460961.0, + "step": 2867 + }, + { + "epoch": 0.364839078997583, + "ewc_loss": 0.03258150815963745, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012073697143932804, + "grad_norm": 4.463562488555908, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8656330704689026, + "num_tokens": 109500615.0, + "step": 2868 + }, + { + "epoch": 0.36496628927617353, + "ewc_loss": 0.032606128603219986, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012098316074116156, + "grad_norm": 4.586804389953613, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8569915890693665, + "num_tokens": 109533921.0, + "step": 2869 + }, + { + "epoch": 0.365093499554764, + "ewc_loss": 0.032639991492033005, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012132179836044088, + "grad_norm": 4.581329345703125, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8441975116729736, + "num_tokens": 109567069.0, + "step": 2870 + }, + { + "epoch": 0.36522070983335453, + "ewc_loss": 0.03257162868976593, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012063814210705459, + "grad_norm": 4.520706653594971, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8676204681396484, + "num_tokens": 109603696.0, + "step": 2871 + }, + { + "epoch": 0.36534792011194506, + "ewc_loss": 0.03269067406654358, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012060790322721004, + "grad_norm": 6.253238677978516, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8718043565750122, + "num_tokens": 109644590.0, + "step": 2872 + }, + { + "epoch": 0.36547513039053553, + "ewc_loss": 0.03398483246564865, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00013477022002916783, + "grad_norm": 4.731669902801514, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8489842414855957, + "num_tokens": 109678869.0, + "step": 2873 + }, + { + "epoch": 0.36560234066912606, + "ewc_loss": 0.032005950808525085, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00011498137610033154, + "grad_norm": 4.341357231140137, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.849501371383667, + "num_tokens": 109722517.0, + "step": 2874 + }, + { + "epoch": 0.3657295509477166, + "ewc_loss": 0.03264714032411575, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012139329919591546, + "grad_norm": 4.601461410522461, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8509207367897034, + "num_tokens": 109757869.0, + "step": 2875 + }, + { + "epoch": 0.36585676122630706, + "ewc_loss": 0.032728441059589386, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001209855981869623, + "grad_norm": 4.58369255065918, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8499995470046997, + "num_tokens": 109793162.0, + "step": 2876 + }, + { + "epoch": 0.3659839715048976, + "ewc_loss": 0.032639507204294205, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012009623605990782, + "grad_norm": 4.517070770263672, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8611781597137451, + "num_tokens": 109832942.0, + "step": 2877 + }, + { + "epoch": 0.3661111817834881, + "ewc_loss": 0.03268349915742874, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001205361622851342, + "grad_norm": 4.534979343414307, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8529165983200073, + "num_tokens": 109876230.0, + "step": 2878 + }, + { + "epoch": 0.3662383920620786, + "ewc_loss": 0.03265970200300217, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012029819481540471, + "grad_norm": 4.52418851852417, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.845038115978241, + "num_tokens": 109911844.0, + "step": 2879 + }, + { + "epoch": 0.3663656023406691, + "ewc_loss": 0.03268609195947647, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001205620719701983, + "grad_norm": 6.282310962677002, + "learning_rate": 1e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8422428369522095, + "num_tokens": 109957619.0, + "step": 2880 + }, + { + "epoch": 0.36649281261925964, + "ewc_loss": 0.03409009054303169, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00013460207264870405, + "grad_norm": 4.693435192108154, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8437460660934448, + "num_tokens": 109999117.0, + "step": 2881 + }, + { + "epoch": 0.3666200228978502, + "ewc_loss": 0.03217647224664688, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001154658748419024, + "grad_norm": 4.45879602432251, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8611629605293274, + "num_tokens": 110039557.0, + "step": 2882 + }, + { + "epoch": 0.36674723317644065, + "ewc_loss": 0.03286147117614746, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012231590517330915, + "grad_norm": 4.605320453643799, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.846166729927063, + "num_tokens": 110074418.0, + "step": 2883 + }, + { + "epoch": 0.3668744434550312, + "ewc_loss": 0.03270125761628151, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012071375385858119, + "grad_norm": 4.531563758850098, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.859149158000946, + "num_tokens": 110109255.0, + "step": 2884 + }, + { + "epoch": 0.3670016537336217, + "ewc_loss": 0.032642971724271774, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001201308987219818, + "grad_norm": 4.508512496948242, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8605771660804749, + "num_tokens": 110150180.0, + "step": 2885 + }, + { + "epoch": 0.3671288640122122, + "ewc_loss": 0.032731108367443085, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012101225729566067, + "grad_norm": 4.562035083770752, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8381919860839844, + "num_tokens": 110193653.0, + "step": 2886 + }, + { + "epoch": 0.3672560742908027, + "ewc_loss": 0.03270350769162178, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012073625111952424, + "grad_norm": 4.527590751647949, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8618578910827637, + "num_tokens": 110231020.0, + "step": 2887 + }, + { + "epoch": 0.36738328456939323, + "ewc_loss": 0.032564111053943634, + "ewc_loss_diag": 2.0503997802734375e-05, + "ewc_loss_parallel": 0.00012056298874085769, + "grad_norm": 4.509657859802246, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8699181079864502, + "num_tokens": 110269397.0, + "step": 2888 + }, + { + "epoch": 0.3675104948479837, + "ewc_loss": 0.03270076960325241, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001207088862429373, + "grad_norm": 4.562594890594482, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8690396547317505, + "num_tokens": 110311188.0, + "step": 2889 + }, + { + "epoch": 0.36763770512657423, + "ewc_loss": 0.032846543937921524, + "ewc_loss_diag": 2.0742416381835938e-05, + "ewc_loss_parallel": 0.00012094591511413455, + "grad_norm": 4.894649982452393, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8594578504562378, + "num_tokens": 110343801.0, + "step": 2890 + }, + { + "epoch": 0.36776491540516476, + "ewc_loss": 0.03287462517619133, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012244742538314313, + "grad_norm": 4.519082546234131, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8381642699241638, + "num_tokens": 110385830.0, + "step": 2891 + }, + { + "epoch": 0.36789212568375523, + "ewc_loss": 0.032557591795921326, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00011927710147574544, + "grad_norm": 4.617212772369385, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8538476824760437, + "num_tokens": 110421494.0, + "step": 2892 + }, + { + "epoch": 0.36801933596234576, + "ewc_loss": 0.03277180716395378, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012141925253672525, + "grad_norm": 4.482447624206543, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8532558083534241, + "num_tokens": 110463132.0, + "step": 2893 + }, + { + "epoch": 0.3681465462409363, + "ewc_loss": 0.03263860195875168, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001200871920445934, + "grad_norm": 4.585447311401367, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8568224906921387, + "num_tokens": 110498330.0, + "step": 2894 + }, + { + "epoch": 0.36827375651952676, + "ewc_loss": 0.032762832939624786, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012132951087551191, + "grad_norm": 4.505687713623047, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8627310395240784, + "num_tokens": 110534438.0, + "step": 2895 + }, + { + "epoch": 0.3684009667981173, + "ewc_loss": 0.0326816663146019, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012051783414790407, + "grad_norm": 4.5457353591918945, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8496440649032593, + "num_tokens": 110572059.0, + "step": 2896 + }, + { + "epoch": 0.3685281770767078, + "ewc_loss": 0.03276071697473526, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012130834511481225, + "grad_norm": 4.52458381652832, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8563376665115356, + "num_tokens": 110611012.0, + "step": 2897 + }, + { + "epoch": 0.3686553873552983, + "ewc_loss": 0.03274761885404587, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012117735604988411, + "grad_norm": 4.546672821044922, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8621348142623901, + "num_tokens": 110647429.0, + "step": 2898 + }, + { + "epoch": 0.3687825976338888, + "ewc_loss": 0.032742928713560104, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012113045522710308, + "grad_norm": 4.5332183837890625, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8480043411254883, + "num_tokens": 110685052.0, + "step": 2899 + }, + { + "epoch": 0.36890980791247935, + "ewc_loss": 0.03277163952589035, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001214175863424316, + "grad_norm": 4.597459316253662, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8349020481109619, + "num_tokens": 110725222.0, + "step": 2900 + }, + { + "epoch": 0.3690370181910698, + "ewc_loss": 0.03276374191045761, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012133857671869919, + "grad_norm": 4.547728538513184, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8431496620178223, + "num_tokens": 110764314.0, + "step": 2901 + }, + { + "epoch": 0.36916422846966035, + "ewc_loss": 0.03274218365550041, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012112299737054855, + "grad_norm": 4.477299213409424, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8665007948875427, + "num_tokens": 110803585.0, + "step": 2902 + }, + { + "epoch": 0.3692914387482509, + "ewc_loss": 0.03272930905222893, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012099425657652318, + "grad_norm": 4.513993263244629, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8494740128517151, + "num_tokens": 110846532.0, + "step": 2903 + }, + { + "epoch": 0.36941864902684135, + "ewc_loss": 0.03290955722332001, + "ewc_loss_diag": 2.0742416381835938e-05, + "ewc_loss_parallel": 0.00012157602031948045, + "grad_norm": 4.509562015533447, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8522598743438721, + "num_tokens": 110889807.0, + "step": 2904 + }, + { + "epoch": 0.3695458593054319, + "ewc_loss": 0.032785724848508835, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012155842705396935, + "grad_norm": 4.5065388679504395, + "learning_rate": 1e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.8354915380477905, + "num_tokens": 110930567.0, + "step": 2905 + }, + { + "epoch": 0.3696730695840224, + "ewc_loss": 0.03280632942914963, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012176448944956064, + "grad_norm": 4.532164096832275, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8616366386413574, + "num_tokens": 110968725.0, + "step": 2906 + }, + { + "epoch": 0.3698002798626129, + "ewc_loss": 0.03282422572374344, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012194341979920864, + "grad_norm": 4.5093278884887695, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.878775417804718, + "num_tokens": 111002724.0, + "step": 2907 + }, + { + "epoch": 0.3699274901412034, + "ewc_loss": 0.03279900550842285, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.0001216912132804282, + "grad_norm": 4.52869987487793, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8537580966949463, + "num_tokens": 111036668.0, + "step": 2908 + }, + { + "epoch": 0.37005470041979394, + "ewc_loss": 0.0328422486782074, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012212367437314242, + "grad_norm": 4.534917831420898, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8447847366333008, + "num_tokens": 111077712.0, + "step": 2909 + }, + { + "epoch": 0.3701819106983844, + "ewc_loss": 0.03285730630159378, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012227425759192556, + "grad_norm": 4.537239074707031, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8453426361083984, + "num_tokens": 111118856.0, + "step": 2910 + }, + { + "epoch": 0.37030912097697494, + "ewc_loss": 0.03298445791006088, + "ewc_loss_diag": 2.0742416381835938e-05, + "ewc_loss_parallel": 0.00012232504377607256, + "grad_norm": 4.521622657775879, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8464952111244202, + "num_tokens": 111156712.0, + "step": 2911 + }, + { + "epoch": 0.37043633125556547, + "ewc_loss": 0.032870933413505554, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012241049262229353, + "grad_norm": 4.543987274169922, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8561844229698181, + "num_tokens": 111195542.0, + "step": 2912 + }, + { + "epoch": 0.37056354153415594, + "ewc_loss": 0.032887186855077744, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012257305206730962, + "grad_norm": 4.5062642097473145, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8358514308929443, + "num_tokens": 111237827.0, + "step": 2913 + }, + { + "epoch": 0.37069075181274647, + "ewc_loss": 0.03286472335457802, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012234841415192932, + "grad_norm": 4.529399871826172, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8623192310333252, + "num_tokens": 111278802.0, + "step": 2914 + }, + { + "epoch": 0.370817962091337, + "ewc_loss": 0.03292224183678627, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012292357860133052, + "grad_norm": 4.574792861938477, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8515040874481201, + "num_tokens": 111313795.0, + "step": 2915 + }, + { + "epoch": 0.37094517236992747, + "ewc_loss": 0.03289666026830673, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012266775593161583, + "grad_norm": 4.476778507232666, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8612401485443115, + "num_tokens": 111353798.0, + "step": 2916 + }, + { + "epoch": 0.371072382648518, + "ewc_loss": 0.032887693494558334, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012257810158189386, + "grad_norm": 4.550891399383545, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8538535833358765, + "num_tokens": 111390613.0, + "step": 2917 + }, + { + "epoch": 0.3711995929271085, + "ewc_loss": 0.03292839974164963, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012298514775466174, + "grad_norm": 4.51278829574585, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8554700016975403, + "num_tokens": 111432043.0, + "step": 2918 + }, + { + "epoch": 0.371326803205699, + "ewc_loss": 0.032913751900196075, + "ewc_loss_diag": 2.0623207092285156e-05, + "ewc_loss_parallel": 0.00012283871183171868, + "grad_norm": 4.475388050079346, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8604521751403809, + "num_tokens": 111476575.0, + "step": 2919 + }, + { + "epoch": 0.3714540134842895, + "ewc_loss": 0.033175595104694366, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012301573588047177, + "grad_norm": 4.554562091827393, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.854708194732666, + "num_tokens": 111515304.0, + "step": 2920 + }, + { + "epoch": 0.37158122376288005, + "ewc_loss": 0.03320043906569481, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012326416617725044, + "grad_norm": 4.545877933502197, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.863932728767395, + "num_tokens": 111550246.0, + "step": 2921 + }, + { + "epoch": 0.3717084340414705, + "ewc_loss": 0.0331614688038826, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012287446588743478, + "grad_norm": 4.495316505432129, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8557958602905273, + "num_tokens": 111592189.0, + "step": 2922 + }, + { + "epoch": 0.37183564432006105, + "ewc_loss": 0.03315949812531471, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012285474804230034, + "grad_norm": 4.526163101196289, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8575185537338257, + "num_tokens": 111638177.0, + "step": 2923 + }, + { + "epoch": 0.3719628545986516, + "ewc_loss": 0.03316240385174751, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.0001228838082170114, + "grad_norm": 4.640745639801025, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8549104332923889, + "num_tokens": 111674365.0, + "step": 2924 + }, + { + "epoch": 0.37209006487724205, + "ewc_loss": 0.03319226950407028, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012318244262132794, + "grad_norm": 4.574046611785889, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.850019097328186, + "num_tokens": 111714571.0, + "step": 2925 + }, + { + "epoch": 0.3722172751558326, + "ewc_loss": 0.033147234469652176, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012273210450075567, + "grad_norm": 4.543212890625, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.854511022567749, + "num_tokens": 111755214.0, + "step": 2926 + }, + { + "epoch": 0.3723444854344231, + "ewc_loss": 0.0333692729473114, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012251109001226723, + "grad_norm": 11.843727111816406, + "learning_rate": 1e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8271398544311523, + "num_tokens": 111796516.0, + "step": 2927 + }, + { + "epoch": 0.3724716957130136, + "ewc_loss": 0.03946717455983162, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00018593150889500976, + "grad_norm": 5.85473108291626, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8394640684127808, + "num_tokens": 111832615.0, + "step": 2928 + }, + { + "epoch": 0.3725989059916041, + "ewc_loss": 0.032656267285346985, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00011782244109781459, + "grad_norm": 4.122490406036377, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8443328142166138, + "num_tokens": 111874682.0, + "step": 2929 + }, + { + "epoch": 0.37272611627019464, + "ewc_loss": 0.03448616713285446, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00013612143811769783, + "grad_norm": 5.287125587463379, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8418807983398438, + "num_tokens": 111916369.0, + "step": 2930 + }, + { + "epoch": 0.37285332654878517, + "ewc_loss": 0.03523924946784973, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00014365225797519088, + "grad_norm": 4.756906032562256, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8477333784103394, + "num_tokens": 111950671.0, + "step": 2931 + }, + { + "epoch": 0.37298053682737564, + "ewc_loss": 0.03349458798766136, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.0001262056321138516, + "grad_norm": 4.761219501495361, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8368873000144958, + "num_tokens": 111988846.0, + "step": 2932 + }, + { + "epoch": 0.37310774710596617, + "ewc_loss": 0.034174613654613495, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00013300590217113495, + "grad_norm": 4.793181419372559, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.86328125, + "num_tokens": 112022853.0, + "step": 2933 + }, + { + "epoch": 0.3732349573845567, + "ewc_loss": 0.03373105451464653, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.000128570303786546, + "grad_norm": 4.6510233879089355, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8503460884094238, + "num_tokens": 112065631.0, + "step": 2934 + }, + { + "epoch": 0.37336216766314717, + "ewc_loss": 0.03362244367599487, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012748422159347683, + "grad_norm": 4.682056427001953, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8440151810646057, + "num_tokens": 112107236.0, + "step": 2935 + }, + { + "epoch": 0.3734893779417377, + "ewc_loss": 0.03376275300979614, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012766661529894918, + "grad_norm": 7.816831588745117, + "learning_rate": 1e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8315361738204956, + "num_tokens": 112153914.0, + "step": 2936 + }, + { + "epoch": 0.3736165882203282, + "ewc_loss": 0.036581143736839294, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00015707117563579232, + "grad_norm": 5.237525939941406, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8570730090141296, + "num_tokens": 112187596.0, + "step": 2937 + }, + { + "epoch": 0.3737437984989187, + "ewc_loss": 0.03260321915149689, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00011729197285603732, + "grad_norm": 4.3377814292907715, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8527117371559143, + "num_tokens": 112222409.0, + "step": 2938 + }, + { + "epoch": 0.37387100877750923, + "ewc_loss": 0.033748842775821686, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.000128748215502128, + "grad_norm": 4.954111576080322, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8418920636177063, + "num_tokens": 112257699.0, + "step": 2939 + }, + { + "epoch": 0.37399821905609976, + "ewc_loss": 0.03388651832938194, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00013012494309805334, + "grad_norm": 4.585507869720459, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8617780208587646, + "num_tokens": 112292431.0, + "step": 2940 + }, + { + "epoch": 0.37412542933469023, + "ewc_loss": 0.03321564942598343, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012341627734713256, + "grad_norm": 4.67412805557251, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.855172872543335, + "num_tokens": 112331241.0, + "step": 2941 + }, + { + "epoch": 0.37425263961328076, + "ewc_loss": 0.03358875960111618, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012714734475594014, + "grad_norm": 4.663912296295166, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8501998782157898, + "num_tokens": 112368486.0, + "step": 2942 + }, + { + "epoch": 0.3743798498918713, + "ewc_loss": 0.033328186720609665, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012454163515940309, + "grad_norm": 4.626635551452637, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8449792861938477, + "num_tokens": 112409638.0, + "step": 2943 + }, + { + "epoch": 0.37450706017046176, + "ewc_loss": 0.033344678580760956, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012470653746277094, + "grad_norm": 4.6080241203308105, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8490138053894043, + "num_tokens": 112448257.0, + "step": 2944 + }, + { + "epoch": 0.3746342704490523, + "ewc_loss": 0.0332808718085289, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.0001240684650838375, + "grad_norm": 4.577910423278809, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.868964433670044, + "num_tokens": 112488739.0, + "step": 2945 + }, + { + "epoch": 0.3747614807276428, + "ewc_loss": 0.033282212913036346, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012408191105350852, + "grad_norm": 4.6025285720825195, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8384566903114319, + "num_tokens": 112528621.0, + "step": 2946 + }, + { + "epoch": 0.3748886910062333, + "ewc_loss": 0.033386632800102234, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012390539632178843, + "grad_norm": 4.637918949127197, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8491623401641846, + "num_tokens": 112564615.0, + "step": 2947 + }, + { + "epoch": 0.3750159012848238, + "ewc_loss": 0.03323878347873688, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012364762369543314, + "grad_norm": 4.716691970825195, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8427934050559998, + "num_tokens": 112598190.0, + "step": 2948 + }, + { + "epoch": 0.37514311156341434, + "ewc_loss": 0.03323609381914139, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012362068810034543, + "grad_norm": 4.573052883148193, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8528802394866943, + "num_tokens": 112632078.0, + "step": 2949 + }, + { + "epoch": 0.3752703218420048, + "ewc_loss": 0.033152900636196136, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012278876965865493, + "grad_norm": 4.652160167694092, + "learning_rate": 1e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8304134607315063, + "num_tokens": 112667848.0, + "step": 2950 + }, + { + "epoch": 0.37539753212059535, + "ewc_loss": 0.03338710591197014, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012391011114232242, + "grad_norm": 4.596518039703369, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8517917990684509, + "num_tokens": 112707037.0, + "step": 2951 + }, + { + "epoch": 0.3755247423991859, + "ewc_loss": 0.03318158909678459, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012307564611546695, + "grad_norm": 4.5305304527282715, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.86269611120224, + "num_tokens": 112751921.0, + "step": 2952 + }, + { + "epoch": 0.37565195267777635, + "ewc_loss": 0.03320697695016861, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012332951882854104, + "grad_norm": 4.562900543212891, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8582738637924194, + "num_tokens": 112793061.0, + "step": 2953 + }, + { + "epoch": 0.3757791629563669, + "ewc_loss": 0.03321895748376846, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.0001234493392985314, + "grad_norm": 4.590932369232178, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8624515533447266, + "num_tokens": 112833852.0, + "step": 2954 + }, + { + "epoch": 0.3759063732349574, + "ewc_loss": 0.0332285538315773, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.0001235453091794625, + "grad_norm": 4.631749629974365, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8492220640182495, + "num_tokens": 112866718.0, + "step": 2955 + }, + { + "epoch": 0.3760335835135479, + "ewc_loss": 0.033216238021850586, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012342212721705437, + "grad_norm": 4.538730621337891, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8549587726593018, + "num_tokens": 112909420.0, + "step": 2956 + }, + { + "epoch": 0.3761607937921384, + "ewc_loss": 0.03313985839486122, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.00012265834084246308, + "grad_norm": 4.644444942474365, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8556464314460754, + "num_tokens": 112944570.0, + "step": 2957 + }, + { + "epoch": 0.37628800407072893, + "ewc_loss": 0.03327679634094238, + "ewc_loss_diag": 2.086162567138672e-05, + "ewc_loss_parallel": 0.0001240277342731133, + "grad_norm": 4.513307571411133, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8655339479446411, + "num_tokens": 112984107.0, + "step": 2958 + }, + { + "epoch": 0.3764152143493194, + "ewc_loss": 0.03328271955251694, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.0001228662731591612, + "grad_norm": 4.54269552230835, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8765612840652466, + "num_tokens": 113023066.0, + "step": 2959 + }, + { + "epoch": 0.37654242462790993, + "ewc_loss": 0.033341530710458755, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012345437426120043, + "grad_norm": 4.755782604217529, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8524683117866516, + "num_tokens": 113060093.0, + "step": 2960 + }, + { + "epoch": 0.37666963490650046, + "ewc_loss": 0.03337908908724785, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012382995919324458, + "grad_norm": 4.629777431488037, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8454591035842896, + "num_tokens": 113095940.0, + "step": 2961 + }, + { + "epoch": 0.37679684518509093, + "ewc_loss": 0.03326772525906563, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.0001227163156727329, + "grad_norm": 4.563107490539551, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8647009134292603, + "num_tokens": 113130699.0, + "step": 2962 + }, + { + "epoch": 0.37692405546368146, + "ewc_loss": 0.03332722932100296, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.0001233113434864208, + "grad_norm": 4.592813491821289, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.853518545627594, + "num_tokens": 113169145.0, + "step": 2963 + }, + { + "epoch": 0.377051265742272, + "ewc_loss": 0.033338867127895355, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012342771515250206, + "grad_norm": 4.5785136222839355, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8672025203704834, + "num_tokens": 113201987.0, + "step": 2964 + }, + { + "epoch": 0.37717847602086246, + "ewc_loss": 0.03333646059036255, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012340366083662957, + "grad_norm": 4.571496963500977, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8609758615493774, + "num_tokens": 113239588.0, + "step": 2965 + }, + { + "epoch": 0.377305686299453, + "ewc_loss": 0.03335496783256531, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012358873209450394, + "grad_norm": 4.576385974884033, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.855789065361023, + "num_tokens": 113277075.0, + "step": 2966 + }, + { + "epoch": 0.3774328965780435, + "ewc_loss": 0.033364687114953995, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012368593888822943, + "grad_norm": 4.544732570648193, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.881300151348114, + "num_tokens": 113313708.0, + "step": 2967 + }, + { + "epoch": 0.377560106856634, + "ewc_loss": 0.03336825966835022, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.0001237216783920303, + "grad_norm": 4.665901184082031, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8370733261108398, + "num_tokens": 113344027.0, + "step": 2968 + }, + { + "epoch": 0.3776873171352245, + "ewc_loss": 0.03343035653233528, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012434263771865517, + "grad_norm": 4.487509250640869, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8577558994293213, + "num_tokens": 113388444.0, + "step": 2969 + }, + { + "epoch": 0.37781452741381505, + "ewc_loss": 0.033317822962999344, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012321729445829988, + "grad_norm": 13.893566131591797, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8494192957878113, + "num_tokens": 113428660.0, + "step": 2970 + }, + { + "epoch": 0.3779417376924055, + "ewc_loss": 0.041293833404779434, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00020175670215394348, + "grad_norm": 6.204843044281006, + "learning_rate": 1e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8329638242721558, + "num_tokens": 113469160.0, + "step": 2971 + }, + { + "epoch": 0.37806894797099605, + "ewc_loss": 0.03353597968816757, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.0001241781428689137, + "grad_norm": 4.0797438621521, + "learning_rate": 1e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8298320174217224, + "num_tokens": 113504291.0, + "step": 2972 + }, + { + "epoch": 0.3781961582495866, + "ewc_loss": 0.03535249084234238, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00014356395695358515, + "grad_norm": 5.485087871551514, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8558099269866943, + "num_tokens": 113548085.0, + "step": 2973 + }, + { + "epoch": 0.37832336852817705, + "ewc_loss": 0.03682122379541397, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.0001582513068569824, + "grad_norm": 5.041738033294678, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8591594696044922, + "num_tokens": 113581664.0, + "step": 2974 + }, + { + "epoch": 0.3784505788067676, + "ewc_loss": 0.034139484167099, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00013143388787284493, + "grad_norm": 4.715710163116455, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8665677905082703, + "num_tokens": 113618552.0, + "step": 2975 + }, + { + "epoch": 0.3785777890853581, + "ewc_loss": 0.0349641777575016, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00013846013462170959, + "grad_norm": 4.963381767272949, + "learning_rate": 1e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.833511471748352, + "num_tokens": 113653406.0, + "step": 2976 + }, + { + "epoch": 0.3787049993639486, + "ewc_loss": 0.034781165421009064, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00013663002755492926, + "grad_norm": 4.677285671234131, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8628512024879456, + "num_tokens": 113693803.0, + "step": 2977 + }, + { + "epoch": 0.3788322096425391, + "ewc_loss": 0.034194428473711014, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00013076265167910606, + "grad_norm": 4.706282138824463, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8682234287261963, + "num_tokens": 113734637.0, + "step": 2978 + }, + { + "epoch": 0.37895941992112964, + "ewc_loss": 0.034270551055669785, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00013274457887746394, + "grad_norm": 4.680830478668213, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8476548194885254, + "num_tokens": 113776579.0, + "step": 2979 + }, + { + "epoch": 0.37908663019972016, + "ewc_loss": 0.033966872841119766, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012970779789611697, + "grad_norm": 4.664035320281982, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8437790274620056, + "num_tokens": 113818857.0, + "step": 2980 + }, + { + "epoch": 0.37921384047831064, + "ewc_loss": 0.03396523743867874, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012969142699148506, + "grad_norm": 4.789355278015137, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.846198558807373, + "num_tokens": 113853248.0, + "step": 2981 + }, + { + "epoch": 0.37934105075690117, + "ewc_loss": 0.03399977087974548, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012881605653092265, + "grad_norm": 4.628498554229736, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8598962426185608, + "num_tokens": 113894206.0, + "step": 2982 + }, + { + "epoch": 0.3794682610354917, + "ewc_loss": 0.03368903324007988, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012692938616964966, + "grad_norm": 4.658255100250244, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8610561490058899, + "num_tokens": 113931309.0, + "step": 2983 + }, + { + "epoch": 0.37959547131408217, + "ewc_loss": 0.033723559230566025, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012727465946227312, + "grad_norm": 4.598133563995361, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8429169058799744, + "num_tokens": 113976559.0, + "step": 2984 + }, + { + "epoch": 0.3797226815926727, + "ewc_loss": 0.03363523632287979, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012639141641557217, + "grad_norm": 4.693097114562988, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8531382083892822, + "num_tokens": 114010814.0, + "step": 2985 + }, + { + "epoch": 0.3798498918712632, + "ewc_loss": 0.03376942127943039, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012651257566176355, + "grad_norm": 4.606175422668457, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8727407455444336, + "num_tokens": 114049389.0, + "step": 2986 + }, + { + "epoch": 0.3799771021498537, + "ewc_loss": 0.033556364476680756, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012560268805827945, + "grad_norm": 4.62785005569458, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8715728521347046, + "num_tokens": 114081463.0, + "step": 2987 + }, + { + "epoch": 0.3801043124284442, + "ewc_loss": 0.033586762845516205, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012590667756740004, + "grad_norm": 4.603269577026367, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8696675300598145, + "num_tokens": 114117175.0, + "step": 2988 + }, + { + "epoch": 0.38023152270703475, + "ewc_loss": 0.03355206176638603, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.0001255596725968644, + "grad_norm": 4.58688497543335, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8430715799331665, + "num_tokens": 114153764.0, + "step": 2989 + }, + { + "epoch": 0.3803587329856252, + "ewc_loss": 0.03364429250359535, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012526128557510674, + "grad_norm": 4.5570244789123535, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.853585958480835, + "num_tokens": 114191487.0, + "step": 2990 + }, + { + "epoch": 0.38048594326421575, + "ewc_loss": 0.033656537532806396, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012538373994175345, + "grad_norm": 4.544584274291992, + "learning_rate": 1e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.8296293020248413, + "num_tokens": 114233880.0, + "step": 2991 + }, + { + "epoch": 0.3806131535428063, + "ewc_loss": 0.033637795597314835, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012519631127361208, + "grad_norm": 4.625381946563721, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8517194390296936, + "num_tokens": 114269238.0, + "step": 2992 + }, + { + "epoch": 0.38074036382139675, + "ewc_loss": 0.03368697687983513, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012568813690450042, + "grad_norm": 4.5663628578186035, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8664212822914124, + "num_tokens": 114307645.0, + "step": 2993 + }, + { + "epoch": 0.3808675740999873, + "ewc_loss": 0.03362290561199188, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012504743062891066, + "grad_norm": 4.603607177734375, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8579641580581665, + "num_tokens": 114341800.0, + "step": 2994 + }, + { + "epoch": 0.3809947843785778, + "ewc_loss": 0.03365456685423851, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.000125364022096619, + "grad_norm": 4.588317394256592, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8628726601600647, + "num_tokens": 114378363.0, + "step": 2995 + }, + { + "epoch": 0.3811219946571683, + "ewc_loss": 0.03359326720237732, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.0001247510517714545, + "grad_norm": 4.607198238372803, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8429657220840454, + "num_tokens": 114414489.0, + "step": 2996 + }, + { + "epoch": 0.3812492049357588, + "ewc_loss": 0.0336422361433506, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012524072371888906, + "grad_norm": 4.579108238220215, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8491754531860352, + "num_tokens": 114449197.0, + "step": 2997 + }, + { + "epoch": 0.38137641521434934, + "ewc_loss": 0.03360838070511818, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012490215885918587, + "grad_norm": 4.633584499359131, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8519898653030396, + "num_tokens": 114478654.0, + "step": 2998 + }, + { + "epoch": 0.3815036254929398, + "ewc_loss": 0.033663906157016754, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012545741628855467, + "grad_norm": 4.500270843505859, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8626943826675415, + "num_tokens": 114521807.0, + "step": 2999 + }, + { + "epoch": 0.38163083577153034, + "ewc_loss": 0.033608146011829376, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012489983055274934, + "grad_norm": 4.525578498840332, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8678628206253052, + "num_tokens": 114558297.0, + "step": 3000 + }, + { + "epoch": 0.38175804605012087, + "ewc_loss": 0.03369937092065811, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.0001258120610145852, + "grad_norm": 4.511298656463623, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8606358766555786, + "num_tokens": 114599630.0, + "step": 3001 + }, + { + "epoch": 0.38188525632871134, + "ewc_loss": 0.03365905210375786, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012540887109935284, + "grad_norm": 4.5703349113464355, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8513846397399902, + "num_tokens": 114638697.0, + "step": 3002 + }, + { + "epoch": 0.38201246660730187, + "ewc_loss": 0.03369650989770889, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012578346650116146, + "grad_norm": 4.575366020202637, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8532803654670715, + "num_tokens": 114676106.0, + "step": 3003 + }, + { + "epoch": 0.3821396768858924, + "ewc_loss": 0.03368377313017845, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012565607903525233, + "grad_norm": 4.585971355438232, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8520016670227051, + "num_tokens": 114712026.0, + "step": 3004 + }, + { + "epoch": 0.38226688716448287, + "ewc_loss": 0.033710163086652756, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012591997801791877, + "grad_norm": 4.553427219390869, + "learning_rate": 1e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8331746459007263, + "num_tokens": 114752348.0, + "step": 3005 + }, + { + "epoch": 0.3823940974430734, + "ewc_loss": 0.03368929773569107, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012571133265737444, + "grad_norm": 4.534398555755615, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8580374717712402, + "num_tokens": 114789922.0, + "step": 3006 + }, + { + "epoch": 0.38252130772166393, + "ewc_loss": 0.03383398801088333, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012593755673151463, + "grad_norm": 4.492754936218262, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8493376970291138, + "num_tokens": 114838834.0, + "step": 3007 + }, + { + "epoch": 0.3826485180002544, + "ewc_loss": 0.03369836136698723, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.0001258019619854167, + "grad_norm": 4.609043121337891, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8334481716156006, + "num_tokens": 114873320.0, + "step": 3008 + }, + { + "epoch": 0.38277572827884493, + "ewc_loss": 0.033786557614803314, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.0001266839390154928, + "grad_norm": 4.557551383972168, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8652563095092773, + "num_tokens": 114908293.0, + "step": 3009 + }, + { + "epoch": 0.38290293855743546, + "ewc_loss": 0.03369764983654022, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012579484609887004, + "grad_norm": 4.563514232635498, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.844476044178009, + "num_tokens": 114950083.0, + "step": 3010 + }, + { + "epoch": 0.38303014883602593, + "ewc_loss": 0.033874817192554474, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001263458252651617, + "grad_norm": 4.6455464363098145, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8577407598495483, + "num_tokens": 114982353.0, + "step": 3011 + }, + { + "epoch": 0.38315735911461646, + "ewc_loss": 0.03375931829214096, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012641152716241777, + "grad_norm": 4.563875675201416, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8584744930267334, + "num_tokens": 115017668.0, + "step": 3012 + }, + { + "epoch": 0.383284569393207, + "ewc_loss": 0.03365965932607651, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012541496835183352, + "grad_norm": 4.564542293548584, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8649235367774963, + "num_tokens": 115055266.0, + "step": 3013 + }, + { + "epoch": 0.38341177967179746, + "ewc_loss": 0.033713653683662415, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012595490261446685, + "grad_norm": 4.561829090118408, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8490301370620728, + "num_tokens": 115089728.0, + "step": 3014 + }, + { + "epoch": 0.383538989950388, + "ewc_loss": 0.033581770956516266, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012585676449816674, + "grad_norm": 4.533425331115723, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.839005708694458, + "num_tokens": 115135192.0, + "step": 3015 + }, + { + "epoch": 0.3836662002289785, + "ewc_loss": 0.03370451554656029, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.0001258635165868327, + "grad_norm": 4.686187744140625, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8362143039703369, + "num_tokens": 115170350.0, + "step": 3016 + }, + { + "epoch": 0.383793410507569, + "ewc_loss": 0.03365885093808174, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.0001266275648958981, + "grad_norm": 4.5423784255981445, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8610090017318726, + "num_tokens": 115202724.0, + "step": 3017 + }, + { + "epoch": 0.3839206207861595, + "ewc_loss": 0.03364367038011551, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.000125255057355389, + "grad_norm": 4.555098533630371, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8540250658988953, + "num_tokens": 115239274.0, + "step": 3018 + }, + { + "epoch": 0.38404783106475004, + "ewc_loss": 0.03363091126084328, + "ewc_loss_diag": 2.09808349609375e-05, + "ewc_loss_parallel": 0.00012634816812351346, + "grad_norm": 4.543244361877441, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8624787330627441, + "num_tokens": 115278330.0, + "step": 3019 + }, + { + "epoch": 0.3841750413433405, + "ewc_loss": 0.03370517119765282, + "ewc_loss_diag": 2.110004425048828e-05, + "ewc_loss_parallel": 0.00012587006494868547, + "grad_norm": 4.527437686920166, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8505358099937439, + "num_tokens": 115321258.0, + "step": 3020 + }, + { + "epoch": 0.38430225162193105, + "ewc_loss": 0.03388074040412903, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012640505156014115, + "grad_norm": 4.573354244232178, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8556603193283081, + "num_tokens": 115362609.0, + "step": 3021 + }, + { + "epoch": 0.3844294619005216, + "ewc_loss": 0.03386398404836655, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012623751536011696, + "grad_norm": 4.564740180969238, + "learning_rate": 1e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8280428051948547, + "num_tokens": 115405814.0, + "step": 3022 + }, + { + "epoch": 0.38455667217911205, + "ewc_loss": 0.03389108553528786, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012650851567741483, + "grad_norm": 4.596242427825928, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8475184440612793, + "num_tokens": 115440856.0, + "step": 3023 + }, + { + "epoch": 0.3846838824577026, + "ewc_loss": 0.03389039635658264, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012650163262151182, + "grad_norm": 4.546873569488525, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8602044582366943, + "num_tokens": 115482582.0, + "step": 3024 + }, + { + "epoch": 0.3848110927362931, + "ewc_loss": 0.0338810458779335, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012640812201425433, + "grad_norm": 4.554042339324951, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8450390100479126, + "num_tokens": 115527268.0, + "step": 3025 + }, + { + "epoch": 0.3849383030148836, + "ewc_loss": 0.03390268236398697, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001266244798898697, + "grad_norm": 4.584089279174805, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8490855693817139, + "num_tokens": 115563757.0, + "step": 3026 + }, + { + "epoch": 0.3850655132934741, + "ewc_loss": 0.033910371363162994, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012670135765802115, + "grad_norm": 4.568501949310303, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8573270440101624, + "num_tokens": 115599629.0, + "step": 3027 + }, + { + "epoch": 0.38519272357206463, + "ewc_loss": 0.033871229737997055, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012630994024220854, + "grad_norm": 4.644659519195557, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8584372997283936, + "num_tokens": 115634261.0, + "step": 3028 + }, + { + "epoch": 0.3853199338506551, + "ewc_loss": 0.033921368420124054, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012681135558523238, + "grad_norm": 4.528831481933594, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8669632077217102, + "num_tokens": 115671150.0, + "step": 3029 + }, + { + "epoch": 0.38544714412924563, + "ewc_loss": 0.0338558703660965, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012615635932888836, + "grad_norm": 4.56757116317749, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8665800094604492, + "num_tokens": 115714328.0, + "step": 3030 + }, + { + "epoch": 0.38557435440783616, + "ewc_loss": 0.03392544761300087, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012685213005170226, + "grad_norm": 4.5579705238342285, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8696699738502502, + "num_tokens": 115747118.0, + "step": 3031 + }, + { + "epoch": 0.3857015646864267, + "ewc_loss": 0.033876463770866394, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001263622980332002, + "grad_norm": 4.707395553588867, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8344727158546448, + "num_tokens": 115790309.0, + "step": 3032 + }, + { + "epoch": 0.38582877496501716, + "ewc_loss": 0.03397567570209503, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001273544185096398, + "grad_norm": 4.643425464630127, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8673514723777771, + "num_tokens": 115825630.0, + "step": 3033 + }, + { + "epoch": 0.3859559852436077, + "ewc_loss": 0.03383040428161621, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012590170081239194, + "grad_norm": 4.62514066696167, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8615751266479492, + "num_tokens": 115858848.0, + "step": 3034 + }, + { + "epoch": 0.3860831955221982, + "ewc_loss": 0.033816151320934296, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001257591793546453, + "grad_norm": 4.526429653167725, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8626070022583008, + "num_tokens": 115900024.0, + "step": 3035 + }, + { + "epoch": 0.3862104058007887, + "ewc_loss": 0.03380180895328522, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012561575567815453, + "grad_norm": 4.615951061248779, + "learning_rate": 1e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8306231498718262, + "num_tokens": 115938899.0, + "step": 3036 + }, + { + "epoch": 0.3863376160793792, + "ewc_loss": 0.0338524766266346, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001261224242625758, + "grad_norm": 4.662193298339844, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8615256547927856, + "num_tokens": 115972199.0, + "step": 3037 + }, + { + "epoch": 0.38646482635796975, + "ewc_loss": 0.0338057316839695, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012565497308969498, + "grad_norm": 4.5611186027526855, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8639323711395264, + "num_tokens": 116011506.0, + "step": 3038 + }, + { + "epoch": 0.3865920366365602, + "ewc_loss": 0.033798836171627045, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012558601156342775, + "grad_norm": 4.622828960418701, + "learning_rate": 1e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8319357633590698, + "num_tokens": 116051428.0, + "step": 3039 + }, + { + "epoch": 0.38671924691515075, + "ewc_loss": 0.03383941575884819, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001259918208234012, + "grad_norm": 4.58624792098999, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8631531596183777, + "num_tokens": 116090307.0, + "step": 3040 + }, + { + "epoch": 0.3868464571937413, + "ewc_loss": 0.03377893567085266, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012538702867459506, + "grad_norm": 4.597073078155518, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.844798743724823, + "num_tokens": 116123381.0, + "step": 3041 + }, + { + "epoch": 0.38697366747233175, + "ewc_loss": 0.03384275734424591, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012602523202076554, + "grad_norm": 4.61175537109375, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.857079029083252, + "num_tokens": 116157755.0, + "step": 3042 + }, + { + "epoch": 0.3871008777509223, + "ewc_loss": 0.0338284932076931, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012588257959578186, + "grad_norm": 4.502134323120117, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8551512956619263, + "num_tokens": 116197371.0, + "step": 3043 + }, + { + "epoch": 0.3872280880295128, + "ewc_loss": 0.033796004951000214, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012555770808830857, + "grad_norm": 4.518876552581787, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8392893075942993, + "num_tokens": 116240544.0, + "step": 3044 + }, + { + "epoch": 0.3873552983081033, + "ewc_loss": 0.033854685723781586, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001261445286218077, + "grad_norm": 4.607876777648926, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8517859578132629, + "num_tokens": 116272079.0, + "step": 3045 + }, + { + "epoch": 0.3874825085866938, + "ewc_loss": 0.033916980028152466, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001267674524569884, + "grad_norm": 4.552005767822266, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8517007827758789, + "num_tokens": 116309641.0, + "step": 3046 + }, + { + "epoch": 0.38760971886528434, + "ewc_loss": 0.03387022763490677, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012629994307644665, + "grad_norm": 4.654878616333008, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8494068384170532, + "num_tokens": 116344470.0, + "step": 3047 + }, + { + "epoch": 0.3877369291438748, + "ewc_loss": 0.03395078331232071, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012710550799965858, + "grad_norm": 4.637629985809326, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8573211431503296, + "num_tokens": 116380177.0, + "step": 3048 + }, + { + "epoch": 0.38786413942246534, + "ewc_loss": 0.033883173018693924, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012642938236240298, + "grad_norm": 4.50729513168335, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8649028539657593, + "num_tokens": 116416482.0, + "step": 3049 + }, + { + "epoch": 0.38799134970105587, + "ewc_loss": 0.0338793508708477, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012639115448109806, + "grad_norm": 4.5813703536987305, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8414813280105591, + "num_tokens": 116456103.0, + "step": 3050 + }, + { + "epoch": 0.38811855997964634, + "ewc_loss": 0.03394521772861481, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012704983237199485, + "grad_norm": 4.553462028503418, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8602624535560608, + "num_tokens": 116496810.0, + "step": 3051 + }, + { + "epoch": 0.38824577025823687, + "ewc_loss": 0.033888280391693115, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012648047413676977, + "grad_norm": 4.608645915985107, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8497048020362854, + "num_tokens": 116532999.0, + "step": 3052 + }, + { + "epoch": 0.3883729805368274, + "ewc_loss": 0.033959150314331055, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012718916696030647, + "grad_norm": 4.606649398803711, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8622811436653137, + "num_tokens": 116565700.0, + "step": 3053 + }, + { + "epoch": 0.38850019081541787, + "ewc_loss": 0.033927761018276215, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001268752821488306, + "grad_norm": 4.55919075012207, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8585618138313293, + "num_tokens": 116602798.0, + "step": 3054 + }, + { + "epoch": 0.3886274010940084, + "ewc_loss": 0.03393403813242912, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001269380300072953, + "grad_norm": 4.608722686767578, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8666210174560547, + "num_tokens": 116638743.0, + "step": 3055 + }, + { + "epoch": 0.3887546113725989, + "ewc_loss": 0.033964045345783234, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012723809049930423, + "grad_norm": 4.601503849029541, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8670920133590698, + "num_tokens": 116672290.0, + "step": 3056 + }, + { + "epoch": 0.3888818216511894, + "ewc_loss": 0.033921387046575546, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012681153020821512, + "grad_norm": 4.52424955368042, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8555248379707336, + "num_tokens": 116712593.0, + "step": 3057 + }, + { + "epoch": 0.3890090319297799, + "ewc_loss": 0.03391849994659424, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012678267376031727, + "grad_norm": 4.6043877601623535, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8536431789398193, + "num_tokens": 116748663.0, + "step": 3058 + }, + { + "epoch": 0.38913624220837045, + "ewc_loss": 0.03396853804588318, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012728304136544466, + "grad_norm": 4.554379463195801, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8502281308174133, + "num_tokens": 116796018.0, + "step": 3059 + }, + { + "epoch": 0.3892634524869609, + "ewc_loss": 0.03391418978571892, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012673955643549562, + "grad_norm": 4.600246906280518, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8525433540344238, + "num_tokens": 116837287.0, + "step": 3060 + }, + { + "epoch": 0.38939066276555145, + "ewc_loss": 0.03395695611834526, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001271672226721421, + "grad_norm": 4.633275985717773, + "learning_rate": 1e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8383150696754456, + "num_tokens": 116872163.0, + "step": 3061 + }, + { + "epoch": 0.389517873044142, + "ewc_loss": 0.03394476696848869, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001270453358301893, + "grad_norm": 4.525472640991211, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8628445267677307, + "num_tokens": 116913445.0, + "step": 3062 + }, + { + "epoch": 0.38964508332273246, + "ewc_loss": 0.03391222655773163, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012671992590185255, + "grad_norm": 4.583191394805908, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8461121320724487, + "num_tokens": 116954469.0, + "step": 3063 + }, + { + "epoch": 0.389772293601323, + "ewc_loss": 0.03395944833755493, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012719215010292828, + "grad_norm": 4.5486860275268555, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.85442054271698, + "num_tokens": 116994339.0, + "step": 3064 + }, + { + "epoch": 0.3898995038799135, + "ewc_loss": 0.033936403691768646, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001269616768695414, + "grad_norm": 4.585371971130371, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8675189018249512, + "num_tokens": 117030867.0, + "step": 3065 + }, + { + "epoch": 0.390026714158504, + "ewc_loss": 0.03395412862300873, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012713894830085337, + "grad_norm": 4.612938404083252, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.863101065158844, + "num_tokens": 117068309.0, + "step": 3066 + }, + { + "epoch": 0.3901539244370945, + "ewc_loss": 0.03396026790142059, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012720032827928662, + "grad_norm": 4.612802505493164, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8487064838409424, + "num_tokens": 117105278.0, + "step": 3067 + }, + { + "epoch": 0.39028113471568504, + "ewc_loss": 0.033929530531167984, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012689294817391783, + "grad_norm": 4.616500377655029, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8469908237457275, + "num_tokens": 117143096.0, + "step": 3068 + }, + { + "epoch": 0.3904083449942755, + "ewc_loss": 0.03396555036306381, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012725318083539605, + "grad_norm": 4.654504776000977, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8551467657089233, + "num_tokens": 117179638.0, + "step": 3069 + }, + { + "epoch": 0.39053555527286604, + "ewc_loss": 0.033923570066690445, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012683335808105767, + "grad_norm": 4.632810592651367, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8677479028701782, + "num_tokens": 117211560.0, + "step": 3070 + }, + { + "epoch": 0.39066276555145657, + "ewc_loss": 0.03391742706298828, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012677194899879396, + "grad_norm": 4.591436386108398, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8465362787246704, + "num_tokens": 117250622.0, + "step": 3071 + }, + { + "epoch": 0.39078997583004704, + "ewc_loss": 0.03389818221330643, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001265794999198988, + "grad_norm": 4.564823627471924, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.875476598739624, + "num_tokens": 117287600.0, + "step": 3072 + }, + { + "epoch": 0.39091718610863757, + "ewc_loss": 0.03393193706870079, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012691701704170555, + "grad_norm": 4.595502853393555, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8391009569168091, + "num_tokens": 117328188.0, + "step": 3073 + }, + { + "epoch": 0.3910443963872281, + "ewc_loss": 0.033938560634851456, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001269832719117403, + "grad_norm": 4.580626964569092, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8627169132232666, + "num_tokens": 117367282.0, + "step": 3074 + }, + { + "epoch": 0.39117160666581857, + "ewc_loss": 0.03406679630279541, + "ewc_loss_diag": 2.1338462829589844e-05, + "ewc_loss_parallel": 0.00012704489927273244, + "grad_norm": 4.644846439361572, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.865383505821228, + "num_tokens": 117403376.0, + "step": 3075 + }, + { + "epoch": 0.3912988169444091, + "ewc_loss": 0.03408409655094147, + "ewc_loss_diag": 2.1338462829589844e-05, + "ewc_loss_parallel": 0.00012721793609671295, + "grad_norm": 4.6638569831848145, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8450279235839844, + "num_tokens": 117438661.0, + "step": 3076 + }, + { + "epoch": 0.39142602722299963, + "ewc_loss": 0.0340447723865509, + "ewc_loss_diag": 2.1338462829589844e-05, + "ewc_loss_parallel": 0.0001268246996914968, + "grad_norm": 4.630222797393799, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.859074056148529, + "num_tokens": 117477351.0, + "step": 3077 + }, + { + "epoch": 0.3915532375015901, + "ewc_loss": 0.03406058996915817, + "ewc_loss_diag": 2.1338462829589844e-05, + "ewc_loss_parallel": 0.00012698283535428345, + "grad_norm": 4.627795696258545, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8522895574569702, + "num_tokens": 117512773.0, + "step": 3078 + }, + { + "epoch": 0.39168044778018063, + "ewc_loss": 0.03417414426803589, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012689769209828228, + "grad_norm": 4.609401702880859, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8397074937820435, + "num_tokens": 117555370.0, + "step": 3079 + }, + { + "epoch": 0.39180765805877116, + "ewc_loss": 0.03403317183256149, + "ewc_loss_diag": 2.1338462829589844e-05, + "ewc_loss_parallel": 0.00012670867727138102, + "grad_norm": 5.1808881759643555, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.862276554107666, + "num_tokens": 117594208.0, + "step": 3080 + }, + { + "epoch": 0.3919348683373617, + "ewc_loss": 0.0343204066157341, + "ewc_loss_diag": 2.1338462829589844e-05, + "ewc_loss_parallel": 0.00012958102161064744, + "grad_norm": 4.524412155151367, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8585966229438782, + "num_tokens": 117632715.0, + "step": 3081 + }, + { + "epoch": 0.39206207861595216, + "ewc_loss": 0.03389652818441391, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.0001241215504705906, + "grad_norm": 4.623929977416992, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8495126962661743, + "num_tokens": 117669221.0, + "step": 3082 + }, + { + "epoch": 0.3921892888945427, + "ewc_loss": 0.03426463529467583, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012780260294675827, + "grad_norm": 4.541478633880615, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8489832878112793, + "num_tokens": 117709559.0, + "step": 3083 + }, + { + "epoch": 0.3923164991731332, + "ewc_loss": 0.03382231295108795, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012582079216372222, + "grad_norm": 4.598108291625977, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8619890213012695, + "num_tokens": 117745240.0, + "step": 3084 + }, + { + "epoch": 0.3924437094517237, + "ewc_loss": 0.034199267625808716, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012714894546661526, + "grad_norm": 4.54102087020874, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8572568893432617, + "num_tokens": 117787897.0, + "step": 3085 + }, + { + "epoch": 0.3925709197303142, + "ewc_loss": 0.033927131444215775, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012686896661762148, + "grad_norm": 4.639803409576416, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8509083390235901, + "num_tokens": 117826862.0, + "step": 3086 + }, + { + "epoch": 0.39269813000890474, + "ewc_loss": 0.033980417996644974, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012740182864945382, + "grad_norm": 4.57318115234375, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8504060506820679, + "num_tokens": 117866932.0, + "step": 3087 + }, + { + "epoch": 0.3928253402874952, + "ewc_loss": 0.0341932438313961, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.0001270886859856546, + "grad_norm": 4.600561618804932, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8556920886039734, + "num_tokens": 117905538.0, + "step": 3088 + }, + { + "epoch": 0.39295255056608575, + "ewc_loss": 0.03424452245235443, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012760145182255656, + "grad_norm": 4.617758274078369, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8429515361785889, + "num_tokens": 117945123.0, + "step": 3089 + }, + { + "epoch": 0.3930797608446763, + "ewc_loss": 0.03423064947128296, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012746274296659976, + "grad_norm": 4.57997465133667, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8568775653839111, + "num_tokens": 117988743.0, + "step": 3090 + }, + { + "epoch": 0.39320697112326675, + "ewc_loss": 0.03398469462990761, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012744459672831, + "grad_norm": 4.608188152313232, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.868644118309021, + "num_tokens": 118026078.0, + "step": 3091 + }, + { + "epoch": 0.3933341814018573, + "ewc_loss": 0.0342220664024353, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012737693032249808, + "grad_norm": 4.566714286804199, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.866175651550293, + "num_tokens": 118060105.0, + "step": 3092 + }, + { + "epoch": 0.3934613916804478, + "ewc_loss": 0.03424607962369919, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012761705147568136, + "grad_norm": 4.6385955810546875, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8833228349685669, + "num_tokens": 118089677.0, + "step": 3093 + }, + { + "epoch": 0.3935886019590383, + "ewc_loss": 0.03424975648522377, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012765380961354822, + "grad_norm": 4.5530242919921875, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8682935833930969, + "num_tokens": 118128449.0, + "step": 3094 + }, + { + "epoch": 0.3937158122376288, + "ewc_loss": 0.03422887623310089, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012744498963002115, + "grad_norm": 4.642068862915039, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8638185262680054, + "num_tokens": 118168411.0, + "step": 3095 + }, + { + "epoch": 0.39384302251621933, + "ewc_loss": 0.03429053723812103, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012806161248590797, + "grad_norm": 4.613402366638184, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8652381896972656, + "num_tokens": 118206097.0, + "step": 3096 + }, + { + "epoch": 0.3939702327948098, + "ewc_loss": 0.033967580646276474, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012727345165330917, + "grad_norm": 4.593672752380371, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8677302598953247, + "num_tokens": 118244429.0, + "step": 3097 + }, + { + "epoch": 0.39409744307340033, + "ewc_loss": 0.03399110585451126, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001275087270187214, + "grad_norm": 4.626121997833252, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8608113527297974, + "num_tokens": 118279125.0, + "step": 3098 + }, + { + "epoch": 0.39422465335199086, + "ewc_loss": 0.034015290439128876, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012775056529790163, + "grad_norm": 4.637829303741455, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8526943922042847, + "num_tokens": 118313156.0, + "step": 3099 + }, + { + "epoch": 0.39435186363058133, + "ewc_loss": 0.03423777222633362, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012753397459164262, + "grad_norm": 4.606266975402832, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8416153192520142, + "num_tokens": 118349868.0, + "step": 3100 + }, + { + "epoch": 0.39447907390917186, + "ewc_loss": 0.03400131314992905, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012761077960021794, + "grad_norm": 4.573005199432373, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8454370498657227, + "num_tokens": 118394998.0, + "step": 3101 + }, + { + "epoch": 0.3946062841877624, + "ewc_loss": 0.033984988927841187, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012744756531901658, + "grad_norm": 4.55620002746582, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8667538166046143, + "num_tokens": 118436002.0, + "step": 3102 + }, + { + "epoch": 0.39473349446635286, + "ewc_loss": 0.034061308950185776, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001282107550650835, + "grad_norm": 4.607669830322266, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8596837520599365, + "num_tokens": 118475094.0, + "step": 3103 + }, + { + "epoch": 0.3948607047449434, + "ewc_loss": 0.03405964747071266, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012819412222597748, + "grad_norm": 4.5950727462768555, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8530985116958618, + "num_tokens": 118514164.0, + "step": 3104 + }, + { + "epoch": 0.3949879150235339, + "ewc_loss": 0.034030236303806305, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012789999891538173, + "grad_norm": 4.609443187713623, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8506127595901489, + "num_tokens": 118554271.0, + "step": 3105 + }, + { + "epoch": 0.3951151253021244, + "ewc_loss": 0.03403495252132416, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012794719077646732, + "grad_norm": 4.563115119934082, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8554919958114624, + "num_tokens": 118597123.0, + "step": 3106 + }, + { + "epoch": 0.3952423355807149, + "ewc_loss": 0.03402174264192581, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012781505938619375, + "grad_norm": 4.6246018409729, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8498891592025757, + "num_tokens": 118636683.0, + "step": 3107 + }, + { + "epoch": 0.39536954585930545, + "ewc_loss": 0.03405512124300003, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012814886576961726, + "grad_norm": 4.708656311035156, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8592114448547363, + "num_tokens": 118665770.0, + "step": 3108 + }, + { + "epoch": 0.3954967561378959, + "ewc_loss": 0.034054361283779144, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012814125511795282, + "grad_norm": 4.583896160125732, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8659777641296387, + "num_tokens": 118701652.0, + "step": 3109 + }, + { + "epoch": 0.39562396641648645, + "ewc_loss": 0.033983513712882996, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012743278057314456, + "grad_norm": 4.594162464141846, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8618957996368408, + "num_tokens": 118741756.0, + "step": 3110 + }, + { + "epoch": 0.395751176695077, + "ewc_loss": 0.034059517085552216, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012819284165743738, + "grad_norm": 4.602583408355713, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.857251763343811, + "num_tokens": 118784320.0, + "step": 3111 + }, + { + "epoch": 0.39587838697366745, + "ewc_loss": 0.03400643542408943, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012766201689373702, + "grad_norm": 4.603682041168213, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8492231965065002, + "num_tokens": 118821456.0, + "step": 3112 + }, + { + "epoch": 0.396005597252258, + "ewc_loss": 0.03409278765320778, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012852554209530354, + "grad_norm": 4.623100757598877, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8543980121612549, + "num_tokens": 118861746.0, + "step": 3113 + }, + { + "epoch": 0.3961328075308485, + "ewc_loss": 0.03404519706964493, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001280496217077598, + "grad_norm": 4.604565620422363, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8606151342391968, + "num_tokens": 118900346.0, + "step": 3114 + }, + { + "epoch": 0.396260017809439, + "ewc_loss": 0.03408984839916229, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.00012849611812271178, + "grad_norm": 4.636460781097412, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8566400408744812, + "num_tokens": 118939707.0, + "step": 3115 + }, + { + "epoch": 0.3963872280880295, + "ewc_loss": 0.034080103039741516, + "ewc_loss_diag": 2.1219253540039062e-05, + "ewc_loss_parallel": 0.0001283987076021731, + "grad_norm": 4.627991676330566, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8556070327758789, + "num_tokens": 118971154.0, + "step": 3116 + }, + { + "epoch": 0.39651443836662004, + "ewc_loss": 0.03419888764619827, + "ewc_loss_diag": 2.1338462829589844e-05, + "ewc_loss_parallel": 0.0001283658348256722, + "grad_norm": 4.589628219604492, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8407580852508545, + "num_tokens": 119012344.0, + "step": 3117 + }, + { + "epoch": 0.3966416486452105, + "ewc_loss": 0.03435485437512398, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012870479258708656, + "grad_norm": 4.643982410430908, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8550685048103333, + "num_tokens": 119047677.0, + "step": 3118 + }, + { + "epoch": 0.39676885892380104, + "ewc_loss": 0.03436053916811943, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.0001287616469198838, + "grad_norm": 4.576642036437988, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8694517612457275, + "num_tokens": 119086357.0, + "step": 3119 + }, + { + "epoch": 0.39689606920239157, + "ewc_loss": 0.03430946171283722, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012825088924728334, + "grad_norm": 4.597135543823242, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8628333806991577, + "num_tokens": 119122854.0, + "step": 3120 + }, + { + "epoch": 0.39702327948098204, + "ewc_loss": 0.034352805465459824, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012868430349044502, + "grad_norm": 4.657890319824219, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8631857633590698, + "num_tokens": 119155449.0, + "step": 3121 + }, + { + "epoch": 0.39715048975957257, + "ewc_loss": 0.03435473144054413, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012870354112237692, + "grad_norm": 4.560486316680908, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8502582311630249, + "num_tokens": 119197287.0, + "step": 3122 + }, + { + "epoch": 0.3972777000381631, + "ewc_loss": 0.03434022516012192, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012855851673521101, + "grad_norm": 4.61993408203125, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8432925939559937, + "num_tokens": 119235749.0, + "step": 3123 + }, + { + "epoch": 0.39740491031675357, + "ewc_loss": 0.03436313569545746, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012878762208856642, + "grad_norm": 4.622694969177246, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8517638444900513, + "num_tokens": 119267282.0, + "step": 3124 + }, + { + "epoch": 0.3975321205953441, + "ewc_loss": 0.03438130021095276, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012896927364636213, + "grad_norm": 4.630302906036377, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.84759920835495, + "num_tokens": 119304387.0, + "step": 3125 + }, + { + "epoch": 0.3976593308739346, + "ewc_loss": 0.03438454866409302, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.0001290017389692366, + "grad_norm": 4.629239559173584, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.845458984375, + "num_tokens": 119341470.0, + "step": 3126 + }, + { + "epoch": 0.3977865411525251, + "ewc_loss": 0.034402329474687576, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012917954882141203, + "grad_norm": 4.591464519500732, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8445334434509277, + "num_tokens": 119381067.0, + "step": 3127 + }, + { + "epoch": 0.3979137514311156, + "ewc_loss": 0.034409306943416595, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012924932525493205, + "grad_norm": 4.637723922729492, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8556404709815979, + "num_tokens": 119415680.0, + "step": 3128 + }, + { + "epoch": 0.39804096170970615, + "ewc_loss": 0.03445039689540863, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012966024223715067, + "grad_norm": 4.622533321380615, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8458800315856934, + "num_tokens": 119454300.0, + "step": 3129 + }, + { + "epoch": 0.3981681719882967, + "ewc_loss": 0.03442547470331192, + "ewc_loss_diag": 2.1457672119140625e-05, + "ewc_loss_parallel": 0.00012941101158503443, + "grad_norm": 4.681143283843994, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8681133985519409, + "num_tokens": 119485599.0, + "step": 3130 + }, + { + "epoch": 0.39829538226688715, + "ewc_loss": 0.03459012880921364, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012983684428036213, + "grad_norm": 4.632654190063477, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8410574793815613, + "num_tokens": 119525820.0, + "step": 3131 + }, + { + "epoch": 0.3984225925454777, + "ewc_loss": 0.03453454375267029, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012928099022246897, + "grad_norm": 4.577980041503906, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8609335422515869, + "num_tokens": 119568216.0, + "step": 3132 + }, + { + "epoch": 0.3985498028240682, + "ewc_loss": 0.03466027230024338, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012931754463352263, + "grad_norm": 4.7460246086120605, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8423479795455933, + "num_tokens": 119602914.0, + "step": 3133 + }, + { + "epoch": 0.3986770131026587, + "ewc_loss": 0.03459995239973068, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012993505515623838, + "grad_norm": 4.6481781005859375, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8515611886978149, + "num_tokens": 119637024.0, + "step": 3134 + }, + { + "epoch": 0.3988042233812492, + "ewc_loss": 0.0344608910381794, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012854444503318518, + "grad_norm": 4.59062385559082, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8466517329216003, + "num_tokens": 119679265.0, + "step": 3135 + }, + { + "epoch": 0.39893143365983974, + "ewc_loss": 0.03453339636325836, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012926949420943856, + "grad_norm": 4.634377956390381, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.846014678478241, + "num_tokens": 119716254.0, + "step": 3136 + }, + { + "epoch": 0.3990586439384302, + "ewc_loss": 0.03454705327749252, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012940606393385679, + "grad_norm": 4.5423760414123535, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8488204479217529, + "num_tokens": 119761325.0, + "step": 3137 + }, + { + "epoch": 0.39918585421702074, + "ewc_loss": 0.03450309485197067, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012896647967863828, + "grad_norm": 4.653244972229004, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8779102563858032, + "num_tokens": 119795468.0, + "step": 3138 + }, + { + "epoch": 0.39931306449561127, + "ewc_loss": 0.03456004336476326, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012953596888110042, + "grad_norm": 4.619305610656738, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8457400798797607, + "num_tokens": 119833804.0, + "step": 3139 + }, + { + "epoch": 0.39944027477420174, + "ewc_loss": 0.0344993993639946, + "ewc_loss_diag": 2.1576881408691406e-05, + "ewc_loss_parallel": 0.00012892951781395823, + "grad_norm": 4.6646294593811035, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8507610559463501, + "num_tokens": 119870852.0, + "step": 3140 + }, + { + "epoch": 0.39956748505279227, + "ewc_loss": 0.03476057946681976, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012909994984511286, + "grad_norm": 4.579226970672607, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8582075238227844, + "num_tokens": 119909258.0, + "step": 3141 + }, + { + "epoch": 0.3996946953313828, + "ewc_loss": 0.03473636135458946, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012885776231996715, + "grad_norm": 4.690507888793945, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8430295586585999, + "num_tokens": 119940701.0, + "step": 3142 + }, + { + "epoch": 0.39982190560997327, + "ewc_loss": 0.03469540923833847, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012966894428245723, + "grad_norm": 4.642107963562012, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8549091815948486, + "num_tokens": 119976780.0, + "step": 3143 + }, + { + "epoch": 0.3999491158885638, + "ewc_loss": 0.0346197672188282, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012891250662505627, + "grad_norm": 4.600454807281494, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8472466468811035, + "num_tokens": 120017543.0, + "step": 3144 + }, + { + "epoch": 0.40007632616715433, + "ewc_loss": 0.034632228314876556, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.000129037129227072, + "grad_norm": 4.684290885925293, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8520355224609375, + "num_tokens": 120050036.0, + "step": 3145 + }, + { + "epoch": 0.4002035364457448, + "ewc_loss": 0.03467360511422157, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012945089838467538, + "grad_norm": 4.617066383361816, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8695976734161377, + "num_tokens": 120084582.0, + "step": 3146 + }, + { + "epoch": 0.40033074672433533, + "ewc_loss": 0.03460826724767685, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012879753194283694, + "grad_norm": 4.658993721008301, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8495745062828064, + "num_tokens": 120120218.0, + "step": 3147 + }, + { + "epoch": 0.40045795700292586, + "ewc_loss": 0.03465529531240463, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012926779163535684, + "grad_norm": 4.556955814361572, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8754068613052368, + "num_tokens": 120161650.0, + "step": 3148 + }, + { + "epoch": 0.40058516728151633, + "ewc_loss": 0.034577805548906326, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012849288759753108, + "grad_norm": 4.612034320831299, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8639590740203857, + "num_tokens": 120197409.0, + "step": 3149 + }, + { + "epoch": 0.40071237756010686, + "ewc_loss": 0.03466174006462097, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.0001293322566198185, + "grad_norm": 4.586756229400635, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8432454466819763, + "num_tokens": 120240700.0, + "step": 3150 + }, + { + "epoch": 0.4008395878386974, + "ewc_loss": 0.03462100028991699, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012892484664916992, + "grad_norm": 4.5743842124938965, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.844746470451355, + "num_tokens": 120286741.0, + "step": 3151 + }, + { + "epoch": 0.40096679811728786, + "ewc_loss": 0.034668829292058945, + "ewc_loss_diag": 2.1696090698242188e-05, + "ewc_loss_parallel": 0.00012940312444698066, + "grad_norm": 4.684549808502197, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8576417565345764, + "num_tokens": 120323236.0, + "step": 3152 + }, + { + "epoch": 0.4010940083958784, + "ewc_loss": 0.03480170667171478, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012951120152138174, + "grad_norm": 4.581779956817627, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8543704748153687, + "num_tokens": 120362185.0, + "step": 3153 + }, + { + "epoch": 0.4012212186744689, + "ewc_loss": 0.03486065939068794, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012888004130218178, + "grad_norm": 4.653533935546875, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8473469018936157, + "num_tokens": 120397524.0, + "step": 3154 + }, + { + "epoch": 0.4013484289530594, + "ewc_loss": 0.034833334386348724, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.0001298274873988703, + "grad_norm": 4.588321685791016, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8584825992584229, + "num_tokens": 120439723.0, + "step": 3155 + }, + { + "epoch": 0.4014756392316499, + "ewc_loss": 0.0347309485077858, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.0001288036146434024, + "grad_norm": 4.5895304679870605, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8594342470169067, + "num_tokens": 120483693.0, + "step": 3156 + }, + { + "epoch": 0.40160284951024044, + "ewc_loss": 0.0348241813480854, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012973594130016863, + "grad_norm": 4.630987167358398, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8650316596031189, + "num_tokens": 120520558.0, + "step": 3157 + }, + { + "epoch": 0.4017300597888309, + "ewc_loss": 0.03478600084781647, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.0001293541572522372, + "grad_norm": 4.628683567047119, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8649175763130188, + "num_tokens": 120557901.0, + "step": 3158 + }, + { + "epoch": 0.40185727006742145, + "ewc_loss": 0.03481115400791168, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012960570165887475, + "grad_norm": 4.639978408813477, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8615353107452393, + "num_tokens": 120593851.0, + "step": 3159 + }, + { + "epoch": 0.401984480346012, + "ewc_loss": 0.03479471057653427, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012944123591296375, + "grad_norm": 4.734271049499512, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8470231294631958, + "num_tokens": 120624533.0, + "step": 3160 + }, + { + "epoch": 0.40211169062460245, + "ewc_loss": 0.03483346849679947, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012982881162315607, + "grad_norm": 4.658473968505859, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8505319356918335, + "num_tokens": 120660820.0, + "step": 3161 + }, + { + "epoch": 0.402238900903193, + "ewc_loss": 0.0347859188914299, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012935332779306918, + "grad_norm": 4.539846420288086, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8672673106193542, + "num_tokens": 120701485.0, + "step": 3162 + }, + { + "epoch": 0.4023661111817835, + "ewc_loss": 0.03478964790701866, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.0001293906243517995, + "grad_norm": 4.697342395782471, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8635801076889038, + "num_tokens": 120736677.0, + "step": 3163 + }, + { + "epoch": 0.402493321460374, + "ewc_loss": 0.03487524017691612, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00013024653890170157, + "grad_norm": 4.622507095336914, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8529320359230042, + "num_tokens": 120773933.0, + "step": 3164 + }, + { + "epoch": 0.4026205317389645, + "ewc_loss": 0.034772105515003204, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012921521556563675, + "grad_norm": 4.677505016326904, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8609809279441833, + "num_tokens": 120804671.0, + "step": 3165 + }, + { + "epoch": 0.40274774201755503, + "ewc_loss": 0.034814320504665375, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.00012963735207449645, + "grad_norm": 4.627824306488037, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8592754602432251, + "num_tokens": 120837042.0, + "step": 3166 + }, + { + "epoch": 0.4028749522961455, + "ewc_loss": 0.03479363024234772, + "ewc_loss_diag": 2.181529998779297e-05, + "ewc_loss_parallel": 0.0001294304383918643, + "grad_norm": 4.604762077331543, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8516000509262085, + "num_tokens": 120875127.0, + "step": 3167 + }, + { + "epoch": 0.40300216257473603, + "ewc_loss": 0.035042181611061096, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00012947454524692148, + "grad_norm": 4.596010684967041, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8540435433387756, + "num_tokens": 120915937.0, + "step": 3168 + }, + { + "epoch": 0.40312937285332656, + "ewc_loss": 0.034945521503686905, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012972865079063922, + "grad_norm": 4.661825656890869, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8387284278869629, + "num_tokens": 120953334.0, + "step": 3169 + }, + { + "epoch": 0.40325658313191703, + "ewc_loss": 0.0350889228284359, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00012994196731597185, + "grad_norm": 4.63639497756958, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8557354211807251, + "num_tokens": 120986674.0, + "step": 3170 + }, + { + "epoch": 0.40338379341050756, + "ewc_loss": 0.03495638072490692, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012983722263015807, + "grad_norm": 4.616390228271484, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8609240055084229, + "num_tokens": 121026305.0, + "step": 3171 + }, + { + "epoch": 0.4035110036890981, + "ewc_loss": 0.03498511016368866, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001301245210925117, + "grad_norm": 4.640474796295166, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8689850568771362, + "num_tokens": 121061644.0, + "step": 3172 + }, + { + "epoch": 0.40363821396768856, + "ewc_loss": 0.034974876791238785, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013002220657654107, + "grad_norm": 4.6504316329956055, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8514941334724426, + "num_tokens": 121102860.0, + "step": 3173 + }, + { + "epoch": 0.4037654242462791, + "ewc_loss": 0.03498532995581627, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013012673298362643, + "grad_norm": 4.61484432220459, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8682360649108887, + "num_tokens": 121136566.0, + "step": 3174 + }, + { + "epoch": 0.4038926345248696, + "ewc_loss": 0.03493808954954147, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012965434871148318, + "grad_norm": 4.628634452819824, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8649691343307495, + "num_tokens": 121173759.0, + "step": 3175 + }, + { + "epoch": 0.4040198448034601, + "ewc_loss": 0.034969255328178406, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012996597797609866, + "grad_norm": 4.6342244148254395, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8551592230796814, + "num_tokens": 121219881.0, + "step": 3176 + }, + { + "epoch": 0.4041470550820506, + "ewc_loss": 0.03497014939785004, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012997494195587933, + "grad_norm": 4.616466045379639, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8539072275161743, + "num_tokens": 121258719.0, + "step": 3177 + }, + { + "epoch": 0.40427426536064115, + "ewc_loss": 0.0350804403424263, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001298571442021057, + "grad_norm": 4.612545967102051, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8328505158424377, + "num_tokens": 121298013.0, + "step": 3178 + }, + { + "epoch": 0.4044014756392316, + "ewc_loss": 0.03493206202983856, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012959404557477683, + "grad_norm": 4.6853837966918945, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8673933744430542, + "num_tokens": 121329615.0, + "step": 3179 + }, + { + "epoch": 0.40452868591782215, + "ewc_loss": 0.034967269748449326, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012994612916372716, + "grad_norm": 4.580533504486084, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8656691908836365, + "num_tokens": 121365199.0, + "step": 3180 + }, + { + "epoch": 0.4046558961964127, + "ewc_loss": 0.03504415974020958, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00012949433585163206, + "grad_norm": 4.653476238250732, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8507739305496216, + "num_tokens": 121402393.0, + "step": 3181 + }, + { + "epoch": 0.4047831064750032, + "ewc_loss": 0.0349876806139946, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013015023432672024, + "grad_norm": 4.600964546203613, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8561196327209473, + "num_tokens": 121439678.0, + "step": 3182 + }, + { + "epoch": 0.4049103167535937, + "ewc_loss": 0.034936487674713135, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012963832705281675, + "grad_norm": 4.63223123550415, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8503590226173401, + "num_tokens": 121480781.0, + "step": 3183 + }, + { + "epoch": 0.4050375270321842, + "ewc_loss": 0.034973032772541046, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001300037547480315, + "grad_norm": 4.578622817993164, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8619602918624878, + "num_tokens": 121517614.0, + "step": 3184 + }, + { + "epoch": 0.40516473731077474, + "ewc_loss": 0.03493405878543854, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00012961401080247015, + "grad_norm": 4.646708965301514, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.852963924407959, + "num_tokens": 121552225.0, + "step": 3185 + }, + { + "epoch": 0.4052919475893652, + "ewc_loss": 0.03499852120876312, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013025863154325634, + "grad_norm": 4.754318714141846, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.850304365158081, + "num_tokens": 121587177.0, + "step": 3186 + }, + { + "epoch": 0.40541915786795574, + "ewc_loss": 0.03498975187540054, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013017097080592066, + "grad_norm": 4.658856391906738, + "learning_rate": 1e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8369983434677124, + "num_tokens": 121619547.0, + "step": 3187 + }, + { + "epoch": 0.40554636814654627, + "ewc_loss": 0.03504886105656624, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001295413530897349, + "grad_norm": 4.591106414794922, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8619492053985596, + "num_tokens": 121657396.0, + "step": 3188 + }, + { + "epoch": 0.40567357842513674, + "ewc_loss": 0.03497232496738434, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001299966825172305, + "grad_norm": 4.640344619750977, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8393509984016418, + "num_tokens": 121691867.0, + "step": 3189 + }, + { + "epoch": 0.40580078870372727, + "ewc_loss": 0.035016778856515884, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013044121442362666, + "grad_norm": 4.709322929382324, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8496764898300171, + "num_tokens": 121723546.0, + "step": 3190 + }, + { + "epoch": 0.4059279989823178, + "ewc_loss": 0.035031743347644806, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013059085176791996, + "grad_norm": 4.56052303314209, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8550413846969604, + "num_tokens": 121764589.0, + "step": 3191 + }, + { + "epoch": 0.40605520926090827, + "ewc_loss": 0.034931592643260956, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001295893598580733, + "grad_norm": 4.733358383178711, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8440415859222412, + "num_tokens": 121803281.0, + "step": 3192 + }, + { + "epoch": 0.4061824195394988, + "ewc_loss": 0.035225823521614075, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001313109532929957, + "grad_norm": 4.658407688140869, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8428018093109131, + "num_tokens": 121841710.0, + "step": 3193 + }, + { + "epoch": 0.4063096298180893, + "ewc_loss": 0.034926868975162506, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001295421097893268, + "grad_norm": 4.618563175201416, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8537087440490723, + "num_tokens": 121881979.0, + "step": 3194 + }, + { + "epoch": 0.4064368400966798, + "ewc_loss": 0.03503482788801193, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001306217018282041, + "grad_norm": 4.618600368499756, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8598535060882568, + "num_tokens": 121921599.0, + "step": 3195 + }, + { + "epoch": 0.4065640503752703, + "ewc_loss": 0.03512221574783325, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001302748714806512, + "grad_norm": 4.594099521636963, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8642072677612305, + "num_tokens": 121960545.0, + "step": 3196 + }, + { + "epoch": 0.40669126065386085, + "ewc_loss": 0.0350172221660614, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001304456527577713, + "grad_norm": 4.61712646484375, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8365186452865601, + "num_tokens": 121999259.0, + "step": 3197 + }, + { + "epoch": 0.4068184709324513, + "ewc_loss": 0.03506913408637047, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013096477778162807, + "grad_norm": 4.599921703338623, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8681386709213257, + "num_tokens": 122039066.0, + "step": 3198 + }, + { + "epoch": 0.40694568121104185, + "ewc_loss": 0.03500087186694145, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013028214743826538, + "grad_norm": 4.582723140716553, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8633618354797363, + "num_tokens": 122085035.0, + "step": 3199 + }, + { + "epoch": 0.4070728914896324, + "ewc_loss": 0.03502986207604408, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013057206524536014, + "grad_norm": 4.590940952301025, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8689807653427124, + "num_tokens": 122128007.0, + "step": 3200 + }, + { + "epoch": 0.40720010176822286, + "ewc_loss": 0.0349980928003788, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013025436783209443, + "grad_norm": 4.628860950469971, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8531780242919922, + "num_tokens": 122169623.0, + "step": 3201 + }, + { + "epoch": 0.4073273120468134, + "ewc_loss": 0.03500831127166748, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001303565368289128, + "grad_norm": 4.71969747543335, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8507599830627441, + "num_tokens": 122201755.0, + "step": 3202 + }, + { + "epoch": 0.4074545223254039, + "ewc_loss": 0.03504571691155434, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.0001307306083617732, + "grad_norm": 4.69692850112915, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.865713357925415, + "num_tokens": 122233680.0, + "step": 3203 + }, + { + "epoch": 0.4075817326039944, + "ewc_loss": 0.03498627245426178, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013013617717660964, + "grad_norm": 4.871864318847656, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8524053692817688, + "num_tokens": 122265107.0, + "step": 3204 + }, + { + "epoch": 0.4077089428825849, + "ewc_loss": 0.035191673785448074, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013096947805024683, + "grad_norm": 4.5923004150390625, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8599047660827637, + "num_tokens": 122302750.0, + "step": 3205 + }, + { + "epoch": 0.40783615316117544, + "ewc_loss": 0.035010360181331635, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00012915633851662278, + "grad_norm": 4.58966064453125, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8464338779449463, + "num_tokens": 122346820.0, + "step": 3206 + }, + { + "epoch": 0.4079633634397659, + "ewc_loss": 0.035080451518297195, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001298572460655123, + "grad_norm": 4.622757911682129, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8590855002403259, + "num_tokens": 122383254.0, + "step": 3207 + }, + { + "epoch": 0.40809057371835644, + "ewc_loss": 0.03508324921131134, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00012988522939849645, + "grad_norm": 4.61422872543335, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8677752017974854, + "num_tokens": 122416693.0, + "step": 3208 + }, + { + "epoch": 0.40821778399694697, + "ewc_loss": 0.03510480001568794, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013010074326302856, + "grad_norm": 4.679830074310303, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8609431982040405, + "num_tokens": 122452403.0, + "step": 3209 + }, + { + "epoch": 0.40834499427553744, + "ewc_loss": 0.035123471170663834, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001302874443354085, + "grad_norm": 4.673756122589111, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8447533845901489, + "num_tokens": 122489685.0, + "step": 3210 + }, + { + "epoch": 0.40847220455412797, + "ewc_loss": 0.035121094435453415, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013026366650592536, + "grad_norm": 4.5990190505981445, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8379029631614685, + "num_tokens": 122529079.0, + "step": 3211 + }, + { + "epoch": 0.4085994148327185, + "ewc_loss": 0.03512588143348694, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001303115568589419, + "grad_norm": 4.631690502166748, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8486707210540771, + "num_tokens": 122569933.0, + "step": 3212 + }, + { + "epoch": 0.40872662511130897, + "ewc_loss": 0.03517560288310051, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013080876669846475, + "grad_norm": 4.64193058013916, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8354158401489258, + "num_tokens": 122607884.0, + "step": 3213 + }, + { + "epoch": 0.4088538353898995, + "ewc_loss": 0.03517618030309677, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013081451470497996, + "grad_norm": 4.583144187927246, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8563188910484314, + "num_tokens": 122651968.0, + "step": 3214 + }, + { + "epoch": 0.40898104566849003, + "ewc_loss": 0.03517371416091919, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013078987831249833, + "grad_norm": 4.73587703704834, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8585302233695984, + "num_tokens": 122688680.0, + "step": 3215 + }, + { + "epoch": 0.4091082559470805, + "ewc_loss": 0.03513244539499283, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013159790250938386, + "grad_norm": 4.578100204467773, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.860824704170227, + "num_tokens": 122728271.0, + "step": 3216 + }, + { + "epoch": 0.40923546622567103, + "ewc_loss": 0.03513605520129204, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013041328929830343, + "grad_norm": 4.722928047180176, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8536890745162964, + "num_tokens": 122759691.0, + "step": 3217 + }, + { + "epoch": 0.40936267650426156, + "ewc_loss": 0.035281114280223846, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013186388241592795, + "grad_norm": 4.691012859344482, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.853636622428894, + "num_tokens": 122800185.0, + "step": 3218 + }, + { + "epoch": 0.40948988678285203, + "ewc_loss": 0.035177137702703476, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013082411896903068, + "grad_norm": 4.624129295349121, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8646334409713745, + "num_tokens": 122839548.0, + "step": 3219 + }, + { + "epoch": 0.40961709706144256, + "ewc_loss": 0.03519037738442421, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013095649774186313, + "grad_norm": 4.658103942871094, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8416798710823059, + "num_tokens": 122879621.0, + "step": 3220 + }, + { + "epoch": 0.4097443073400331, + "ewc_loss": 0.03518465906381607, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013089932326693088, + "grad_norm": 4.701622009277344, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8503972291946411, + "num_tokens": 122921495.0, + "step": 3221 + }, + { + "epoch": 0.40987151761862356, + "ewc_loss": 0.03507431596517563, + "ewc_loss_diag": 2.193450927734375e-05, + "ewc_loss_parallel": 0.00013101661170367151, + "grad_norm": 4.654338836669922, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8621910214424133, + "num_tokens": 122957197.0, + "step": 3222 + }, + { + "epoch": 0.4099987278972141, + "ewc_loss": 0.03516802564263344, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013073298032395542, + "grad_norm": 4.661624908447266, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8373476266860962, + "num_tokens": 122999999.0, + "step": 3223 + }, + { + "epoch": 0.4101259381758046, + "ewc_loss": 0.035200126469135284, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013105398102197796, + "grad_norm": 4.6451005935668945, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.860140860080719, + "num_tokens": 123040402.0, + "step": 3224 + }, + { + "epoch": 0.4102531484543951, + "ewc_loss": 0.035203494131565094, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001310876541538164, + "grad_norm": 4.632957458496094, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8588622212409973, + "num_tokens": 123080086.0, + "step": 3225 + }, + { + "epoch": 0.4103803587329856, + "ewc_loss": 0.03522222116589546, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013127493730280548, + "grad_norm": 4.69230842590332, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8569907546043396, + "num_tokens": 123120058.0, + "step": 3226 + }, + { + "epoch": 0.41050756901157615, + "ewc_loss": 0.035225190222263336, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013130463776178658, + "grad_norm": 4.617877960205078, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.856374979019165, + "num_tokens": 123162003.0, + "step": 3227 + }, + { + "epoch": 0.4106347792901666, + "ewc_loss": 0.035162974148988724, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013068247062619776, + "grad_norm": 4.705148220062256, + "learning_rate": 1e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8232378363609314, + "num_tokens": 123202249.0, + "step": 3228 + }, + { + "epoch": 0.41076198956875715, + "ewc_loss": 0.03524784371256828, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013153116742614657, + "grad_norm": 4.692972660064697, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8496147394180298, + "num_tokens": 123237401.0, + "step": 3229 + }, + { + "epoch": 0.4108891998473477, + "ewc_loss": 0.03520883619785309, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013114111789036542, + "grad_norm": 4.736063480377197, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8713682889938354, + "num_tokens": 123269364.0, + "step": 3230 + }, + { + "epoch": 0.4110164101259382, + "ewc_loss": 0.03520843759179115, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013113711611367762, + "grad_norm": 4.669848442077637, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8585672378540039, + "num_tokens": 123307743.0, + "step": 3231 + }, + { + "epoch": 0.4111436204045287, + "ewc_loss": 0.035171397030353546, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013076668255962431, + "grad_norm": 4.688505172729492, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8604961633682251, + "num_tokens": 123340102.0, + "step": 3232 + }, + { + "epoch": 0.4112708306831192, + "ewc_loss": 0.03522435575723648, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013129631406627595, + "grad_norm": 4.705156326293945, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8507338166236877, + "num_tokens": 123374956.0, + "step": 3233 + }, + { + "epoch": 0.41139804096170973, + "ewc_loss": 0.035195231437683105, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001310050574829802, + "grad_norm": 4.673185348510742, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8651319742202759, + "num_tokens": 123418070.0, + "step": 3234 + }, + { + "epoch": 0.4115252512403002, + "ewc_loss": 0.03523176908493042, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013137042697053403, + "grad_norm": 4.727089881896973, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8531168103218079, + "num_tokens": 123453183.0, + "step": 3235 + }, + { + "epoch": 0.41165246151889073, + "ewc_loss": 0.03525166213512421, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001315693516517058, + "grad_norm": 4.63964319229126, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8561722040176392, + "num_tokens": 123491526.0, + "step": 3236 + }, + { + "epoch": 0.41177967179748126, + "ewc_loss": 0.03521697595715523, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013122249220032245, + "grad_norm": 4.643109321594238, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8573033809661865, + "num_tokens": 123529743.0, + "step": 3237 + }, + { + "epoch": 0.41190688207607173, + "ewc_loss": 0.035387683659791946, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013170886086300015, + "grad_norm": 4.6544976234436035, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8584653735160828, + "num_tokens": 123573402.0, + "step": 3238 + }, + { + "epoch": 0.41203409235466226, + "ewc_loss": 0.035427432507276535, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013210636097937822, + "grad_norm": 4.809025287628174, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.852868914604187, + "num_tokens": 123604167.0, + "step": 3239 + }, + { + "epoch": 0.4121613026332528, + "ewc_loss": 0.03549879789352417, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013282000145409256, + "grad_norm": 4.5570478439331055, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8522548675537109, + "num_tokens": 123648104.0, + "step": 3240 + }, + { + "epoch": 0.41228851291184326, + "ewc_loss": 0.03538622334599495, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.0001316942652920261, + "grad_norm": 4.677642822265625, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.84550940990448, + "num_tokens": 123689678.0, + "step": 3241 + }, + { + "epoch": 0.4124157231904338, + "ewc_loss": 0.03555319458246231, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.0001333639957010746, + "grad_norm": 4.632694721221924, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8634687662124634, + "num_tokens": 123729257.0, + "step": 3242 + }, + { + "epoch": 0.4125429334690243, + "ewc_loss": 0.03542684018611908, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013210043834988028, + "grad_norm": 4.669118404388428, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8565649390220642, + "num_tokens": 123772251.0, + "step": 3243 + }, + { + "epoch": 0.4126701437476148, + "ewc_loss": 0.03550192713737488, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.000132851317175664, + "grad_norm": 4.6911396980285645, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8677308559417725, + "num_tokens": 123810345.0, + "step": 3244 + }, + { + "epoch": 0.4127973540262053, + "ewc_loss": 0.03545263409614563, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.0001323583855992183, + "grad_norm": 4.6209893226623535, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8524057269096375, + "num_tokens": 123855276.0, + "step": 3245 + }, + { + "epoch": 0.41292456430479585, + "ewc_loss": 0.03545835614204407, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.0001324155746260658, + "grad_norm": 4.698785781860352, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.858340859413147, + "num_tokens": 123896613.0, + "step": 3246 + }, + { + "epoch": 0.4130517745833863, + "ewc_loss": 0.03547472879290581, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.0001325793273281306, + "grad_norm": 4.6937994956970215, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8395835757255554, + "num_tokens": 123940713.0, + "step": 3247 + }, + { + "epoch": 0.41317898486197685, + "ewc_loss": 0.03529030829668045, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.000131955835968256, + "grad_norm": 4.699542045593262, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.844497561454773, + "num_tokens": 123981485.0, + "step": 3248 + }, + { + "epoch": 0.4133061951405674, + "ewc_loss": 0.0352991446852684, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001320441806456074, + "grad_norm": 4.666497707366943, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8568781018257141, + "num_tokens": 124018159.0, + "step": 3249 + }, + { + "epoch": 0.41343340541915785, + "ewc_loss": 0.035284727811813354, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013190001482143998, + "grad_norm": 4.7231221199035645, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8652692437171936, + "num_tokens": 124049911.0, + "step": 3250 + }, + { + "epoch": 0.4135606156977484, + "ewc_loss": 0.03530995547771454, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013215228682383895, + "grad_norm": 4.671259880065918, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.87650465965271, + "num_tokens": 124084260.0, + "step": 3251 + }, + { + "epoch": 0.4136878259763389, + "ewc_loss": 0.03539878502488136, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.0001318198919761926, + "grad_norm": 4.700389385223389, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8585550785064697, + "num_tokens": 124122741.0, + "step": 3252 + }, + { + "epoch": 0.4138150362549294, + "ewc_loss": 0.03530113771557808, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013206410221755505, + "grad_norm": 4.645294189453125, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8523234128952026, + "num_tokens": 124162793.0, + "step": 3253 + }, + { + "epoch": 0.4139422465335199, + "ewc_loss": 0.03542255610227585, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013205758295953274, + "grad_norm": 4.676506519317627, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8480495810508728, + "num_tokens": 124206889.0, + "step": 3254 + }, + { + "epoch": 0.41406945681211044, + "ewc_loss": 0.035447414964437485, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013230617332737893, + "grad_norm": 4.705983638763428, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.866462230682373, + "num_tokens": 124244551.0, + "step": 3255 + }, + { + "epoch": 0.4141966670907009, + "ewc_loss": 0.035356175154447556, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013261449930723757, + "grad_norm": 4.663730621337891, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8582863807678223, + "num_tokens": 124290485.0, + "step": 3256 + }, + { + "epoch": 0.41432387736929144, + "ewc_loss": 0.035311099141836166, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013216373918112367, + "grad_norm": 4.7332072257995605, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8628639578819275, + "num_tokens": 124323763.0, + "step": 3257 + }, + { + "epoch": 0.41445108764788197, + "ewc_loss": 0.03550088405609131, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013284088345244527, + "grad_norm": 4.758329391479492, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8560314178466797, + "num_tokens": 124363318.0, + "step": 3258 + }, + { + "epoch": 0.41457829792647244, + "ewc_loss": 0.035451289266347885, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013234491052571684, + "grad_norm": 4.628072738647461, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8778706789016724, + "num_tokens": 124400845.0, + "step": 3259 + }, + { + "epoch": 0.41470550820506297, + "ewc_loss": 0.035448454320430756, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.0001323165779467672, + "grad_norm": 4.621227264404297, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8633409738540649, + "num_tokens": 124442546.0, + "step": 3260 + }, + { + "epoch": 0.4148327184836535, + "ewc_loss": 0.035359062254428864, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.0001326433412032202, + "grad_norm": 4.724651336669922, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8509964942932129, + "num_tokens": 124477737.0, + "step": 3261 + }, + { + "epoch": 0.41495992876224397, + "ewc_loss": 0.03549833595752716, + "ewc_loss_diag": 2.2172927856445312e-05, + "ewc_loss_parallel": 0.00013281537394504994, + "grad_norm": 4.692761421203613, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8437271118164062, + "num_tokens": 124518081.0, + "step": 3262 + }, + { + "epoch": 0.4150871390408345, + "ewc_loss": 0.035335320979356766, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013240595581009984, + "grad_norm": 4.670304775238037, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8458971381187439, + "num_tokens": 124556361.0, + "step": 3263 + }, + { + "epoch": 0.415214349319425, + "ewc_loss": 0.03560950607061386, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013270640920381993, + "grad_norm": 4.744558334350586, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8554095029830933, + "num_tokens": 124588490.0, + "step": 3264 + }, + { + "epoch": 0.4153415595980155, + "ewc_loss": 0.03565588593482971, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013317020784597844, + "grad_norm": 4.908480644226074, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8550529479980469, + "num_tokens": 124624656.0, + "step": 3265 + }, + { + "epoch": 0.415468769876606, + "ewc_loss": 0.0356670506298542, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013328183558769524, + "grad_norm": 4.715256214141846, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8414005041122437, + "num_tokens": 124660681.0, + "step": 3266 + }, + { + "epoch": 0.41559598015519655, + "ewc_loss": 0.035528358072042465, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013189490709919482, + "grad_norm": 4.679409503936768, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8565332889556885, + "num_tokens": 124703448.0, + "step": 3267 + }, + { + "epoch": 0.415723190433787, + "ewc_loss": 0.035341594368219376, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013246868911664933, + "grad_norm": 4.671436309814453, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8539862036705017, + "num_tokens": 124742078.0, + "step": 3268 + }, + { + "epoch": 0.41585040071237755, + "ewc_loss": 0.03555911034345627, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013220241817180067, + "grad_norm": 4.806919574737549, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8439674973487854, + "num_tokens": 124779894.0, + "step": 3269 + }, + { + "epoch": 0.4159776109909681, + "ewc_loss": 0.03564640134572983, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.0001330753293586895, + "grad_norm": 4.659778118133545, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8509311676025391, + "num_tokens": 124817918.0, + "step": 3270 + }, + { + "epoch": 0.41610482126955856, + "ewc_loss": 0.03558427467942238, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013245407899376005, + "grad_norm": 4.683629512786865, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8717048168182373, + "num_tokens": 124854839.0, + "step": 3271 + }, + { + "epoch": 0.4162320315481491, + "ewc_loss": 0.035371847450733185, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013277120888233185, + "grad_norm": 4.726455211639404, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8399162292480469, + "num_tokens": 124886779.0, + "step": 3272 + }, + { + "epoch": 0.4163592418267396, + "ewc_loss": 0.03538917377591133, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.000132944478536956, + "grad_norm": 4.6991119384765625, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8634403944015503, + "num_tokens": 124927919.0, + "step": 3273 + }, + { + "epoch": 0.4164864521053301, + "ewc_loss": 0.03535089269280434, + "ewc_loss_diag": 2.205371856689453e-05, + "ewc_loss_parallel": 0.00013256166130304337, + "grad_norm": 4.6707258224487305, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8534522652626038, + "num_tokens": 124966475.0, + "step": 3274 + }, + { + "epoch": 0.4166136623839206, + "ewc_loss": 0.035638850182294846, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013299983402248472, + "grad_norm": 4.966554641723633, + "learning_rate": 1e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8307230472564697, + "num_tokens": 125004110.0, + "step": 3275 + }, + { + "epoch": 0.41674087266251114, + "ewc_loss": 0.03572965785861015, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.00013390790263656527, + "grad_norm": 4.630305767059326, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8690855503082275, + "num_tokens": 125043164.0, + "step": 3276 + }, + { + "epoch": 0.4168680829411016, + "ewc_loss": 0.03573310375213623, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001315009722020477, + "grad_norm": 4.692469120025635, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8594574332237244, + "num_tokens": 125084255.0, + "step": 3277 + }, + { + "epoch": 0.41699529321969214, + "ewc_loss": 0.03565909340977669, + "ewc_loss_diag": 2.2292137145996094e-05, + "ewc_loss_parallel": 0.0001332022511633113, + "grad_norm": 4.637001991271973, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8681726455688477, + "num_tokens": 125124971.0, + "step": 3278 + }, + { + "epoch": 0.41712250349828267, + "ewc_loss": 0.035827189683914185, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001324417971773073, + "grad_norm": 4.74245023727417, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8524112701416016, + "num_tokens": 125161470.0, + "step": 3279 + }, + { + "epoch": 0.4172497137768732, + "ewc_loss": 0.03595753386616707, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001337452558800578, + "grad_norm": 4.70311975479126, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8394137620925903, + "num_tokens": 125199117.0, + "step": 3280 + }, + { + "epoch": 0.41737692405546367, + "ewc_loss": 0.03586389869451523, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013280891289468855, + "grad_norm": 4.7171854972839355, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8364340662956238, + "num_tokens": 125238136.0, + "step": 3281 + }, + { + "epoch": 0.4175041343340542, + "ewc_loss": 0.03594838082790375, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001336537388851866, + "grad_norm": 4.656289100646973, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8525104522705078, + "num_tokens": 125281041.0, + "step": 3282 + }, + { + "epoch": 0.41763134461264473, + "ewc_loss": 0.03590832278132439, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001332531392108649, + "grad_norm": 4.717962741851807, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8580223321914673, + "num_tokens": 125314739.0, + "step": 3283 + }, + { + "epoch": 0.4177585548912352, + "ewc_loss": 0.035962820053100586, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001337981375399977, + "grad_norm": 4.711791038513184, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8497684001922607, + "num_tokens": 125353129.0, + "step": 3284 + }, + { + "epoch": 0.41788576516982573, + "ewc_loss": 0.03591890260577202, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013335894618649036, + "grad_norm": 4.681218147277832, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.853495717048645, + "num_tokens": 125394092.0, + "step": 3285 + }, + { + "epoch": 0.41801297544841626, + "ewc_loss": 0.03591641038656235, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013333401875570416, + "grad_norm": 4.8463592529296875, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8500306010246277, + "num_tokens": 125438262.0, + "step": 3286 + }, + { + "epoch": 0.41814018572700673, + "ewc_loss": 0.0359921008348465, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013409093662630767, + "grad_norm": 4.642910957336426, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8541129231452942, + "num_tokens": 125477842.0, + "step": 3287 + }, + { + "epoch": 0.41826739600559726, + "ewc_loss": 0.03586326912045479, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013280261191539466, + "grad_norm": 4.68019962310791, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8669446706771851, + "num_tokens": 125517236.0, + "step": 3288 + }, + { + "epoch": 0.4183946062841878, + "ewc_loss": 0.035843025892972946, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.00013382088218349963, + "grad_norm": 4.685164928436279, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.848602294921875, + "num_tokens": 125561598.0, + "step": 3289 + }, + { + "epoch": 0.41852181656277826, + "ewc_loss": 0.035952840000391006, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001336983114015311, + "grad_norm": 4.7225775718688965, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8530685901641846, + "num_tokens": 125599104.0, + "step": 3290 + }, + { + "epoch": 0.4186490268413688, + "ewc_loss": 0.03594871982932091, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001336571149295196, + "grad_norm": 4.7373366355896, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8557660579681396, + "num_tokens": 125635286.0, + "step": 3291 + }, + { + "epoch": 0.4187762371199593, + "ewc_loss": 0.03582696244120598, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.00013366025814320892, + "grad_norm": 4.753952980041504, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8595658540725708, + "num_tokens": 125669534.0, + "step": 3292 + }, + { + "epoch": 0.4189034473985498, + "ewc_loss": 0.03582038730382919, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.00013359451259020716, + "grad_norm": 4.694770336151123, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8542170524597168, + "num_tokens": 125707966.0, + "step": 3293 + }, + { + "epoch": 0.4190306576771403, + "ewc_loss": 0.03579656034708023, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.00013335621042642742, + "grad_norm": 4.769674777984619, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8503578901290894, + "num_tokens": 125741448.0, + "step": 3294 + }, + { + "epoch": 0.41915786795573085, + "ewc_loss": 0.03582789748907089, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.00013366961502470076, + "grad_norm": 4.689082145690918, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8488219976425171, + "num_tokens": 125776887.0, + "step": 3295 + }, + { + "epoch": 0.4192850782343213, + "ewc_loss": 0.03578998148441315, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.00013329042121767998, + "grad_norm": 4.734238147735596, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8598659038543701, + "num_tokens": 125807899.0, + "step": 3296 + }, + { + "epoch": 0.41941228851291185, + "ewc_loss": 0.03584444522857666, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.0001338350703008473, + "grad_norm": 4.636678695678711, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8623909950256348, + "num_tokens": 125847711.0, + "step": 3297 + }, + { + "epoch": 0.4195394987915024, + "ewc_loss": 0.03582049161195755, + "ewc_loss_diag": 2.2411346435546875e-05, + "ewc_loss_parallel": 0.00013359554577618837, + "grad_norm": 4.736485958099365, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8679004311561584, + "num_tokens": 125883248.0, + "step": 3298 + }, + { + "epoch": 0.41966670907009285, + "ewc_loss": 0.03603101521730423, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013448008394334465, + "grad_norm": 4.6756672859191895, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8443352580070496, + "num_tokens": 125925771.0, + "step": 3299 + }, + { + "epoch": 0.4197939193486834, + "ewc_loss": 0.0359402671456337, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013357261195778847, + "grad_norm": 4.650325298309326, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8431825637817383, + "num_tokens": 125968815.0, + "step": 3300 + }, + { + "epoch": 0.4199211296272739, + "ewc_loss": 0.03599163889884949, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013408629456534982, + "grad_norm": 4.677694320678711, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8494036197662354, + "num_tokens": 126007761.0, + "step": 3301 + }, + { + "epoch": 0.4200483399058644, + "ewc_loss": 0.036001771688461304, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001341876486549154, + "grad_norm": 4.708432674407959, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8364763855934143, + "num_tokens": 126047872.0, + "step": 3302 + }, + { + "epoch": 0.4201755501844549, + "ewc_loss": 0.03603590652346611, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001345289929304272, + "grad_norm": 4.680235385894775, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.861510694026947, + "num_tokens": 126081273.0, + "step": 3303 + }, + { + "epoch": 0.42030276046304543, + "ewc_loss": 0.035978056490421295, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001339504960924387, + "grad_norm": 4.708993911743164, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8395580053329468, + "num_tokens": 126121590.0, + "step": 3304 + }, + { + "epoch": 0.4204299707416359, + "ewc_loss": 0.036013416945934296, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013430407852865756, + "grad_norm": 4.633935928344727, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.838223397731781, + "num_tokens": 126168396.0, + "step": 3305 + }, + { + "epoch": 0.42055718102022643, + "ewc_loss": 0.03597759082913399, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013394583947956562, + "grad_norm": 4.74569845199585, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.84977126121521, + "num_tokens": 126205220.0, + "step": 3306 + }, + { + "epoch": 0.42068439129881696, + "ewc_loss": 0.03604111820459366, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013458108878694475, + "grad_norm": 4.725762367248535, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8488487005233765, + "num_tokens": 126239131.0, + "step": 3307 + }, + { + "epoch": 0.42081160157740743, + "ewc_loss": 0.035981595516204834, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013398587179835886, + "grad_norm": 4.750810146331787, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8571834564208984, + "num_tokens": 126271931.0, + "step": 3308 + }, + { + "epoch": 0.42093881185599796, + "ewc_loss": 0.03599397838115692, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001341097231488675, + "grad_norm": 4.674961090087891, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8635470271110535, + "num_tokens": 126307730.0, + "step": 3309 + }, + { + "epoch": 0.4210660221345885, + "ewc_loss": 0.03593390807509422, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013350900553632528, + "grad_norm": 4.729452133178711, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.857428789138794, + "num_tokens": 126344528.0, + "step": 3310 + }, + { + "epoch": 0.42119323241317896, + "ewc_loss": 0.03602218255400658, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013439175381790847, + "grad_norm": 4.72132682800293, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8633487224578857, + "num_tokens": 126377260.0, + "step": 3311 + }, + { + "epoch": 0.4213204426917695, + "ewc_loss": 0.03596194088459015, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013378934818319976, + "grad_norm": 4.732723712921143, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8506903648376465, + "num_tokens": 126409723.0, + "step": 3312 + }, + { + "epoch": 0.42144765297036, + "ewc_loss": 0.036001965403556824, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.000134189598611556, + "grad_norm": 4.708090782165527, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8586113452911377, + "num_tokens": 126450929.0, + "step": 3313 + }, + { + "epoch": 0.4215748632489505, + "ewc_loss": 0.0360136479139328, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013430642138700932, + "grad_norm": 4.697159290313721, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8641403913497925, + "num_tokens": 126486878.0, + "step": 3314 + }, + { + "epoch": 0.421702073527541, + "ewc_loss": 0.035978808999061584, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013395800488069654, + "grad_norm": 4.892277240753174, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.843052864074707, + "num_tokens": 126522884.0, + "step": 3315 + }, + { + "epoch": 0.42182928380613155, + "ewc_loss": 0.03608861565589905, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001350560924038291, + "grad_norm": 4.707866668701172, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8560976982116699, + "num_tokens": 126560848.0, + "step": 3316 + }, + { + "epoch": 0.421956494084722, + "ewc_loss": 0.035928092896938324, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013345085608307272, + "grad_norm": 4.7484307289123535, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8574760556221008, + "num_tokens": 126592947.0, + "step": 3317 + }, + { + "epoch": 0.42208370436331255, + "ewc_loss": 0.03604446351528168, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013461455819197, + "grad_norm": 4.658453941345215, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.865382730960846, + "num_tokens": 126630631.0, + "step": 3318 + }, + { + "epoch": 0.4222109146419031, + "ewc_loss": 0.03596213459968567, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013379128358792514, + "grad_norm": 4.672384262084961, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8517980575561523, + "num_tokens": 126673395.0, + "step": 3319 + }, + { + "epoch": 0.42233812492049355, + "ewc_loss": 0.03605082631111145, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013467820826917887, + "grad_norm": 4.7898454666137695, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.871397078037262, + "num_tokens": 126711504.0, + "step": 3320 + }, + { + "epoch": 0.4224653351990841, + "ewc_loss": 0.036022745072841644, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013439738540910184, + "grad_norm": 4.598095893859863, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8680474758148193, + "num_tokens": 126753217.0, + "step": 3321 + }, + { + "epoch": 0.4225925454776746, + "ewc_loss": 0.035946719348430634, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013363712059799582, + "grad_norm": 4.683107852935791, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8608807325363159, + "num_tokens": 126792982.0, + "step": 3322 + }, + { + "epoch": 0.4227197557562651, + "ewc_loss": 0.036013875156641006, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001343086623819545, + "grad_norm": 4.777997016906738, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8493443131446838, + "num_tokens": 126823771.0, + "step": 3323 + }, + { + "epoch": 0.4228469660348556, + "ewc_loss": 0.03601893037557602, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013435923028737307, + "grad_norm": 4.657588481903076, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8589544296264648, + "num_tokens": 126867137.0, + "step": 3324 + }, + { + "epoch": 0.42297417631344614, + "ewc_loss": 0.035982683300971985, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013399674207903445, + "grad_norm": 4.695491790771484, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8612567782402039, + "num_tokens": 126902680.0, + "step": 3325 + }, + { + "epoch": 0.4231013865920366, + "ewc_loss": 0.036038681864738464, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013455675798468292, + "grad_norm": 4.726779460906982, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8678534626960754, + "num_tokens": 126941037.0, + "step": 3326 + }, + { + "epoch": 0.42322859687062714, + "ewc_loss": 0.036030713468790054, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013447705714497715, + "grad_norm": 4.690981864929199, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8594163656234741, + "num_tokens": 126980009.0, + "step": 3327 + }, + { + "epoch": 0.42335580714921767, + "ewc_loss": 0.03602416068315506, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013441152987070382, + "grad_norm": 4.743838787078857, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8680310845375061, + "num_tokens": 127020022.0, + "step": 3328 + }, + { + "epoch": 0.42348301742780814, + "ewc_loss": 0.03601127117872238, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013428264355752617, + "grad_norm": 4.693926811218262, + "learning_rate": 1e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8374342918395996, + "num_tokens": 127060500.0, + "step": 3329 + }, + { + "epoch": 0.42361022770639867, + "ewc_loss": 0.035978421568870544, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013395414862316102, + "grad_norm": 4.71903133392334, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8518363833427429, + "num_tokens": 127095886.0, + "step": 3330 + }, + { + "epoch": 0.4237374379849892, + "ewc_loss": 0.036144230514764786, + "ewc_loss_diag": 2.2649765014648438e-05, + "ewc_loss_parallel": 0.0001343915209872648, + "grad_norm": 4.735042572021484, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8458003997802734, + "num_tokens": 127134334.0, + "step": 3331 + }, + { + "epoch": 0.4238646482635797, + "ewc_loss": 0.036001961678266525, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013418952585197985, + "grad_norm": 4.761068344116211, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8481025099754333, + "num_tokens": 127174288.0, + "step": 3332 + }, + { + "epoch": 0.4239918585421702, + "ewc_loss": 0.036035940051078796, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013452932762447745, + "grad_norm": 4.70940637588501, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.853313684463501, + "num_tokens": 127210311.0, + "step": 3333 + }, + { + "epoch": 0.4241190688207607, + "ewc_loss": 0.0359780415892601, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013395035057328641, + "grad_norm": 4.666913032531738, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8768212795257568, + "num_tokens": 127252813.0, + "step": 3334 + }, + { + "epoch": 0.42424627909935125, + "ewc_loss": 0.03600138798356056, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013418379239737988, + "grad_norm": 4.7556843757629395, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8432351350784302, + "num_tokens": 127285847.0, + "step": 3335 + }, + { + "epoch": 0.4243734893779417, + "ewc_loss": 0.03605573996901512, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001347273209830746, + "grad_norm": 4.796810150146484, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8694595694541931, + "num_tokens": 127325360.0, + "step": 3336 + }, + { + "epoch": 0.42450069965653225, + "ewc_loss": 0.0360160693526268, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013433063577394933, + "grad_norm": 4.645304203033447, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8593966960906982, + "num_tokens": 127366446.0, + "step": 3337 + }, + { + "epoch": 0.4246279099351228, + "ewc_loss": 0.0359669029712677, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001338389702141285, + "grad_norm": 4.883981704711914, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8655661344528198, + "num_tokens": 127402608.0, + "step": 3338 + }, + { + "epoch": 0.42475512021371326, + "ewc_loss": 0.03612086549401283, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013537857739720494, + "grad_norm": 4.665964126586914, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8609657287597656, + "num_tokens": 127445355.0, + "step": 3339 + }, + { + "epoch": 0.4248823304923038, + "ewc_loss": 0.03594467416405678, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013361666060518473, + "grad_norm": 4.745265483856201, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8508027791976929, + "num_tokens": 127480523.0, + "step": 3340 + }, + { + "epoch": 0.4250095407708943, + "ewc_loss": 0.03609253466129303, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013509526615962386, + "grad_norm": 4.701453685760498, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8757119178771973, + "num_tokens": 127520787.0, + "step": 3341 + }, + { + "epoch": 0.4251367510494848, + "ewc_loss": 0.036020174622535706, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013437167217489332, + "grad_norm": 4.711109638214111, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8519628047943115, + "num_tokens": 127565576.0, + "step": 3342 + }, + { + "epoch": 0.4252639613280753, + "ewc_loss": 0.036058101803064346, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013475093874149024, + "grad_norm": 4.7474365234375, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8667416572570801, + "num_tokens": 127599144.0, + "step": 3343 + }, + { + "epoch": 0.42539117160666584, + "ewc_loss": 0.03605101630091667, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013468008546624333, + "grad_norm": 4.726445198059082, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.873942494392395, + "num_tokens": 127637361.0, + "step": 3344 + }, + { + "epoch": 0.4255183818852563, + "ewc_loss": 0.03603847324848175, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001345546479569748, + "grad_norm": 4.714563846588135, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8461583852767944, + "num_tokens": 127675139.0, + "step": 3345 + }, + { + "epoch": 0.42564559216384684, + "ewc_loss": 0.03602523356676102, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013442225463222712, + "grad_norm": 4.667430400848389, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8622359037399292, + "num_tokens": 127718822.0, + "step": 3346 + }, + { + "epoch": 0.42577280244243737, + "ewc_loss": 0.03602498769760132, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013441980991046876, + "grad_norm": 4.672403812408447, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8597699403762817, + "num_tokens": 127763332.0, + "step": 3347 + }, + { + "epoch": 0.42590001272102784, + "ewc_loss": 0.03603627160191536, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013453264546114951, + "grad_norm": 4.737316608428955, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8675982356071472, + "num_tokens": 127800346.0, + "step": 3348 + }, + { + "epoch": 0.42602722299961837, + "ewc_loss": 0.03603788837790489, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.000134548798087053, + "grad_norm": 4.705353260040283, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8535956144332886, + "num_tokens": 127845142.0, + "step": 3349 + }, + { + "epoch": 0.4261544332782089, + "ewc_loss": 0.036052852869033813, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001346984354313463, + "grad_norm": 4.7842302322387695, + "learning_rate": 1e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8393286466598511, + "num_tokens": 127881131.0, + "step": 3350 + }, + { + "epoch": 0.4262816435567994, + "ewc_loss": 0.03607528284192085, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013492275320459157, + "grad_norm": 4.747988224029541, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8703058958053589, + "num_tokens": 127917941.0, + "step": 3351 + }, + { + "epoch": 0.4264088538353899, + "ewc_loss": 0.03600604087114334, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013423031487036496, + "grad_norm": 4.705407619476318, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8432737588882446, + "num_tokens": 127963345.0, + "step": 3352 + }, + { + "epoch": 0.42653606411398043, + "ewc_loss": 0.036014921963214874, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013431913976091892, + "grad_norm": 4.697439193725586, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8652001023292542, + "num_tokens": 128004839.0, + "step": 3353 + }, + { + "epoch": 0.4266632743925709, + "ewc_loss": 0.036040470004081726, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013457464228849858, + "grad_norm": 4.742882251739502, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8500886559486389, + "num_tokens": 128042709.0, + "step": 3354 + }, + { + "epoch": 0.42679048467116143, + "ewc_loss": 0.03607148677110672, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013488477270584553, + "grad_norm": 4.773168563842773, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8409804105758667, + "num_tokens": 128085069.0, + "step": 3355 + }, + { + "epoch": 0.42691769494975196, + "ewc_loss": 0.036013826727867126, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013430816761683673, + "grad_norm": 4.700467109680176, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8510929942131042, + "num_tokens": 128124300.0, + "step": 3356 + }, + { + "epoch": 0.42704490522834243, + "ewc_loss": 0.03606776148080826, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013484753435477614, + "grad_norm": 4.7111992835998535, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8535611033439636, + "num_tokens": 128164410.0, + "step": 3357 + }, + { + "epoch": 0.42717211550693296, + "ewc_loss": 0.036090388894081116, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013507381663657725, + "grad_norm": 4.66918420791626, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8669520616531372, + "num_tokens": 128206524.0, + "step": 3358 + }, + { + "epoch": 0.4272993257855235, + "ewc_loss": 0.03609269857406616, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013509691052604467, + "grad_norm": 4.749383926391602, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8470079898834229, + "num_tokens": 128246387.0, + "step": 3359 + }, + { + "epoch": 0.42742653606411396, + "ewc_loss": 0.036137767136096954, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001355475833406672, + "grad_norm": 4.730478286743164, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8662478923797607, + "num_tokens": 128287794.0, + "step": 3360 + }, + { + "epoch": 0.4275537463427045, + "ewc_loss": 0.03610790893435478, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013524900714401156, + "grad_norm": 4.730356693267822, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8530134558677673, + "num_tokens": 128328467.0, + "step": 3361 + }, + { + "epoch": 0.427680956621295, + "ewc_loss": 0.03611738979816437, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001353438274236396, + "grad_norm": 4.779793739318848, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8654312491416931, + "num_tokens": 128364319.0, + "step": 3362 + }, + { + "epoch": 0.4278081668998855, + "ewc_loss": 0.036129020154476166, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013546014088205993, + "grad_norm": 4.6988606452941895, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8511354923248291, + "num_tokens": 128403892.0, + "step": 3363 + }, + { + "epoch": 0.427935377178476, + "ewc_loss": 0.03607172146439552, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013488714466802776, + "grad_norm": 4.743572235107422, + "learning_rate": 1e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.836422324180603, + "num_tokens": 128446760.0, + "step": 3364 + }, + { + "epoch": 0.42806258745706655, + "ewc_loss": 0.03612690791487694, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013543901150114834, + "grad_norm": 4.756826877593994, + "learning_rate": 1e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8313421010971069, + "num_tokens": 128485498.0, + "step": 3365 + }, + { + "epoch": 0.428189797735657, + "ewc_loss": 0.036118071526288986, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013535063771996647, + "grad_norm": 4.744764804840088, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.844458818435669, + "num_tokens": 128527871.0, + "step": 3366 + }, + { + "epoch": 0.42831700801424755, + "ewc_loss": 0.03612393140792847, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013540922373067588, + "grad_norm": 4.713104724884033, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8421642780303955, + "num_tokens": 128567792.0, + "step": 3367 + }, + { + "epoch": 0.4284442182928381, + "ewc_loss": 0.036157120019197464, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013574112381320447, + "grad_norm": 4.754662036895752, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8458385467529297, + "num_tokens": 128607217.0, + "step": 3368 + }, + { + "epoch": 0.42857142857142855, + "ewc_loss": 0.03617244213819504, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013589435548055917, + "grad_norm": 4.697439193725586, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8617486953735352, + "num_tokens": 128642787.0, + "step": 3369 + }, + { + "epoch": 0.4286986388500191, + "ewc_loss": 0.036182746291160583, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013599739759229124, + "grad_norm": 4.7367777824401855, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8666743040084839, + "num_tokens": 128680462.0, + "step": 3370 + }, + { + "epoch": 0.4288258491286096, + "ewc_loss": 0.03618478775024414, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.0001360177993774414, + "grad_norm": 4.786679744720459, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8514427542686462, + "num_tokens": 128715101.0, + "step": 3371 + }, + { + "epoch": 0.4289530594072001, + "ewc_loss": 0.0362204946577549, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013637487427331507, + "grad_norm": 4.746691703796387, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8478387594223022, + "num_tokens": 128749601.0, + "step": 3372 + }, + { + "epoch": 0.4290802696857906, + "ewc_loss": 0.03619154542684555, + "ewc_loss_diag": 2.2530555725097656e-05, + "ewc_loss_parallel": 0.00013608539302367717, + "grad_norm": 4.723088264465332, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8519703149795532, + "num_tokens": 128788744.0, + "step": 3373 + }, + { + "epoch": 0.42920747996438113, + "ewc_loss": 0.03643856197595596, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013611411850433797, + "grad_norm": 4.724177837371826, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8519703149795532, + "num_tokens": 128828102.0, + "step": 3374 + }, + { + "epoch": 0.4293346902429716, + "ewc_loss": 0.03648298978805542, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013655841758009046, + "grad_norm": 4.74897575378418, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8499394655227661, + "num_tokens": 128863610.0, + "step": 3375 + }, + { + "epoch": 0.42946190052156213, + "ewc_loss": 0.03645513206720352, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013627982116304338, + "grad_norm": 4.803811073303223, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.869819700717926, + "num_tokens": 128898836.0, + "step": 3376 + }, + { + "epoch": 0.42958911080015266, + "ewc_loss": 0.03644802048802376, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013620872050523758, + "grad_norm": 4.708678722381592, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8566750884056091, + "num_tokens": 128934821.0, + "step": 3377 + }, + { + "epoch": 0.42971632107874314, + "ewc_loss": 0.03643403202295303, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.0001360688329441473, + "grad_norm": 4.778313636779785, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8540571331977844, + "num_tokens": 128966914.0, + "step": 3378 + }, + { + "epoch": 0.42984353135733366, + "ewc_loss": 0.03648604080080986, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013658891839440912, + "grad_norm": 4.698261737823486, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8617502450942993, + "num_tokens": 129008305.0, + "step": 3379 + }, + { + "epoch": 0.4299707416359242, + "ewc_loss": 0.03643934056162834, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013612191833090037, + "grad_norm": 4.749721527099609, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8763185739517212, + "num_tokens": 129043331.0, + "step": 3380 + }, + { + "epoch": 0.4300979519145147, + "ewc_loss": 0.03647441416978836, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013647263403981924, + "grad_norm": 4.674828052520752, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8608862161636353, + "num_tokens": 129083513.0, + "step": 3381 + }, + { + "epoch": 0.4302251621931052, + "ewc_loss": 0.03645961731672287, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.00013632468471769243, + "grad_norm": 4.867300987243652, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8545477390289307, + "num_tokens": 129113463.0, + "step": 3382 + }, + { + "epoch": 0.4303523724716957, + "ewc_loss": 0.03656037896871567, + "ewc_loss_diag": 2.276897430419922e-05, + "ewc_loss_parallel": 0.000137332288431935, + "grad_norm": 4.720974445343018, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8321465253829956, + "num_tokens": 129153080.0, + "step": 3383 + }, + { + "epoch": 0.43047958275028625, + "ewc_loss": 0.03655102849006653, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013601807586383075, + "grad_norm": 4.716946125030518, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8524194955825806, + "num_tokens": 129193338.0, + "step": 3384 + }, + { + "epoch": 0.4306067930288767, + "ewc_loss": 0.036653872579336166, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001370465470245108, + "grad_norm": 4.696650981903076, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8459312915802002, + "num_tokens": 129234816.0, + "step": 3385 + }, + { + "epoch": 0.43073400330746725, + "ewc_loss": 0.03665052354335785, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013701306306757033, + "grad_norm": 4.6956658363342285, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8587245941162109, + "num_tokens": 129275228.0, + "step": 3386 + }, + { + "epoch": 0.4308612135860578, + "ewc_loss": 0.03665335476398468, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001370413665426895, + "grad_norm": 4.846129417419434, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8370447754859924, + "num_tokens": 129309747.0, + "step": 3387 + }, + { + "epoch": 0.43098842386464825, + "ewc_loss": 0.036689553409814835, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013740335998591036, + "grad_norm": 4.737316608428955, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8574618101119995, + "num_tokens": 129342580.0, + "step": 3388 + }, + { + "epoch": 0.4311156341432388, + "ewc_loss": 0.03661414235830307, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001366492360830307, + "grad_norm": 4.77412223815918, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8517259359359741, + "num_tokens": 129376599.0, + "step": 3389 + }, + { + "epoch": 0.4312428444218293, + "ewc_loss": 0.03670752793550491, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013758310524281114, + "grad_norm": 4.781284332275391, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8528356552124023, + "num_tokens": 129414783.0, + "step": 3390 + }, + { + "epoch": 0.4313700547004198, + "ewc_loss": 0.03664599359035492, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001369677484035492, + "grad_norm": 4.782475471496582, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8548130393028259, + "num_tokens": 129450287.0, + "step": 3391 + }, + { + "epoch": 0.4314972649790103, + "ewc_loss": 0.03665716201066971, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001370794343529269, + "grad_norm": 4.79805326461792, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8371833562850952, + "num_tokens": 129486733.0, + "step": 3392 + }, + { + "epoch": 0.43162447525760084, + "ewc_loss": 0.036662258207798004, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013713038060814142, + "grad_norm": 4.750434875488281, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8558825254440308, + "num_tokens": 129524392.0, + "step": 3393 + }, + { + "epoch": 0.4317516855361913, + "ewc_loss": 0.03664032369852066, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001369110686937347, + "grad_norm": 4.798492908477783, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8404372930526733, + "num_tokens": 129558452.0, + "step": 3394 + }, + { + "epoch": 0.43187889581478184, + "ewc_loss": 0.036642298102378845, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001369308156426996, + "grad_norm": 4.716836929321289, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8534639477729797, + "num_tokens": 129597076.0, + "step": 3395 + }, + { + "epoch": 0.43200610609337237, + "ewc_loss": 0.03659876063466072, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013649542233906686, + "grad_norm": 4.726850509643555, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8502553105354309, + "num_tokens": 129635861.0, + "step": 3396 + }, + { + "epoch": 0.43213331637196284, + "ewc_loss": 0.03666381537914276, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013714595115743577, + "grad_norm": 4.699315071105957, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8508799076080322, + "num_tokens": 129673760.0, + "step": 3397 + }, + { + "epoch": 0.43226052665055337, + "ewc_loss": 0.03662990778684616, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013680690608453006, + "grad_norm": 4.733442783355713, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8571919798851013, + "num_tokens": 129710730.0, + "step": 3398 + }, + { + "epoch": 0.4323877369291439, + "ewc_loss": 0.03671286255121231, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013763642346020788, + "grad_norm": 4.742886066436768, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8527061939239502, + "num_tokens": 129748724.0, + "step": 3399 + }, + { + "epoch": 0.43251494720773437, + "ewc_loss": 0.03667410463094711, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.000137248876853846, + "grad_norm": 4.696539402008057, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8670205473899841, + "num_tokens": 129787478.0, + "step": 3400 + }, + { + "epoch": 0.4326421574863249, + "ewc_loss": 0.03667648136615753, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013727264013141394, + "grad_norm": 4.714407920837402, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8654332160949707, + "num_tokens": 129829852.0, + "step": 3401 + }, + { + "epoch": 0.4327693677649154, + "ewc_loss": 0.036677874624729156, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013728656631428748, + "grad_norm": 4.739677429199219, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8538755178451538, + "num_tokens": 129869279.0, + "step": 3402 + }, + { + "epoch": 0.4328965780435059, + "ewc_loss": 0.03667319566011429, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013723976735491306, + "grad_norm": 4.784278392791748, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8494836688041687, + "num_tokens": 129903347.0, + "step": 3403 + }, + { + "epoch": 0.4330237883220964, + "ewc_loss": 0.03667745739221573, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013728238991461694, + "grad_norm": 4.699065208435059, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.865599513053894, + "num_tokens": 129940028.0, + "step": 3404 + }, + { + "epoch": 0.43315099860068695, + "ewc_loss": 0.03665310889482498, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001370389072690159, + "grad_norm": 4.746776103973389, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.863906741142273, + "num_tokens": 129976438.0, + "step": 3405 + }, + { + "epoch": 0.4332782088792774, + "ewc_loss": 0.036712396889925, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013763178139925003, + "grad_norm": 4.71232795715332, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8596895337104797, + "num_tokens": 130015698.0, + "step": 3406 + }, + { + "epoch": 0.43340541915786795, + "ewc_loss": 0.036647625267505646, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001369840611005202, + "grad_norm": 4.701710224151611, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8706948757171631, + "num_tokens": 130056000.0, + "step": 3407 + }, + { + "epoch": 0.4335326294364585, + "ewc_loss": 0.03668150305747986, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001373228442389518, + "grad_norm": 4.728583335876465, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8530139923095703, + "num_tokens": 130096990.0, + "step": 3408 + }, + { + "epoch": 0.43365983971504896, + "ewc_loss": 0.03670428320765495, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013755065447185189, + "grad_norm": 4.762558460235596, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8648483753204346, + "num_tokens": 130128556.0, + "step": 3409 + }, + { + "epoch": 0.4337870499936395, + "ewc_loss": 0.03669627755880356, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001374705898342654, + "grad_norm": 4.721419334411621, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8618959784507751, + "num_tokens": 130170607.0, + "step": 3410 + }, + { + "epoch": 0.43391426027223, + "ewc_loss": 0.036665286868810654, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013716067769564688, + "grad_norm": 4.824120044708252, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8521321415901184, + "num_tokens": 130212511.0, + "step": 3411 + }, + { + "epoch": 0.4340414705508205, + "ewc_loss": 0.03670554608106613, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013756327098235488, + "grad_norm": 4.780065059661865, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8492588996887207, + "num_tokens": 130249246.0, + "step": 3412 + }, + { + "epoch": 0.434168680829411, + "ewc_loss": 0.036607857793569565, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013658638636115938, + "grad_norm": 4.681299209594727, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8758294582366943, + "num_tokens": 130291250.0, + "step": 3413 + }, + { + "epoch": 0.43429589110800154, + "ewc_loss": 0.03658275306224823, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001363353367196396, + "grad_norm": 4.772039413452148, + "learning_rate": 1e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8356295824050903, + "num_tokens": 130327094.0, + "step": 3414 + }, + { + "epoch": 0.434423101386592, + "ewc_loss": 0.036661386489868164, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013712167856283486, + "grad_norm": 4.7742462158203125, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8474791049957275, + "num_tokens": 130366482.0, + "step": 3415 + }, + { + "epoch": 0.43455031166518254, + "ewc_loss": 0.036589279770851135, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013640063116326928, + "grad_norm": 4.70604133605957, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8556117415428162, + "num_tokens": 130407416.0, + "step": 3416 + }, + { + "epoch": 0.43467752194377307, + "ewc_loss": 0.03661616891622543, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013666952145285904, + "grad_norm": 4.724389553070068, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8613103032112122, + "num_tokens": 130447125.0, + "step": 3417 + }, + { + "epoch": 0.43480473222236354, + "ewc_loss": 0.03660827875137329, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013659057731274515, + "grad_norm": 4.772496223449707, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8623051643371582, + "num_tokens": 130480955.0, + "step": 3418 + }, + { + "epoch": 0.43493194250095407, + "ewc_loss": 0.03666883707046509, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013719616981688887, + "grad_norm": 4.804825305938721, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8680809736251831, + "num_tokens": 130511771.0, + "step": 3419 + }, + { + "epoch": 0.4350591527795446, + "ewc_loss": 0.03661535680294037, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013666135782841593, + "grad_norm": 4.7208943367004395, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8536809682846069, + "num_tokens": 130550080.0, + "step": 3420 + }, + { + "epoch": 0.4351863630581351, + "ewc_loss": 0.03662506863474846, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001367585064144805, + "grad_norm": 4.730772972106934, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8442456126213074, + "num_tokens": 130588109.0, + "step": 3421 + }, + { + "epoch": 0.4353135733367256, + "ewc_loss": 0.03664622828364372, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.0001369701058138162, + "grad_norm": 4.784800052642822, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8420014977455139, + "num_tokens": 130626832.0, + "step": 3422 + }, + { + "epoch": 0.43544078361531613, + "ewc_loss": 0.03666376322507858, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013714542728848755, + "grad_norm": 4.727084159851074, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8594586253166199, + "num_tokens": 130662202.0, + "step": 3423 + }, + { + "epoch": 0.4355679938939066, + "ewc_loss": 0.03665188327431679, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013702665455639362, + "grad_norm": 4.754114627838135, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.851686954498291, + "num_tokens": 130700747.0, + "step": 3424 + }, + { + "epoch": 0.43569520417249713, + "ewc_loss": 0.03665778785943985, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013708567712455988, + "grad_norm": 4.713992118835449, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8537902235984802, + "num_tokens": 130742849.0, + "step": 3425 + }, + { + "epoch": 0.43582241445108766, + "ewc_loss": 0.036681875586509705, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013732655497733504, + "grad_norm": 4.715254783630371, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8687622547149658, + "num_tokens": 130780156.0, + "step": 3426 + }, + { + "epoch": 0.43594962472967813, + "ewc_loss": 0.03667636960744858, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013727150508202612, + "grad_norm": 4.706639766693115, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.855248212814331, + "num_tokens": 130825824.0, + "step": 3427 + }, + { + "epoch": 0.43607683500826866, + "ewc_loss": 0.036691755056381226, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013742537703365088, + "grad_norm": 4.872343063354492, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8633812665939331, + "num_tokens": 130863603.0, + "step": 3428 + }, + { + "epoch": 0.4362040452868592, + "ewc_loss": 0.03673955798149109, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001379033928969875, + "grad_norm": 4.747454643249512, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8575488328933716, + "num_tokens": 130906689.0, + "step": 3429 + }, + { + "epoch": 0.4363312555654497, + "ewc_loss": 0.036597125232219696, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013647906598635018, + "grad_norm": 4.772472858428955, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8535799980163574, + "num_tokens": 130947785.0, + "step": 3430 + }, + { + "epoch": 0.4364584658440402, + "ewc_loss": 0.036689043045043945, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013739823771174997, + "grad_norm": 4.748591899871826, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8656375408172607, + "num_tokens": 130985489.0, + "step": 3431 + }, + { + "epoch": 0.4365856761226307, + "ewc_loss": 0.03658264875411987, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013633428898174316, + "grad_norm": 4.667750835418701, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8652838468551636, + "num_tokens": 131025274.0, + "step": 3432 + }, + { + "epoch": 0.43671288640122125, + "ewc_loss": 0.036606062203645706, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013656842929776758, + "grad_norm": 4.72222900390625, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8543935418128967, + "num_tokens": 131060132.0, + "step": 3433 + }, + { + "epoch": 0.4368400966798117, + "ewc_loss": 0.03667488694190979, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013725669123232365, + "grad_norm": 4.730299949645996, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.844497799873352, + "num_tokens": 131105109.0, + "step": 3434 + }, + { + "epoch": 0.43696730695840225, + "ewc_loss": 0.036673642694950104, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013724423479288816, + "grad_norm": 4.775670051574707, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8714616894721985, + "num_tokens": 131142094.0, + "step": 3435 + }, + { + "epoch": 0.4370945172369928, + "ewc_loss": 0.03669457882642746, + "ewc_loss_diag": 2.288818359375e-05, + "ewc_loss_parallel": 0.00013745362230110914, + "grad_norm": 4.792278289794922, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8548982739448547, + "num_tokens": 131174648.0, + "step": 3436 + }, + { + "epoch": 0.43722172751558325, + "ewc_loss": 0.03667691722512245, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.000137276976602152, + "grad_norm": 4.720491886138916, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8708530068397522, + "num_tokens": 131213447.0, + "step": 3437 + }, + { + "epoch": 0.4373489377941738, + "ewc_loss": 0.0366457924246788, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001369657547911629, + "grad_norm": 4.8644609451293945, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8516173958778381, + "num_tokens": 131249622.0, + "step": 3438 + }, + { + "epoch": 0.4374761480727643, + "ewc_loss": 0.036721959710121155, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013772741658613086, + "grad_norm": 4.716605186462402, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8711376190185547, + "num_tokens": 131285883.0, + "step": 3439 + }, + { + "epoch": 0.4376033583513548, + "ewc_loss": 0.03659861162304878, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013649393804371357, + "grad_norm": 4.757751941680908, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8591718077659607, + "num_tokens": 131322774.0, + "step": 3440 + }, + { + "epoch": 0.4377305686299453, + "ewc_loss": 0.03671606630086899, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001376684958813712, + "grad_norm": 4.702037811279297, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8685288429260254, + "num_tokens": 131364513.0, + "step": 3441 + }, + { + "epoch": 0.43785777890853583, + "ewc_loss": 0.0366625040769577, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013713283988181502, + "grad_norm": 4.743974208831787, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8474456071853638, + "num_tokens": 131402299.0, + "step": 3442 + }, + { + "epoch": 0.4379849891871263, + "ewc_loss": 0.036774132400751114, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001382491464028135, + "grad_norm": 4.723980903625488, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.863620400428772, + "num_tokens": 131447830.0, + "step": 3443 + }, + { + "epoch": 0.43811219946571683, + "ewc_loss": 0.03670470789074898, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013755490363109857, + "grad_norm": 4.834615707397461, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8487755060195923, + "num_tokens": 131481058.0, + "step": 3444 + }, + { + "epoch": 0.43823940974430736, + "ewc_loss": 0.03679754585027695, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013848325761500746, + "grad_norm": 4.773408889770508, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8524612188339233, + "num_tokens": 131518376.0, + "step": 3445 + }, + { + "epoch": 0.43836662002289783, + "ewc_loss": 0.0367182195186615, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001376899890601635, + "grad_norm": 4.812516212463379, + "learning_rate": 1e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8424832820892334, + "num_tokens": 131553702.0, + "step": 3446 + }, + { + "epoch": 0.43849383030148836, + "ewc_loss": 0.036773934960365295, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013824716734234244, + "grad_norm": 4.786194801330566, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8647115230560303, + "num_tokens": 131587026.0, + "step": 3447 + }, + { + "epoch": 0.4386210405800789, + "ewc_loss": 0.03674595430493355, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013796734856441617, + "grad_norm": 4.768896579742432, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8623403310775757, + "num_tokens": 131621145.0, + "step": 3448 + }, + { + "epoch": 0.43874825085866936, + "ewc_loss": 0.036896757781505585, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.0001382546906825155, + "grad_norm": 4.7209272384643555, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8632638454437256, + "num_tokens": 131661618.0, + "step": 3449 + }, + { + "epoch": 0.4388754611372599, + "ewc_loss": 0.036740075796842575, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013790855882689357, + "grad_norm": 4.790042877197266, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8512983322143555, + "num_tokens": 131693840.0, + "step": 3450 + }, + { + "epoch": 0.4390026714158504, + "ewc_loss": 0.03680996224284172, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013860744365956634, + "grad_norm": 4.74951696395874, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8596752882003784, + "num_tokens": 131730856.0, + "step": 3451 + }, + { + "epoch": 0.4391298816944409, + "ewc_loss": 0.03677017241716385, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013820952153764665, + "grad_norm": 4.805149078369141, + "learning_rate": 1e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8324610590934753, + "num_tokens": 131769172.0, + "step": 3452 + }, + { + "epoch": 0.4392570919730314, + "ewc_loss": 0.03686009347438812, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013910874258726835, + "grad_norm": 4.7491044998168945, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.85909104347229, + "num_tokens": 131810481.0, + "step": 3453 + }, + { + "epoch": 0.43938430225162195, + "ewc_loss": 0.036742083728313446, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013792864046990871, + "grad_norm": 4.7647175788879395, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8578688502311707, + "num_tokens": 131850742.0, + "step": 3454 + }, + { + "epoch": 0.4395115125302124, + "ewc_loss": 0.03681733086705208, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001386811345582828, + "grad_norm": 4.785123348236084, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8551527261734009, + "num_tokens": 131886675.0, + "step": 3455 + }, + { + "epoch": 0.43963872280880295, + "ewc_loss": 0.036829788237810135, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001388056989526376, + "grad_norm": 4.758841514587402, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8630044460296631, + "num_tokens": 131926831.0, + "step": 3456 + }, + { + "epoch": 0.4397659330873935, + "ewc_loss": 0.036801110953092575, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013851890980731696, + "grad_norm": 4.812261581420898, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.855402946472168, + "num_tokens": 131962265.0, + "step": 3457 + }, + { + "epoch": 0.43989314336598395, + "ewc_loss": 0.03685180842876434, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001390259130857885, + "grad_norm": 4.796864986419678, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8514295220375061, + "num_tokens": 131999906.0, + "step": 3458 + }, + { + "epoch": 0.4400203536445745, + "ewc_loss": 0.036780521273612976, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013831304386258125, + "grad_norm": 4.795787811279297, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.871793806552887, + "num_tokens": 132031330.0, + "step": 3459 + }, + { + "epoch": 0.440147563923165, + "ewc_loss": 0.03684689849615097, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013897681492380798, + "grad_norm": 4.866365432739258, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8547568917274475, + "num_tokens": 132065049.0, + "step": 3460 + }, + { + "epoch": 0.4402747742017555, + "ewc_loss": 0.03684994950890541, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001390073011862114, + "grad_norm": 4.741048336029053, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8610590100288391, + "num_tokens": 132100993.0, + "step": 3461 + }, + { + "epoch": 0.440401984480346, + "ewc_loss": 0.03678835928440094, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.000138391405926086, + "grad_norm": 4.917842388153076, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.854766845703125, + "num_tokens": 132138169.0, + "step": 3462 + }, + { + "epoch": 0.44052919475893654, + "ewc_loss": 0.03687670826911926, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013927491090726107, + "grad_norm": 4.804690361022949, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8634394407272339, + "num_tokens": 132170498.0, + "step": 3463 + }, + { + "epoch": 0.440656405037527, + "ewc_loss": 0.03676065430045128, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.00013811435201205313, + "grad_norm": 4.75062370300293, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8380277752876282, + "num_tokens": 132213583.0, + "step": 3464 + }, + { + "epoch": 0.44078361531611754, + "ewc_loss": 0.03691824898123741, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013846959336660802, + "grad_norm": 4.756311416625977, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.852996826171875, + "num_tokens": 132255427.0, + "step": 3465 + }, + { + "epoch": 0.44091082559470807, + "ewc_loss": 0.03690659627318382, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013835306162945926, + "grad_norm": 4.760505199432373, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8551068305969238, + "num_tokens": 132295795.0, + "step": 3466 + }, + { + "epoch": 0.44103803587329854, + "ewc_loss": 0.03690802678465843, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013836736616212875, + "grad_norm": 4.778151512145996, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8399032354354858, + "num_tokens": 132336881.0, + "step": 3467 + }, + { + "epoch": 0.44116524615188907, + "ewc_loss": 0.03693138435482979, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.0001386009535053745, + "grad_norm": 4.778204441070557, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8564304113388062, + "num_tokens": 132375136.0, + "step": 3468 + }, + { + "epoch": 0.4412924564304796, + "ewc_loss": 0.03677995130419731, + "ewc_loss_diag": 2.300739288330078e-05, + "ewc_loss_parallel": 0.0001383073249598965, + "grad_norm": 4.826218128204346, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8541644811630249, + "num_tokens": 132412893.0, + "step": 3469 + }, + { + "epoch": 0.44141966670907007, + "ewc_loss": 0.036917731165885925, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013846444198861718, + "grad_norm": 4.921947956085205, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8508513569831848, + "num_tokens": 132444360.0, + "step": 3470 + }, + { + "epoch": 0.4415468769876606, + "ewc_loss": 0.03694280609488487, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013871517148800194, + "grad_norm": 4.775613307952881, + "learning_rate": 1e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.8316231369972229, + "num_tokens": 132485022.0, + "step": 3471 + }, + { + "epoch": 0.4416740872662511, + "ewc_loss": 0.036854758858680725, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013783469330519438, + "grad_norm": 4.824055194854736, + "learning_rate": 1e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8421876430511475, + "num_tokens": 132522800.0, + "step": 3472 + }, + { + "epoch": 0.4418012975448416, + "ewc_loss": 0.03690995275974274, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013838661834597588, + "grad_norm": 4.768736362457275, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8584295511245728, + "num_tokens": 132560527.0, + "step": 3473 + }, + { + "epoch": 0.4419285078234321, + "ewc_loss": 0.036879755556583405, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013808465155307204, + "grad_norm": 4.944977283477783, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8537431359291077, + "num_tokens": 132601075.0, + "step": 3474 + }, + { + "epoch": 0.44205571810202265, + "ewc_loss": 0.036913078278303146, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.00013841790496371686, + "grad_norm": 4.781179428100586, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8462927937507629, + "num_tokens": 132639851.0, + "step": 3475 + }, + { + "epoch": 0.4421829283806131, + "ewc_loss": 0.03677935153245926, + "ewc_loss_diag": 2.3126602172851562e-05, + "ewc_loss_parallel": 0.0001370806130580604, + "grad_norm": 4.770888328552246, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8580933809280396, + "num_tokens": 132679849.0, + "step": 3476 + }, + { + "epoch": 0.44231013865920366, + "ewc_loss": 0.03697682172060013, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.000137834605993703, + "grad_norm": 4.719295024871826, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8609627485275269, + "num_tokens": 132721020.0, + "step": 3477 + }, + { + "epoch": 0.4424373489377942, + "ewc_loss": 0.03699159622192383, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013798238069284707, + "grad_norm": 4.853503704071045, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.843826413154602, + "num_tokens": 132760933.0, + "step": 3478 + }, + { + "epoch": 0.44256455921638466, + "ewc_loss": 0.03705480694770813, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013861445768270642, + "grad_norm": 4.748047828674316, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8659698963165283, + "num_tokens": 132800627.0, + "step": 3479 + }, + { + "epoch": 0.4426917694949752, + "ewc_loss": 0.0369725227355957, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001377916196361184, + "grad_norm": 4.835720539093018, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8551660776138306, + "num_tokens": 132833232.0, + "step": 3480 + }, + { + "epoch": 0.4428189797735657, + "ewc_loss": 0.03710003197193146, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013906671665608883, + "grad_norm": 4.836761951446533, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8534950017929077, + "num_tokens": 132874670.0, + "step": 3481 + }, + { + "epoch": 0.44294619005215624, + "ewc_loss": 0.037024952471256256, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013831595424562693, + "grad_norm": 4.952852249145508, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8577538728713989, + "num_tokens": 132916187.0, + "step": 3482 + }, + { + "epoch": 0.4430734003307467, + "ewc_loss": 0.03704039007425308, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013847032096236944, + "grad_norm": 4.73992919921875, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.858977198600769, + "num_tokens": 132956948.0, + "step": 3483 + }, + { + "epoch": 0.44320061060933724, + "ewc_loss": 0.03694503754377365, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013751679216511548, + "grad_norm": 4.8071393966674805, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8613043427467346, + "num_tokens": 132995213.0, + "step": 3484 + }, + { + "epoch": 0.44332782088792777, + "ewc_loss": 0.03705277293920517, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001385941286571324, + "grad_norm": 4.746512413024902, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8531182408332825, + "num_tokens": 133038850.0, + "step": 3485 + }, + { + "epoch": 0.44345503116651824, + "ewc_loss": 0.0370003804564476, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013807023060508072, + "grad_norm": 4.83453369140625, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8578606843948364, + "num_tokens": 133080766.0, + "step": 3486 + }, + { + "epoch": 0.44358224144510877, + "ewc_loss": 0.03707680106163025, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001388344244332984, + "grad_norm": 4.779711723327637, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8518367409706116, + "num_tokens": 133120845.0, + "step": 3487 + }, + { + "epoch": 0.4437094517236993, + "ewc_loss": 0.03702889010310173, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013835530262440443, + "grad_norm": 4.808264255523682, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8533874750137329, + "num_tokens": 133166011.0, + "step": 3488 + }, + { + "epoch": 0.4438366620022898, + "ewc_loss": 0.037050481885671616, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013857122394256294, + "grad_norm": 4.768223285675049, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8540133237838745, + "num_tokens": 133203507.0, + "step": 3489 + }, + { + "epoch": 0.4439638722808803, + "ewc_loss": 0.03701963275671005, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013826272333972156, + "grad_norm": 4.861219882965088, + "learning_rate": 1e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8326336145401001, + "num_tokens": 133240554.0, + "step": 3490 + }, + { + "epoch": 0.44409108255947083, + "ewc_loss": 0.03703761845827103, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001384425995638594, + "grad_norm": 4.838342666625977, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.83902907371521, + "num_tokens": 133274987.0, + "step": 3491 + }, + { + "epoch": 0.4442182928380613, + "ewc_loss": 0.03703663498163223, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001384327479172498, + "grad_norm": 4.837133884429932, + "learning_rate": 1e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8275359869003296, + "num_tokens": 133319287.0, + "step": 3492 + }, + { + "epoch": 0.44434550311665183, + "ewc_loss": 0.037015970796346664, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.000138226110721007, + "grad_norm": 4.7863688468933105, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8603315949440002, + "num_tokens": 133353670.0, + "step": 3493 + }, + { + "epoch": 0.44447271339524236, + "ewc_loss": 0.0370246097445488, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013831251999363303, + "grad_norm": 4.779521942138672, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8568863868713379, + "num_tokens": 133391950.0, + "step": 3494 + }, + { + "epoch": 0.44459992367383283, + "ewc_loss": 0.03708631172776222, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013892952119931579, + "grad_norm": 4.819051265716553, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8614566326141357, + "num_tokens": 133424758.0, + "step": 3495 + }, + { + "epoch": 0.44472713395242336, + "ewc_loss": 0.03708463907241821, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013891278649680316, + "grad_norm": 4.744017601013184, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8841134309768677, + "num_tokens": 133459869.0, + "step": 3496 + }, + { + "epoch": 0.4448543442310139, + "ewc_loss": 0.03707050904631615, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013877148739993572, + "grad_norm": 4.878601551055908, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8544520735740662, + "num_tokens": 133494598.0, + "step": 3497 + }, + { + "epoch": 0.44498155450960436, + "ewc_loss": 0.037111423909664154, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013918062904849648, + "grad_norm": 4.726611137390137, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8641690611839294, + "num_tokens": 133536403.0, + "step": 3498 + }, + { + "epoch": 0.4451087647881949, + "ewc_loss": 0.03705110773444176, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013857748126611114, + "grad_norm": 4.869487285614014, + "learning_rate": 1e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8350125551223755, + "num_tokens": 133567319.0, + "step": 3499 + }, + { + "epoch": 0.4452359750667854, + "ewc_loss": 0.03718901053071022, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013995650806464255, + "grad_norm": 4.778711795806885, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.847959041595459, + "num_tokens": 133604829.0, + "step": 3500 + }, + { + "epoch": 0.4453631853453759, + "ewc_loss": 0.03710826486349106, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013914906594436616, + "grad_norm": 4.841062068939209, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8559712171554565, + "num_tokens": 133638373.0, + "step": 3501 + }, + { + "epoch": 0.4454903956239664, + "ewc_loss": 0.037149712443351746, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013956354814581573, + "grad_norm": 4.762954235076904, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8661823272705078, + "num_tokens": 133674110.0, + "step": 3502 + }, + { + "epoch": 0.44561760590255695, + "ewc_loss": 0.03711318224668503, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013919822231400758, + "grad_norm": 4.776076316833496, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8572031259536743, + "num_tokens": 133711005.0, + "step": 3503 + }, + { + "epoch": 0.4457448161811474, + "ewc_loss": 0.03713101148605347, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.000139376541483216, + "grad_norm": 4.74535608291626, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8683561086654663, + "num_tokens": 133746258.0, + "step": 3504 + }, + { + "epoch": 0.44587202645973795, + "ewc_loss": 0.03716616332530975, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001397280429955572, + "grad_norm": 4.721456050872803, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8639688491821289, + "num_tokens": 133790695.0, + "step": 3505 + }, + { + "epoch": 0.4459992367383285, + "ewc_loss": 0.03713227063417435, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001393891143379733, + "grad_norm": 4.755168437957764, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8529889583587646, + "num_tokens": 133830181.0, + "step": 3506 + }, + { + "epoch": 0.44612644701691895, + "ewc_loss": 0.03718045353889465, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013987095735501498, + "grad_norm": 4.777127265930176, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8599968552589417, + "num_tokens": 133866849.0, + "step": 3507 + }, + { + "epoch": 0.4462536572955095, + "ewc_loss": 0.037148892879486084, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013955531176179647, + "grad_norm": 4.772649765014648, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8555258512496948, + "num_tokens": 133906812.0, + "step": 3508 + }, + { + "epoch": 0.4463808675741, + "ewc_loss": 0.03713139891624451, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001393803977407515, + "grad_norm": 4.805727958679199, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8493372201919556, + "num_tokens": 133948956.0, + "step": 3509 + }, + { + "epoch": 0.4465080778526905, + "ewc_loss": 0.03717614337801933, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013982784003019333, + "grad_norm": 4.802751064300537, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8388040661811829, + "num_tokens": 133988764.0, + "step": 3510 + }, + { + "epoch": 0.446635288131281, + "ewc_loss": 0.03711570426821709, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013922345533501357, + "grad_norm": 4.758293628692627, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8608989119529724, + "num_tokens": 134028148.0, + "step": 3511 + }, + { + "epoch": 0.44676249840987153, + "ewc_loss": 0.03715686872601509, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013963508536107838, + "grad_norm": 4.840041160583496, + "learning_rate": 1e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8319789171218872, + "num_tokens": 134065909.0, + "step": 3512 + }, + { + "epoch": 0.446889708688462, + "ewc_loss": 0.0371667854487896, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013973427121527493, + "grad_norm": 4.72925329208374, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8613110184669495, + "num_tokens": 134108342.0, + "step": 3513 + }, + { + "epoch": 0.44701691896705253, + "ewc_loss": 0.03710222616791725, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001390886609442532, + "grad_norm": 4.826301574707031, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8543514013290405, + "num_tokens": 134145928.0, + "step": 3514 + }, + { + "epoch": 0.44714412924564306, + "ewc_loss": 0.03716178610920906, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013968425628263503, + "grad_norm": 4.74887752532959, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8516108393669128, + "num_tokens": 134185955.0, + "step": 3515 + }, + { + "epoch": 0.44727133952423354, + "ewc_loss": 0.03712901473045349, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013935656170360744, + "grad_norm": 4.801302433013916, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8617939949035645, + "num_tokens": 134225829.0, + "step": 3516 + }, + { + "epoch": 0.44739854980282406, + "ewc_loss": 0.037178922444581985, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.0001398556341882795, + "grad_norm": 4.776817798614502, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.84561687707901, + "num_tokens": 134261381.0, + "step": 3517 + }, + { + "epoch": 0.4475257600814146, + "ewc_loss": 0.037150897085666656, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013957536430098116, + "grad_norm": 4.8197174072265625, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8633429408073425, + "num_tokens": 134294697.0, + "step": 3518 + }, + { + "epoch": 0.44765297036000506, + "ewc_loss": 0.03717566281557083, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013982302334625274, + "grad_norm": 4.729246616363525, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8762322664260864, + "num_tokens": 134334349.0, + "step": 3519 + }, + { + "epoch": 0.4477801806385956, + "ewc_loss": 0.03716141730546951, + "ewc_loss_diag": 2.3245811462402344e-05, + "ewc_loss_parallel": 0.00013968058919999748, + "grad_norm": 4.820186614990234, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8437809944152832, + "num_tokens": 134373381.0, + "step": 3520 + }, + { + "epoch": 0.4479073909171861, + "ewc_loss": 0.03735823556780815, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014042806287761778, + "grad_norm": 4.787060260772705, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8614411354064941, + "num_tokens": 134407952.0, + "step": 3521 + }, + { + "epoch": 0.4480346011957766, + "ewc_loss": 0.03728828579187393, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00013972855231259018, + "grad_norm": 4.75184440612793, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8461182117462158, + "num_tokens": 134458862.0, + "step": 3522 + }, + { + "epoch": 0.4481618114743671, + "ewc_loss": 0.03732796758413315, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014012539759278297, + "grad_norm": 4.8323564529418945, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.85285484790802, + "num_tokens": 134494007.0, + "step": 3523 + }, + { + "epoch": 0.44828902175295765, + "ewc_loss": 0.0373661145567894, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014050683239474893, + "grad_norm": 4.815718173980713, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8424860239028931, + "num_tokens": 134534195.0, + "step": 3524 + }, + { + "epoch": 0.4484162320315481, + "ewc_loss": 0.037328433245420456, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014013003965374082, + "grad_norm": 4.791529655456543, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8629289865493774, + "num_tokens": 134570241.0, + "step": 3525 + }, + { + "epoch": 0.44854344231013865, + "ewc_loss": 0.03735668957233429, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014041260874364525, + "grad_norm": 4.750748634338379, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8541500568389893, + "num_tokens": 134611904.0, + "step": 3526 + }, + { + "epoch": 0.4486706525887292, + "ewc_loss": 0.03735973685979843, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014044309500604868, + "grad_norm": 4.852187633514404, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8465278148651123, + "num_tokens": 134644426.0, + "step": 3527 + }, + { + "epoch": 0.44879786286731965, + "ewc_loss": 0.03742744028568268, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014112010831013322, + "grad_norm": 4.815024375915527, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8650949001312256, + "num_tokens": 134678979.0, + "step": 3528 + }, + { + "epoch": 0.4489250731459102, + "ewc_loss": 0.037402793765068054, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001408736570738256, + "grad_norm": 4.819249629974365, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.858593761920929, + "num_tokens": 134717270.0, + "step": 3529 + }, + { + "epoch": 0.4490522834245007, + "ewc_loss": 0.037403255701065063, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014087824092712253, + "grad_norm": 4.860630035400391, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8559602499008179, + "num_tokens": 134755464.0, + "step": 3530 + }, + { + "epoch": 0.44917949370309124, + "ewc_loss": 0.03738131374120712, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014065882714930922, + "grad_norm": 4.77034854888916, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8483452796936035, + "num_tokens": 134793737.0, + "step": 3531 + }, + { + "epoch": 0.4493067039816817, + "ewc_loss": 0.03737005218863487, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014054622442927212, + "grad_norm": 4.817102432250977, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.850466251373291, + "num_tokens": 134834332.0, + "step": 3532 + }, + { + "epoch": 0.44943391426027224, + "ewc_loss": 0.03738067299127579, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001406524097546935, + "grad_norm": 4.796292781829834, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8668326139450073, + "num_tokens": 134871027.0, + "step": 3533 + }, + { + "epoch": 0.44956112453886277, + "ewc_loss": 0.03733932226896286, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014023894618730992, + "grad_norm": 4.858094692230225, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8564068078994751, + "num_tokens": 134908529.0, + "step": 3534 + }, + { + "epoch": 0.44968833481745324, + "ewc_loss": 0.0373942106962204, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014078780077397823, + "grad_norm": 4.781126499176025, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8649158477783203, + "num_tokens": 134944177.0, + "step": 3535 + }, + { + "epoch": 0.44981554509604377, + "ewc_loss": 0.03735092282295227, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014035493950359523, + "grad_norm": 4.9030280113220215, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8548361659049988, + "num_tokens": 134977928.0, + "step": 3536 + }, + { + "epoch": 0.4499427553746343, + "ewc_loss": 0.03743985295295715, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001412442361470312, + "grad_norm": 4.7635345458984375, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.855833888053894, + "num_tokens": 135017554.0, + "step": 3537 + }, + { + "epoch": 0.45006996565322477, + "ewc_loss": 0.037348777055740356, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001403334754286334, + "grad_norm": 4.8036699295043945, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8622713088989258, + "num_tokens": 135058694.0, + "step": 3538 + }, + { + "epoch": 0.4501971759318153, + "ewc_loss": 0.037409476935863495, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001409404940204695, + "grad_norm": 4.847080230712891, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8589023351669312, + "num_tokens": 135094037.0, + "step": 3539 + }, + { + "epoch": 0.4503243862104058, + "ewc_loss": 0.03741811588406563, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014102685963734984, + "grad_norm": 4.806209087371826, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8494845628738403, + "num_tokens": 135132758.0, + "step": 3540 + }, + { + "epoch": 0.4504515964889963, + "ewc_loss": 0.03735093027353287, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014035499771125615, + "grad_norm": 4.818580627441406, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8678842782974243, + "num_tokens": 135169398.0, + "step": 3541 + }, + { + "epoch": 0.4505788067675868, + "ewc_loss": 0.037428148090839386, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001411271805409342, + "grad_norm": 4.881467819213867, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8411925435066223, + "num_tokens": 135204312.0, + "step": 3542 + }, + { + "epoch": 0.45070601704617735, + "ewc_loss": 0.037392716854810715, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014077288506086916, + "grad_norm": 4.779300689697266, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8538293838500977, + "num_tokens": 135246287.0, + "step": 3543 + }, + { + "epoch": 0.4508332273247678, + "ewc_loss": 0.037361592054367065, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014046160504221916, + "grad_norm": 4.796817779541016, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8582955598831177, + "num_tokens": 135280970.0, + "step": 3544 + }, + { + "epoch": 0.45096043760335836, + "ewc_loss": 0.03741147369146347, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014096044469624758, + "grad_norm": 4.811551094055176, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.877860426902771, + "num_tokens": 135317873.0, + "step": 3545 + }, + { + "epoch": 0.4510876478819489, + "ewc_loss": 0.037393514066934586, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014078084495849907, + "grad_norm": 4.802650451660156, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8630359172821045, + "num_tokens": 135354994.0, + "step": 3546 + }, + { + "epoch": 0.45121485816053936, + "ewc_loss": 0.03736366704106331, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014048237062525004, + "grad_norm": 4.842403888702393, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8504071831703186, + "num_tokens": 135393050.0, + "step": 3547 + }, + { + "epoch": 0.4513420684391299, + "ewc_loss": 0.037391193211078644, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014075763465370983, + "grad_norm": 4.779775142669678, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8612353801727295, + "num_tokens": 135431704.0, + "step": 3548 + }, + { + "epoch": 0.4514692787177204, + "ewc_loss": 0.037361860275268555, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001404642971465364, + "grad_norm": 4.8319573402404785, + "learning_rate": 1e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8340247869491577, + "num_tokens": 135471306.0, + "step": 3549 + }, + { + "epoch": 0.4515964889963109, + "ewc_loss": 0.03742564469575882, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014110216579865664, + "grad_norm": 4.825688362121582, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8438078761100769, + "num_tokens": 135509647.0, + "step": 3550 + }, + { + "epoch": 0.4517236992749014, + "ewc_loss": 0.0373665913939476, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014051160542294383, + "grad_norm": 4.766808986663818, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8633381128311157, + "num_tokens": 135550469.0, + "step": 3551 + }, + { + "epoch": 0.45185090955349194, + "ewc_loss": 0.03740319609642029, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001408776588505134, + "grad_norm": 4.797001361846924, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8555018901824951, + "num_tokens": 135589290.0, + "step": 3552 + }, + { + "epoch": 0.4519781198320824, + "ewc_loss": 0.03738335892558098, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001406792871421203, + "grad_norm": 4.82196044921875, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8502188920974731, + "num_tokens": 135624416.0, + "step": 3553 + }, + { + "epoch": 0.45210533011067294, + "ewc_loss": 0.037457846105098724, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014142415602691472, + "grad_norm": 4.898198127746582, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8605072498321533, + "num_tokens": 135662316.0, + "step": 3554 + }, + { + "epoch": 0.45223254038926347, + "ewc_loss": 0.037435781210660934, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.000141203505336307, + "grad_norm": 4.8078508377075195, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8574793338775635, + "num_tokens": 135701739.0, + "step": 3555 + }, + { + "epoch": 0.45235975066785394, + "ewc_loss": 0.03752940893173218, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014091907360125333, + "grad_norm": 4.9218878746032715, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8522510528564453, + "num_tokens": 135741372.0, + "step": 3556 + }, + { + "epoch": 0.45248696094644447, + "ewc_loss": 0.0374448299407959, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.00014129401824902743, + "grad_norm": 4.834019660949707, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8778527975082397, + "num_tokens": 135779945.0, + "step": 3557 + }, + { + "epoch": 0.452614171225035, + "ewc_loss": 0.03741771727800369, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001410228869644925, + "grad_norm": 4.888402462005615, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8495796918869019, + "num_tokens": 135813179.0, + "step": 3558 + }, + { + "epoch": 0.4527413815036255, + "ewc_loss": 0.03742755576968193, + "ewc_loss_diag": 2.3365020751953125e-05, + "ewc_loss_parallel": 0.0001411212724633515, + "grad_norm": 4.813950538635254, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8592221736907959, + "num_tokens": 135855147.0, + "step": 3559 + }, + { + "epoch": 0.452868591782216, + "ewc_loss": 0.03749492019414902, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014057420776225626, + "grad_norm": 4.816268444061279, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.864887535572052, + "num_tokens": 135892990.0, + "step": 3560 + }, + { + "epoch": 0.45299580206080653, + "ewc_loss": 0.037523362785577774, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014085863949730992, + "grad_norm": 4.8263373374938965, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8740384578704834, + "num_tokens": 135926911.0, + "step": 3561 + }, + { + "epoch": 0.453123012339397, + "ewc_loss": 0.03751819208264351, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014080690743867308, + "grad_norm": 4.79665470123291, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8647283315658569, + "num_tokens": 135963530.0, + "step": 3562 + }, + { + "epoch": 0.45325022261798753, + "ewc_loss": 0.03748590126633644, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014048401499167085, + "grad_norm": 4.792569160461426, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8469133377075195, + "num_tokens": 136000677.0, + "step": 3563 + }, + { + "epoch": 0.45337743289657806, + "ewc_loss": 0.03755762428045273, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014120126434136182, + "grad_norm": 4.864881992340088, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.846825361251831, + "num_tokens": 136037283.0, + "step": 3564 + }, + { + "epoch": 0.45350464317516853, + "ewc_loss": 0.037578172981739044, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.0001414067082805559, + "grad_norm": 4.851828098297119, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8382238149642944, + "num_tokens": 136071454.0, + "step": 3565 + }, + { + "epoch": 0.45363185345375906, + "ewc_loss": 0.03753794729709625, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014100447879172862, + "grad_norm": 4.787271022796631, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8446035385131836, + "num_tokens": 136105638.0, + "step": 3566 + }, + { + "epoch": 0.4537590637323496, + "ewc_loss": 0.03755789250135422, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014120391278993338, + "grad_norm": 4.7607903480529785, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8593457937240601, + "num_tokens": 136147968.0, + "step": 3567 + }, + { + "epoch": 0.45388627401094006, + "ewc_loss": 0.03757651522755623, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014139014820102602, + "grad_norm": 4.80892276763916, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8566383123397827, + "num_tokens": 136190468.0, + "step": 3568 + }, + { + "epoch": 0.4540134842895306, + "ewc_loss": 0.03754904493689537, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014111545169726014, + "grad_norm": 4.751708030700684, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8602700233459473, + "num_tokens": 136229363.0, + "step": 3569 + }, + { + "epoch": 0.4541406945681211, + "ewc_loss": 0.037593431770801544, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.0001415593142155558, + "grad_norm": 4.819159984588623, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8495519161224365, + "num_tokens": 136268834.0, + "step": 3570 + }, + { + "epoch": 0.4542679048467116, + "ewc_loss": 0.03759726136922836, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014159762940835208, + "grad_norm": 4.789361000061035, + "learning_rate": 1e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.8327771425247192, + "num_tokens": 136306761.0, + "step": 3571 + }, + { + "epoch": 0.4543951151253021, + "ewc_loss": 0.037625424563884735, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.0001418792235199362, + "grad_norm": 4.814775466918945, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8553023338317871, + "num_tokens": 136347736.0, + "step": 3572 + }, + { + "epoch": 0.45452232540389265, + "ewc_loss": 0.03762670233845711, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014189202920533717, + "grad_norm": 4.858994483947754, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8591731190681458, + "num_tokens": 136381009.0, + "step": 3573 + }, + { + "epoch": 0.4546495356824831, + "ewc_loss": 0.037650059908628464, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014212560199666768, + "grad_norm": 4.833052158355713, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8661432862281799, + "num_tokens": 136418050.0, + "step": 3574 + }, + { + "epoch": 0.45477674596107365, + "ewc_loss": 0.037613216787576675, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014175716205500066, + "grad_norm": 4.7701945304870605, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8388580083847046, + "num_tokens": 136459109.0, + "step": 3575 + }, + { + "epoch": 0.4549039562396642, + "ewc_loss": 0.037598058581352234, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.000141605589305982, + "grad_norm": 4.768657207489014, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8515993356704712, + "num_tokens": 136497993.0, + "step": 3576 + }, + { + "epoch": 0.45503116651825465, + "ewc_loss": 0.037564389407634735, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.0001412689161952585, + "grad_norm": 4.793758392333984, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8590290546417236, + "num_tokens": 136534446.0, + "step": 3577 + }, + { + "epoch": 0.4551583767968452, + "ewc_loss": 0.03761196509003639, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.0001417446619598195, + "grad_norm": 4.796946048736572, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8398172855377197, + "num_tokens": 136571117.0, + "step": 3578 + }, + { + "epoch": 0.4552855870754357, + "ewc_loss": 0.03758169338107109, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014144192391540855, + "grad_norm": 4.827662467956543, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8628997802734375, + "num_tokens": 136601706.0, + "step": 3579 + }, + { + "epoch": 0.4554127973540262, + "ewc_loss": 0.03761845827102661, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014180959260556847, + "grad_norm": 4.770970821380615, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8566538691520691, + "num_tokens": 136639735.0, + "step": 3580 + }, + { + "epoch": 0.4555400076326167, + "ewc_loss": 0.03760147839784622, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014163980085868388, + "grad_norm": 4.824625492095947, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8493636846542358, + "num_tokens": 136675647.0, + "step": 3581 + }, + { + "epoch": 0.45566721791120723, + "ewc_loss": 0.037756554782390594, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014196985284797847, + "grad_norm": 4.741276264190674, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8641270995140076, + "num_tokens": 136713361.0, + "step": 3582 + }, + { + "epoch": 0.45579442818979776, + "ewc_loss": 0.0377497598528862, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.000141901895403862, + "grad_norm": 4.822211265563965, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8543535470962524, + "num_tokens": 136751440.0, + "step": 3583 + }, + { + "epoch": 0.45592163846838824, + "ewc_loss": 0.03779931366443634, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014239741722121835, + "grad_norm": 4.834112644195557, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8520486354827881, + "num_tokens": 136786078.0, + "step": 3584 + }, + { + "epoch": 0.45604884874697876, + "ewc_loss": 0.03775171935558319, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001419214968336746, + "grad_norm": 4.752541542053223, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8634591102600098, + "num_tokens": 136828313.0, + "step": 3585 + }, + { + "epoch": 0.4561760590255693, + "ewc_loss": 0.03765401244163513, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014216512499842793, + "grad_norm": 4.820522785186768, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.854703426361084, + "num_tokens": 136862549.0, + "step": 3586 + }, + { + "epoch": 0.45630326930415976, + "ewc_loss": 0.037807151675224304, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014247580838855356, + "grad_norm": 4.766400337219238, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8592305779457092, + "num_tokens": 136906423.0, + "step": 3587 + }, + { + "epoch": 0.4564304795827503, + "ewc_loss": 0.037770386785268784, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014210815425030887, + "grad_norm": 4.851423263549805, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8441401720046997, + "num_tokens": 136941550.0, + "step": 3588 + }, + { + "epoch": 0.4565576898613408, + "ewc_loss": 0.03785886615514755, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001429929689038545, + "grad_norm": 4.770657062530518, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8593728542327881, + "num_tokens": 136980234.0, + "step": 3589 + }, + { + "epoch": 0.4566849001399313, + "ewc_loss": 0.03776780888438225, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014208238280843943, + "grad_norm": 4.7552900314331055, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8588775396347046, + "num_tokens": 137023991.0, + "step": 3590 + }, + { + "epoch": 0.4568121104185218, + "ewc_loss": 0.037842947989702225, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014283377095125616, + "grad_norm": 4.813417434692383, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.847206711769104, + "num_tokens": 137063261.0, + "step": 3591 + }, + { + "epoch": 0.45693932069711235, + "ewc_loss": 0.03783499076962471, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014275420107878745, + "grad_norm": 4.776843070983887, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8656727075576782, + "num_tokens": 137103935.0, + "step": 3592 + }, + { + "epoch": 0.4570665309757028, + "ewc_loss": 0.037815339863300323, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014255769201554358, + "grad_norm": 5.050707817077637, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8554794788360596, + "num_tokens": 137140194.0, + "step": 3593 + }, + { + "epoch": 0.45719374125429335, + "ewc_loss": 0.03790011256933212, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014340544294100255, + "grad_norm": 4.853545188903809, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8469089865684509, + "num_tokens": 137179086.0, + "step": 3594 + }, + { + "epoch": 0.4573209515328839, + "ewc_loss": 0.03770054504275322, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001414097350789234, + "grad_norm": 4.831478118896484, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8617455959320068, + "num_tokens": 137216887.0, + "step": 3595 + }, + { + "epoch": 0.45744816181147435, + "ewc_loss": 0.03779090195894241, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014231332170311362, + "grad_norm": 4.789885997772217, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8719083666801453, + "num_tokens": 137255029.0, + "step": 3596 + }, + { + "epoch": 0.4575753720900649, + "ewc_loss": 0.037758804857730865, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014199236466083676, + "grad_norm": 4.824028491973877, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8593069314956665, + "num_tokens": 137291399.0, + "step": 3597 + }, + { + "epoch": 0.4577025823686554, + "ewc_loss": 0.0376676470041275, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014230149099603295, + "grad_norm": 4.855290412902832, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8607981204986572, + "num_tokens": 137326569.0, + "step": 3598 + }, + { + "epoch": 0.4578297926472459, + "ewc_loss": 0.03782125562429428, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001426168455509469, + "grad_norm": 4.764369487762451, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8519271016120911, + "num_tokens": 137370035.0, + "step": 3599 + }, + { + "epoch": 0.4579570029258364, + "ewc_loss": 0.037753984332084656, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014194415416568518, + "grad_norm": 4.867375373840332, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8522849082946777, + "num_tokens": 137407969.0, + "step": 3600 + }, + { + "epoch": 0.45808421320442694, + "ewc_loss": 0.0378265380859375, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001426696835551411, + "grad_norm": 4.809388160705566, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8549583554267883, + "num_tokens": 137447256.0, + "step": 3601 + }, + { + "epoch": 0.4582114234830174, + "ewc_loss": 0.037763115018606186, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014203545288182795, + "grad_norm": 4.919652938842773, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8416628241539001, + "num_tokens": 137479762.0, + "step": 3602 + }, + { + "epoch": 0.45833863376160794, + "ewc_loss": 0.03770022839307785, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.00014262729382608086, + "grad_norm": 4.803165435791016, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8493874073028564, + "num_tokens": 137521823.0, + "step": 3603 + }, + { + "epoch": 0.45846584404019847, + "ewc_loss": 0.03774632513523102, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014186756743583828, + "grad_norm": 4.898288726806641, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8548556566238403, + "num_tokens": 137558174.0, + "step": 3604 + }, + { + "epoch": 0.45859305431878894, + "ewc_loss": 0.03781268745660782, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014253119297791272, + "grad_norm": 4.810171604156494, + "learning_rate": 1e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8318244814872742, + "num_tokens": 137597265.0, + "step": 3605 + }, + { + "epoch": 0.45872026459737947, + "ewc_loss": 0.0375887006521225, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.0001415120204910636, + "grad_norm": 4.80432653427124, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8665037751197815, + "num_tokens": 137635954.0, + "step": 3606 + }, + { + "epoch": 0.45884747487597, + "ewc_loss": 0.03778395429253578, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014224383630789816, + "grad_norm": 4.905585765838623, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.858447253704071, + "num_tokens": 137676137.0, + "step": 3607 + }, + { + "epoch": 0.45897468515456047, + "ewc_loss": 0.03779491037130356, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001423533831257373, + "grad_norm": 4.8101301193237305, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8594420552253723, + "num_tokens": 137716723.0, + "step": 3608 + }, + { + "epoch": 0.459101895433151, + "ewc_loss": 0.03772424906492233, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014164680032990873, + "grad_norm": 4.879550457000732, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8395701050758362, + "num_tokens": 137752435.0, + "step": 3609 + }, + { + "epoch": 0.4592291057117415, + "ewc_loss": 0.03781786561012268, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014258296869229525, + "grad_norm": 4.8608012199401855, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8474938273429871, + "num_tokens": 137786684.0, + "step": 3610 + }, + { + "epoch": 0.459356315990332, + "ewc_loss": 0.03763718158006668, + "ewc_loss_diag": 2.3484230041503906e-05, + "ewc_loss_parallel": 0.0001419968029949814, + "grad_norm": 4.766204357147217, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8482154607772827, + "num_tokens": 137830668.0, + "step": 3611 + }, + { + "epoch": 0.4594835262689225, + "ewc_loss": 0.03779810294508934, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001423853391315788, + "grad_norm": 4.887805938720703, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8631756901741028, + "num_tokens": 137865262.0, + "step": 3612 + }, + { + "epoch": 0.45961073654751305, + "ewc_loss": 0.03784429654479027, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014284727512858808, + "grad_norm": 4.918178558349609, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8633816242218018, + "num_tokens": 137898867.0, + "step": 3613 + }, + { + "epoch": 0.4597379468261035, + "ewc_loss": 0.03779502958059311, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014235459093470126, + "grad_norm": 4.864989757537842, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8573046922683716, + "num_tokens": 137935108.0, + "step": 3614 + }, + { + "epoch": 0.45986515710469406, + "ewc_loss": 0.03776863217353821, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001420906191924587, + "grad_norm": 4.842766284942627, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8672513961791992, + "num_tokens": 137977714.0, + "step": 3615 + }, + { + "epoch": 0.4599923673832846, + "ewc_loss": 0.037740953266620636, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014181384176481515, + "grad_norm": 4.939222812652588, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.867527961730957, + "num_tokens": 138006345.0, + "step": 3616 + }, + { + "epoch": 0.46011957766187506, + "ewc_loss": 0.0378013476729393, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014241778990253806, + "grad_norm": 4.8122687339782715, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8563960790634155, + "num_tokens": 138045106.0, + "step": 3617 + }, + { + "epoch": 0.4602467879404656, + "ewc_loss": 0.037691112607717514, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014131540956441313, + "grad_norm": 4.907309055328369, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8506797552108765, + "num_tokens": 138079993.0, + "step": 3618 + }, + { + "epoch": 0.4603739982190561, + "ewc_loss": 0.03783094137907028, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014271368854679167, + "grad_norm": 4.847977161407471, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8596633076667786, + "num_tokens": 138118555.0, + "step": 3619 + }, + { + "epoch": 0.4605012084976466, + "ewc_loss": 0.037747275084257126, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014187704073265195, + "grad_norm": 4.8360595703125, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8639953136444092, + "num_tokens": 138155096.0, + "step": 3620 + }, + { + "epoch": 0.4606284187762371, + "ewc_loss": 0.03782230243086815, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001426273083779961, + "grad_norm": 4.809477806091309, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8586278557777405, + "num_tokens": 138194529.0, + "step": 3621 + }, + { + "epoch": 0.46075562905482764, + "ewc_loss": 0.03780198097229004, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014242409088183194, + "grad_norm": 4.841312408447266, + "learning_rate": 1e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8309524059295654, + "num_tokens": 138235520.0, + "step": 3622 + }, + { + "epoch": 0.4608828393334181, + "ewc_loss": 0.037843089550733566, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014283519703894854, + "grad_norm": 4.846263408660889, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8662509322166443, + "num_tokens": 138271391.0, + "step": 3623 + }, + { + "epoch": 0.46101004961200864, + "ewc_loss": 0.03779590502381325, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001423633366357535, + "grad_norm": 4.801260948181152, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8512659072875977, + "num_tokens": 138313923.0, + "step": 3624 + }, + { + "epoch": 0.46113725989059917, + "ewc_loss": 0.03785496577620506, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014295395521912724, + "grad_norm": 4.791409969329834, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8486660718917847, + "num_tokens": 138355992.0, + "step": 3625 + }, + { + "epoch": 0.46126447016918964, + "ewc_loss": 0.0378342941403389, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001427472452633083, + "grad_norm": 4.830881595611572, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8762853145599365, + "num_tokens": 138393074.0, + "step": 3626 + }, + { + "epoch": 0.4613916804477802, + "ewc_loss": 0.037905994802713394, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014346423267852515, + "grad_norm": 4.894108772277832, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8633424639701843, + "num_tokens": 138427854.0, + "step": 3627 + }, + { + "epoch": 0.4615188907263707, + "ewc_loss": 0.03788112848997116, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014321558410301805, + "grad_norm": 4.82073974609375, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8656454682350159, + "num_tokens": 138467359.0, + "step": 3628 + }, + { + "epoch": 0.4616461010049612, + "ewc_loss": 0.03784334287047386, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014283771452028304, + "grad_norm": 4.88939905166626, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8440630435943604, + "num_tokens": 138505355.0, + "step": 3629 + }, + { + "epoch": 0.4617733112835517, + "ewc_loss": 0.03787323087453842, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014313659630715847, + "grad_norm": 4.81663703918457, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8609297275543213, + "num_tokens": 138543703.0, + "step": 3630 + }, + { + "epoch": 0.46190052156214223, + "ewc_loss": 0.03783104196190834, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001427147217327729, + "grad_norm": 4.7920708656311035, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8486351370811462, + "num_tokens": 138579860.0, + "step": 3631 + }, + { + "epoch": 0.46202773184073276, + "ewc_loss": 0.037872303277254105, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014312734128907323, + "grad_norm": 4.8753581047058105, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8664820194244385, + "num_tokens": 138615755.0, + "step": 3632 + }, + { + "epoch": 0.46215494211932323, + "ewc_loss": 0.03786458820104599, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014305020158644766, + "grad_norm": 4.8947906494140625, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8513777256011963, + "num_tokens": 138647607.0, + "step": 3633 + }, + { + "epoch": 0.46228215239791376, + "ewc_loss": 0.03786059841513634, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014301027113106102, + "grad_norm": 4.909396648406982, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8615810871124268, + "num_tokens": 138679284.0, + "step": 3634 + }, + { + "epoch": 0.4624093626765043, + "ewc_loss": 0.03785451501607895, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014294942957349122, + "grad_norm": 4.819883346557617, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8659348487854004, + "num_tokens": 138719690.0, + "step": 3635 + }, + { + "epoch": 0.46253657295509476, + "ewc_loss": 0.03783833980560303, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014278768503572792, + "grad_norm": 4.816837310791016, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8443948030471802, + "num_tokens": 138758031.0, + "step": 3636 + }, + { + "epoch": 0.4626637832336853, + "ewc_loss": 0.03786933422088623, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001430976262781769, + "grad_norm": 4.846026420593262, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8559576272964478, + "num_tokens": 138797283.0, + "step": 3637 + }, + { + "epoch": 0.4627909935122758, + "ewc_loss": 0.03786158561706543, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014302013732958585, + "grad_norm": 4.80498743057251, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8530911803245544, + "num_tokens": 138834672.0, + "step": 3638 + }, + { + "epoch": 0.4629182037908663, + "ewc_loss": 0.03809671103954315, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014292998821474612, + "grad_norm": 4.823371410369873, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8672759532928467, + "num_tokens": 138871105.0, + "step": 3639 + }, + { + "epoch": 0.4630454140694568, + "ewc_loss": 0.03787359595298767, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014314027794171125, + "grad_norm": 4.872243404388428, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8570425510406494, + "num_tokens": 138903516.0, + "step": 3640 + }, + { + "epoch": 0.46317262434804735, + "ewc_loss": 0.03787466511130333, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014315095904748887, + "grad_norm": 4.810895919799805, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8549131155014038, + "num_tokens": 138942021.0, + "step": 3641 + }, + { + "epoch": 0.4632998346266378, + "ewc_loss": 0.03785766661167145, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014298097812570632, + "grad_norm": 4.833042621612549, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8658854365348816, + "num_tokens": 138978210.0, + "step": 3642 + }, + { + "epoch": 0.46342704490522835, + "ewc_loss": 0.037870123982429504, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014310555707197636, + "grad_norm": 4.823436737060547, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8596935272216797, + "num_tokens": 139019122.0, + "step": 3643 + }, + { + "epoch": 0.4635542551838189, + "ewc_loss": 0.03787979856133461, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014320228365249932, + "grad_norm": 4.810404300689697, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8669416904449463, + "num_tokens": 139062187.0, + "step": 3644 + }, + { + "epoch": 0.46368146546240935, + "ewc_loss": 0.03788403421640396, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014324462972581387, + "grad_norm": 4.828364849090576, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8756915330886841, + "num_tokens": 139102960.0, + "step": 3645 + }, + { + "epoch": 0.4638086757409999, + "ewc_loss": 0.03789495676755905, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.000143353856401518, + "grad_norm": 4.8727803230285645, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8585619926452637, + "num_tokens": 139137302.0, + "step": 3646 + }, + { + "epoch": 0.4639358860195904, + "ewc_loss": 0.0378752164542675, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001431564596714452, + "grad_norm": 4.855495929718018, + "learning_rate": 1e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8388291597366333, + "num_tokens": 139177195.0, + "step": 3647 + }, + { + "epoch": 0.4640630962981809, + "ewc_loss": 0.0378781221807003, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.000143185505294241, + "grad_norm": 4.9177727699279785, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8530742526054382, + "num_tokens": 139214722.0, + "step": 3648 + }, + { + "epoch": 0.4641903065767714, + "ewc_loss": 0.037870507687330246, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014310938422568142, + "grad_norm": 4.897454261779785, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8559715747833252, + "num_tokens": 139248569.0, + "step": 3649 + }, + { + "epoch": 0.46431751685536193, + "ewc_loss": 0.037854503840208054, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014294934226199985, + "grad_norm": 4.807063102722168, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8632864952087402, + "num_tokens": 139289141.0, + "step": 3650 + }, + { + "epoch": 0.4644447271339524, + "ewc_loss": 0.037829093635082245, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001426952367182821, + "grad_norm": 4.820914268493652, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.874038815498352, + "num_tokens": 139330791.0, + "step": 3651 + }, + { + "epoch": 0.46457193741254293, + "ewc_loss": 0.03786695376038551, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014307383389677852, + "grad_norm": 4.861061096191406, + "learning_rate": 1e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.8248181343078613, + "num_tokens": 139373009.0, + "step": 3652 + }, + { + "epoch": 0.46469914769113346, + "ewc_loss": 0.037861037999391556, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014301466580945998, + "grad_norm": 4.842605113983154, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.855905294418335, + "num_tokens": 139408894.0, + "step": 3653 + }, + { + "epoch": 0.46482635796972394, + "ewc_loss": 0.0378614217042923, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001430185220669955, + "grad_norm": 4.867559432983398, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8543382883071899, + "num_tokens": 139444040.0, + "step": 3654 + }, + { + "epoch": 0.46495356824831446, + "ewc_loss": 0.037878379225730896, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001431881100870669, + "grad_norm": 4.818472385406494, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8631769418716431, + "num_tokens": 139479675.0, + "step": 3655 + }, + { + "epoch": 0.465080778526905, + "ewc_loss": 0.038124289363622665, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014320577611215413, + "grad_norm": 4.958441257476807, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.864775538444519, + "num_tokens": 139521792.0, + "step": 3656 + }, + { + "epoch": 0.46520798880549546, + "ewc_loss": 0.03792579472064972, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001436622260371223, + "grad_norm": 4.774930477142334, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8637648820877075, + "num_tokens": 139563471.0, + "step": 3657 + }, + { + "epoch": 0.465335199084086, + "ewc_loss": 0.03794370964169502, + "ewc_loss_diag": 2.372264862060547e-05, + "ewc_loss_parallel": 0.0001426207018084824, + "grad_norm": 4.830897331237793, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8566184043884277, + "num_tokens": 139604702.0, + "step": 3658 + }, + { + "epoch": 0.4654624093626765, + "ewc_loss": 0.037934113293886185, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.00014374541933648288, + "grad_norm": 4.839705944061279, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.855562686920166, + "num_tokens": 139644573.0, + "step": 3659 + }, + { + "epoch": 0.465589619641267, + "ewc_loss": 0.037903137505054474, + "ewc_loss_diag": 2.3603439331054688e-05, + "ewc_loss_parallel": 0.0001434356818208471, + "grad_norm": 4.909999370574951, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.842988133430481, + "num_tokens": 139680924.0, + "step": 3660 + }, + { + "epoch": 0.4657168299198575, + "ewc_loss": 0.038013286888599396, + "ewc_loss_diag": 2.372264862060547e-05, + "ewc_loss_parallel": 0.00014331645797938108, + "grad_norm": 4.864963054656982, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8492305874824524, + "num_tokens": 139724369.0, + "step": 3661 + }, + { + "epoch": 0.46584404019844805, + "ewc_loss": 0.03813319653272629, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014329487748909742, + "grad_norm": 5.871254920959473, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8568447232246399, + "num_tokens": 139769412.0, + "step": 3662 + }, + { + "epoch": 0.4659712504770385, + "ewc_loss": 0.03863935172557831, + "ewc_loss_diag": 2.372264862060547e-05, + "ewc_loss_parallel": 0.0001495770993642509, + "grad_norm": 4.906871795654297, + "learning_rate": 1e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8320032954216003, + "num_tokens": 139808066.0, + "step": 3663 + }, + { + "epoch": 0.46609846075562905, + "ewc_loss": 0.0376458615064621, + "ewc_loss_diag": 2.372264862060547e-05, + "ewc_loss_parallel": 0.00013964220124762505, + "grad_norm": 4.809577465057373, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8629776239395142, + "num_tokens": 139851169.0, + "step": 3664 + }, + { + "epoch": 0.4662256710342196, + "ewc_loss": 0.03807516023516655, + "ewc_loss_diag": 2.372264862060547e-05, + "ewc_loss_parallel": 0.00014393519086297601, + "grad_norm": 4.8427252769470215, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8667754530906677, + "num_tokens": 139890550.0, + "step": 3665 + }, + { + "epoch": 0.46635288131281005, + "ewc_loss": 0.03804464265704155, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014240932068787515, + "grad_norm": 4.8757195472717285, + "learning_rate": 1e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8305240869522095, + "num_tokens": 139931496.0, + "step": 3666 + }, + { + "epoch": 0.4664800915914006, + "ewc_loss": 0.038120463490486145, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014316751912701875, + "grad_norm": 4.940337657928467, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8480292558670044, + "num_tokens": 139965872.0, + "step": 3667 + }, + { + "epoch": 0.4666073018699911, + "ewc_loss": 0.03810692951083183, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014303218631539494, + "grad_norm": 4.885894298553467, + "learning_rate": 1e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8416793346405029, + "num_tokens": 140000394.0, + "step": 3668 + }, + { + "epoch": 0.4667345121485816, + "ewc_loss": 0.03810728341341019, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014303573698271066, + "grad_norm": 4.845188140869141, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.840171217918396, + "num_tokens": 140039967.0, + "step": 3669 + }, + { + "epoch": 0.4668617224271721, + "ewc_loss": 0.03814752772450447, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014343815564643592, + "grad_norm": 4.875495433807373, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8577924370765686, + "num_tokens": 140075739.0, + "step": 3670 + }, + { + "epoch": 0.46698893270576264, + "ewc_loss": 0.03817605599761009, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014372346049640328, + "grad_norm": 4.883052349090576, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8604258298873901, + "num_tokens": 140110016.0, + "step": 3671 + }, + { + "epoch": 0.4671161429843531, + "ewc_loss": 0.03815874829888344, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014355038001667708, + "grad_norm": 5.026832103729248, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.847574770450592, + "num_tokens": 140147905.0, + "step": 3672 + }, + { + "epoch": 0.46724335326294364, + "ewc_loss": 0.03821820020675659, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.000144144898513332, + "grad_norm": 4.8781609535217285, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.850648045539856, + "num_tokens": 140182284.0, + "step": 3673 + }, + { + "epoch": 0.46737056354153417, + "ewc_loss": 0.038093458861112595, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014289747923612595, + "grad_norm": 4.83052396774292, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8494150042533875, + "num_tokens": 140220669.0, + "step": 3674 + }, + { + "epoch": 0.46749777382012464, + "ewc_loss": 0.03816574811935425, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.0001436203601770103, + "grad_norm": 4.903184413909912, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8423405289649963, + "num_tokens": 140258399.0, + "step": 3675 + }, + { + "epoch": 0.46762498409871517, + "ewc_loss": 0.03819572180509567, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014392008597496897, + "grad_norm": 4.85983419418335, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8698171377182007, + "num_tokens": 140296015.0, + "step": 3676 + }, + { + "epoch": 0.4677521943773057, + "ewc_loss": 0.038193944841623306, + "ewc_loss_diag": 2.384185791015625e-05, + "ewc_loss_parallel": 0.00014390233263839036, + "grad_norm": 4.845358371734619, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8468217849731445, + "num_tokens": 140338618.0, + "step": 3677 + }, + { + "epoch": 0.46787940465589617, + "ewc_loss": 0.038320913910865784, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014395132893696427, + "grad_norm": 4.868719100952148, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8603442907333374, + "num_tokens": 140379151.0, + "step": 3678 + }, + { + "epoch": 0.4680066149344867, + "ewc_loss": 0.038374967873096466, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014449185982812196, + "grad_norm": 4.885891437530518, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8428491353988647, + "num_tokens": 140417740.0, + "step": 3679 + }, + { + "epoch": 0.4681338252130772, + "ewc_loss": 0.03835589811205864, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014430117153096944, + "grad_norm": 4.934405326843262, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8531189560890198, + "num_tokens": 140452454.0, + "step": 3680 + }, + { + "epoch": 0.46826103549166775, + "ewc_loss": 0.038368791341781616, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014443008694797754, + "grad_norm": 4.875729084014893, + "learning_rate": 1e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.8326692581176758, + "num_tokens": 140492981.0, + "step": 3681 + }, + { + "epoch": 0.4683882457702582, + "ewc_loss": 0.0383278951048851, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014402113447431475, + "grad_norm": 4.853135108947754, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8452975749969482, + "num_tokens": 140534131.0, + "step": 3682 + }, + { + "epoch": 0.46851545604884876, + "ewc_loss": 0.03837188705801964, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001444610534235835, + "grad_norm": 4.883054733276367, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8489913940429688, + "num_tokens": 140576761.0, + "step": 3683 + }, + { + "epoch": 0.4686426663274393, + "ewc_loss": 0.03837750107049942, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014451718016061932, + "grad_norm": 4.936554908752441, + "learning_rate": 1e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8357946872711182, + "num_tokens": 140609538.0, + "step": 3684 + }, + { + "epoch": 0.46876987660602976, + "ewc_loss": 0.03841004520654678, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014484263374470174, + "grad_norm": 4.856863975524902, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8522546887397766, + "num_tokens": 140641802.0, + "step": 3685 + }, + { + "epoch": 0.4688970868846203, + "ewc_loss": 0.038343410938978195, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001441762869944796, + "grad_norm": 4.865566730499268, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8445857763290405, + "num_tokens": 140685832.0, + "step": 3686 + }, + { + "epoch": 0.4690242971632108, + "ewc_loss": 0.03848889842629433, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.0001444104709662497, + "grad_norm": 4.854653358459473, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.843528151512146, + "num_tokens": 140725333.0, + "step": 3687 + }, + { + "epoch": 0.4691515074418013, + "ewc_loss": 0.03847963735461235, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014431784802582115, + "grad_norm": 4.918674468994141, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8532363772392273, + "num_tokens": 140760838.0, + "step": 3688 + }, + { + "epoch": 0.4692787177203918, + "ewc_loss": 0.038536395877599716, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.0001448854454793036, + "grad_norm": 4.861401081085205, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8451476097106934, + "num_tokens": 140797367.0, + "step": 3689 + }, + { + "epoch": 0.46940592799898234, + "ewc_loss": 0.0384611114859581, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014413261669687927, + "grad_norm": 4.7837233543396, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8542666435241699, + "num_tokens": 140839778.0, + "step": 3690 + }, + { + "epoch": 0.4695331382775728, + "ewc_loss": 0.03836534172296524, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014439562801271677, + "grad_norm": 4.870092391967773, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.849997341632843, + "num_tokens": 140880340.0, + "step": 3691 + }, + { + "epoch": 0.46966034855616334, + "ewc_loss": 0.03844847157597542, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014522690617013723, + "grad_norm": 4.897744655609131, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8487821817398071, + "num_tokens": 140915451.0, + "step": 3692 + }, + { + "epoch": 0.46978755883475387, + "ewc_loss": 0.03856382519006729, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014515973452944309, + "grad_norm": 4.865251064300537, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8528734445571899, + "num_tokens": 140956967.0, + "step": 3693 + }, + { + "epoch": 0.46991476911334434, + "ewc_loss": 0.03841428458690643, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014488505257759243, + "grad_norm": 4.820101261138916, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8700554966926575, + "num_tokens": 140996640.0, + "step": 3694 + }, + { + "epoch": 0.47004197939193487, + "ewc_loss": 0.038572393357753754, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014524540165439248, + "grad_norm": 4.865997791290283, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8540076017379761, + "num_tokens": 141036094.0, + "step": 3695 + }, + { + "epoch": 0.4701691896705254, + "ewc_loss": 0.038540732115507126, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014492881018668413, + "grad_norm": 4.929928302764893, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.845747172832489, + "num_tokens": 141069033.0, + "step": 3696 + }, + { + "epoch": 0.4702963999491159, + "ewc_loss": 0.03858382999897003, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014535979426000267, + "grad_norm": 4.822778224945068, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8725988268852234, + "num_tokens": 141111075.0, + "step": 3697 + }, + { + "epoch": 0.4704236102277064, + "ewc_loss": 0.03854244202375412, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014494590868707746, + "grad_norm": 4.9193291664123535, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8701111078262329, + "num_tokens": 141145270.0, + "step": 3698 + }, + { + "epoch": 0.47055082050629693, + "ewc_loss": 0.03844871744513512, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014522936544381082, + "grad_norm": 4.884842872619629, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8694765567779541, + "num_tokens": 141182132.0, + "step": 3699 + }, + { + "epoch": 0.4706780307848874, + "ewc_loss": 0.03839436173439026, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014468582230620086, + "grad_norm": 4.896604537963867, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8571689128875732, + "num_tokens": 141217455.0, + "step": 3700 + }, + { + "epoch": 0.47080524106347793, + "ewc_loss": 0.03851279616355896, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014464945707004517, + "grad_norm": 4.877641201019287, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8553426861763, + "num_tokens": 141249680.0, + "step": 3701 + }, + { + "epoch": 0.47093245134206846, + "ewc_loss": 0.03838294371962547, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014457161887548864, + "grad_norm": 4.8620381355285645, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8549519777297974, + "num_tokens": 141292085.0, + "step": 3702 + }, + { + "epoch": 0.47105966162065893, + "ewc_loss": 0.0384005568921566, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001447477552574128, + "grad_norm": 4.853857517242432, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8632306456565857, + "num_tokens": 141331455.0, + "step": 3703 + }, + { + "epoch": 0.47118687189924946, + "ewc_loss": 0.038353241980075836, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014427461428567767, + "grad_norm": 4.894763946533203, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8552656173706055, + "num_tokens": 141365623.0, + "step": 3704 + }, + { + "epoch": 0.47131408217784, + "ewc_loss": 0.03839083015918732, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001446504902560264, + "grad_norm": 4.921292304992676, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8458161354064941, + "num_tokens": 141399033.0, + "step": 3705 + }, + { + "epoch": 0.47144129245643046, + "ewc_loss": 0.0383702889084816, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014444506086874753, + "grad_norm": 4.921889781951904, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8631928563117981, + "num_tokens": 141430436.0, + "step": 3706 + }, + { + "epoch": 0.471568502735021, + "ewc_loss": 0.038391828536987305, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014466047286987305, + "grad_norm": 4.912367820739746, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8723840117454529, + "num_tokens": 141463169.0, + "step": 3707 + }, + { + "epoch": 0.4716957130136115, + "ewc_loss": 0.03834827244281769, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014422493404708803, + "grad_norm": 4.866394519805908, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8745450973510742, + "num_tokens": 141495524.0, + "step": 3708 + }, + { + "epoch": 0.471822923292202, + "ewc_loss": 0.038326624780893326, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014400843065232038, + "grad_norm": 4.845724105834961, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8669873476028442, + "num_tokens": 141533099.0, + "step": 3709 + }, + { + "epoch": 0.4719501335707925, + "ewc_loss": 0.03836977854371071, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014443998225033283, + "grad_norm": 4.826980113983154, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.861514687538147, + "num_tokens": 141576389.0, + "step": 3710 + }, + { + "epoch": 0.47207734384938305, + "ewc_loss": 0.03834616765379906, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014420386287383735, + "grad_norm": 4.858243942260742, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.858866274356842, + "num_tokens": 141611870.0, + "step": 3711 + }, + { + "epoch": 0.4722045541279735, + "ewc_loss": 0.038370516151189804, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014444736007135361, + "grad_norm": 4.842195510864258, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.866799533367157, + "num_tokens": 141648539.0, + "step": 3712 + }, + { + "epoch": 0.47233176440656405, + "ewc_loss": 0.03851129859685898, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014463445404544473, + "grad_norm": 4.828500747680664, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8595193028450012, + "num_tokens": 141688549.0, + "step": 3713 + }, + { + "epoch": 0.4724589746851546, + "ewc_loss": 0.03842974454164505, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001450396521249786, + "grad_norm": 4.8708038330078125, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8642574548721313, + "num_tokens": 141725294.0, + "step": 3714 + }, + { + "epoch": 0.47258618496374505, + "ewc_loss": 0.03843091428279877, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014505132276099175, + "grad_norm": 4.933594226837158, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8400572538375854, + "num_tokens": 141756761.0, + "step": 3715 + }, + { + "epoch": 0.4727133952423356, + "ewc_loss": 0.038479529321193695, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014553748769685626, + "grad_norm": 4.892036437988281, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8410839438438416, + "num_tokens": 141790417.0, + "step": 3716 + }, + { + "epoch": 0.4728406055209261, + "ewc_loss": 0.03842708468437195, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014501303667202592, + "grad_norm": 4.807374954223633, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8677239418029785, + "num_tokens": 141830749.0, + "step": 3717 + }, + { + "epoch": 0.4729678157995166, + "ewc_loss": 0.03848256543278694, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001455678284401074, + "grad_norm": 4.904820919036865, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8602583408355713, + "num_tokens": 141868433.0, + "step": 3718 + }, + { + "epoch": 0.4730950260781071, + "ewc_loss": 0.03848734498023987, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001456156314816326, + "grad_norm": 4.8025431632995605, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8738813996315002, + "num_tokens": 141907140.0, + "step": 3719 + }, + { + "epoch": 0.47322223635669763, + "ewc_loss": 0.0385018028318882, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014576020475942641, + "grad_norm": 4.852083683013916, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8707759380340576, + "num_tokens": 141947436.0, + "step": 3720 + }, + { + "epoch": 0.4733494466352881, + "ewc_loss": 0.03849851340055466, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014572733198292553, + "grad_norm": 4.884087085723877, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.849261999130249, + "num_tokens": 141986876.0, + "step": 3721 + }, + { + "epoch": 0.47347665691387864, + "ewc_loss": 0.03847311809659004, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014547335740644485, + "grad_norm": 4.844509601593018, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8600916862487793, + "num_tokens": 142025806.0, + "step": 3722 + }, + { + "epoch": 0.47360386719246916, + "ewc_loss": 0.03844669461250305, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014520912372972816, + "grad_norm": 4.950627326965332, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8561244010925293, + "num_tokens": 142057932.0, + "step": 3723 + }, + { + "epoch": 0.47373107747105964, + "ewc_loss": 0.03850774094462395, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014581959112547338, + "grad_norm": 4.8974199295043945, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8602354526519775, + "num_tokens": 142088880.0, + "step": 3724 + }, + { + "epoch": 0.47385828774965016, + "ewc_loss": 0.03860088065266609, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014553028449881822, + "grad_norm": 4.8505120277404785, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8704696893692017, + "num_tokens": 142124959.0, + "step": 3725 + }, + { + "epoch": 0.4739854980282407, + "ewc_loss": 0.03861192241311073, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.0001456407189834863, + "grad_norm": 4.9220805168151855, + "learning_rate": 1e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8429019451141357, + "num_tokens": 142160667.0, + "step": 3726 + }, + { + "epoch": 0.47411270830683117, + "ewc_loss": 0.03851563483476639, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001458985498175025, + "grad_norm": 4.861931324005127, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8571561574935913, + "num_tokens": 142205017.0, + "step": 3727 + }, + { + "epoch": 0.4742399185854217, + "ewc_loss": 0.03845900669693947, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001453322620363906, + "grad_norm": 4.9120774269104, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8632401823997498, + "num_tokens": 142241479.0, + "step": 3728 + }, + { + "epoch": 0.4743671288640122, + "ewc_loss": 0.03863309696316719, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014585244935005903, + "grad_norm": 4.909548759460449, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8573364019393921, + "num_tokens": 142280881.0, + "step": 3729 + }, + { + "epoch": 0.4744943391426027, + "ewc_loss": 0.03843115270137787, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001450537092750892, + "grad_norm": 4.836949825286865, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8642117381095886, + "num_tokens": 142328248.0, + "step": 3730 + }, + { + "epoch": 0.4746215494211932, + "ewc_loss": 0.03844292834401131, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014517147792503238, + "grad_norm": 4.928370475769043, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8408700227737427, + "num_tokens": 142364872.0, + "step": 3731 + }, + { + "epoch": 0.47474875969978375, + "ewc_loss": 0.03847049921751022, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014544717851094902, + "grad_norm": 4.892477512359619, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8390985727310181, + "num_tokens": 142400494.0, + "step": 3732 + }, + { + "epoch": 0.4748759699783743, + "ewc_loss": 0.03841982036828995, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014494040806312114, + "grad_norm": 4.910579204559326, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8554192781448364, + "num_tokens": 142433739.0, + "step": 3733 + }, + { + "epoch": 0.47500318025696475, + "ewc_loss": 0.03845100477337837, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014525222650263458, + "grad_norm": 4.85783576965332, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8650598526000977, + "num_tokens": 142469777.0, + "step": 3734 + }, + { + "epoch": 0.4751303905355553, + "ewc_loss": 0.03843382000923157, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014508036838378757, + "grad_norm": 4.891045570373535, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8470427989959717, + "num_tokens": 142510052.0, + "step": 3735 + }, + { + "epoch": 0.4752576008141458, + "ewc_loss": 0.03845690190792084, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014531119086313993, + "grad_norm": 4.864171981811523, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8450536131858826, + "num_tokens": 142545689.0, + "step": 3736 + }, + { + "epoch": 0.4753848110927363, + "ewc_loss": 0.0384620800614357, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001453629956813529, + "grad_norm": 4.861556053161621, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8557766675949097, + "num_tokens": 142585916.0, + "step": 3737 + }, + { + "epoch": 0.4755120213713268, + "ewc_loss": 0.038628168404102325, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014580317656509578, + "grad_norm": 4.9326372146606445, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8643181324005127, + "num_tokens": 142623019.0, + "step": 3738 + }, + { + "epoch": 0.47563923164991734, + "ewc_loss": 0.03850112110376358, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.0001457533799111843, + "grad_norm": 4.844799518585205, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8535395860671997, + "num_tokens": 142660986.0, + "step": 3739 + }, + { + "epoch": 0.4757664419285078, + "ewc_loss": 0.03843924403190613, + "ewc_loss_diag": 2.396106719970703e-05, + "ewc_loss_parallel": 0.00014513463247567415, + "grad_norm": 4.868739604949951, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8599271774291992, + "num_tokens": 142702876.0, + "step": 3740 + }, + { + "epoch": 0.47589365220709834, + "ewc_loss": 0.038770608603954315, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014600685972254723, + "grad_norm": 4.857888698577881, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8448232412338257, + "num_tokens": 142744082.0, + "step": 3741 + }, + { + "epoch": 0.47602086248568887, + "ewc_loss": 0.03868740797042847, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014517483941745013, + "grad_norm": 4.89415168762207, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8383871912956238, + "num_tokens": 142777511.0, + "step": 3742 + }, + { + "epoch": 0.47614807276427934, + "ewc_loss": 0.03873688727617264, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014566963363904506, + "grad_norm": 4.8744730949401855, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8516182899475098, + "num_tokens": 142818882.0, + "step": 3743 + }, + { + "epoch": 0.47627528304286987, + "ewc_loss": 0.03870260342955589, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014532680506817997, + "grad_norm": 4.935489654541016, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8611867427825928, + "num_tokens": 142852771.0, + "step": 3744 + }, + { + "epoch": 0.4764024933214604, + "ewc_loss": 0.03874647617340088, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001457655307604, + "grad_norm": 4.84688138961792, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8571000099182129, + "num_tokens": 142888398.0, + "step": 3745 + }, + { + "epoch": 0.47652970360005087, + "ewc_loss": 0.0386723056435585, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014502381964121014, + "grad_norm": 4.83341646194458, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8720256090164185, + "num_tokens": 142924789.0, + "step": 3746 + }, + { + "epoch": 0.4766569138786414, + "ewc_loss": 0.03873631730675697, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014566394384019077, + "grad_norm": 4.9071946144104, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8638097047805786, + "num_tokens": 142958843.0, + "step": 3747 + }, + { + "epoch": 0.4767841241572319, + "ewc_loss": 0.03875085711479187, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014580937568098307, + "grad_norm": 4.903623580932617, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8678324818611145, + "num_tokens": 142996408.0, + "step": 3748 + }, + { + "epoch": 0.4769113344358224, + "ewc_loss": 0.0387430340051651, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014573113003280014, + "grad_norm": 4.823856830596924, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8489680290222168, + "num_tokens": 143040031.0, + "step": 3749 + }, + { + "epoch": 0.4770385447144129, + "ewc_loss": 0.03878742828965187, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001461750507587567, + "grad_norm": 4.900606632232666, + "learning_rate": 1e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8335096836090088, + "num_tokens": 143081648.0, + "step": 3750 + }, + { + "epoch": 0.47716575499300345, + "ewc_loss": 0.038815777748823166, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014645855117123574, + "grad_norm": 4.866534233093262, + "learning_rate": 1e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8349086046218872, + "num_tokens": 143118913.0, + "step": 3751 + }, + { + "epoch": 0.4772929652715939, + "ewc_loss": 0.03879714012145996, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014627219934482127, + "grad_norm": 4.906892776489258, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8653392195701599, + "num_tokens": 143153858.0, + "step": 3752 + }, + { + "epoch": 0.47742017555018446, + "ewc_loss": 0.03885290399193764, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014682981418445706, + "grad_norm": 4.916257858276367, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8675625324249268, + "num_tokens": 143190977.0, + "step": 3753 + }, + { + "epoch": 0.477547385828775, + "ewc_loss": 0.03882990777492523, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014659983571618795, + "grad_norm": 4.885456562042236, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8487177491188049, + "num_tokens": 143228009.0, + "step": 3754 + }, + { + "epoch": 0.47767459610736546, + "ewc_loss": 0.038825202733278275, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001465528184780851, + "grad_norm": 4.853305339813232, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8457016944885254, + "num_tokens": 143269820.0, + "step": 3755 + }, + { + "epoch": 0.477801806385956, + "ewc_loss": 0.03881428763270378, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001464436500100419, + "grad_norm": 4.922641754150391, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8603808879852295, + "num_tokens": 143303441.0, + "step": 3756 + }, + { + "epoch": 0.4779290166645465, + "ewc_loss": 0.03883717209100723, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014667252253275365, + "grad_norm": 4.899822235107422, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8492215275764465, + "num_tokens": 143338748.0, + "step": 3757 + }, + { + "epoch": 0.478056226943137, + "ewc_loss": 0.03881886973977089, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014648948854301125, + "grad_norm": 4.8775248527526855, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8570031523704529, + "num_tokens": 143373900.0, + "step": 3758 + }, + { + "epoch": 0.4781834372217275, + "ewc_loss": 0.03870806097984314, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014660209126304835, + "grad_norm": 4.915970802307129, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8509559631347656, + "num_tokens": 143411190.0, + "step": 3759 + }, + { + "epoch": 0.47831064750031804, + "ewc_loss": 0.03872234374284744, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014674491831101477, + "grad_norm": 4.89182710647583, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8527947664260864, + "num_tokens": 143449470.0, + "step": 3760 + }, + { + "epoch": 0.4784378577789085, + "ewc_loss": 0.038844477385282516, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014674555859528482, + "grad_norm": 4.90991735458374, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8625969886779785, + "num_tokens": 143485868.0, + "step": 3761 + }, + { + "epoch": 0.47856506805749904, + "ewc_loss": 0.0388057604432106, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001463583903387189, + "grad_norm": 4.924238204956055, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.864901065826416, + "num_tokens": 143520334.0, + "step": 3762 + }, + { + "epoch": 0.47869227833608957, + "ewc_loss": 0.038830362260341644, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014660439046565443, + "grad_norm": 4.850756645202637, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8497623205184937, + "num_tokens": 143557752.0, + "step": 3763 + }, + { + "epoch": 0.47881948861468004, + "ewc_loss": 0.03877270594239235, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001460278290323913, + "grad_norm": 4.927357196807861, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.856311559677124, + "num_tokens": 143593385.0, + "step": 3764 + }, + { + "epoch": 0.4789466988932706, + "ewc_loss": 0.038745127618312836, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014697277219966054, + "grad_norm": 4.867574214935303, + "learning_rate": 1e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8362650275230408, + "num_tokens": 143631975.0, + "step": 3765 + }, + { + "epoch": 0.4790739091718611, + "ewc_loss": 0.03879859670996666, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014628675126004964, + "grad_norm": 4.959922790527344, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8384832739830017, + "num_tokens": 143668382.0, + "step": 3766 + }, + { + "epoch": 0.4792011194504516, + "ewc_loss": 0.038870081305503845, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014700159954372793, + "grad_norm": 4.918344020843506, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8429867029190063, + "num_tokens": 143706623.0, + "step": 3767 + }, + { + "epoch": 0.4793283297290421, + "ewc_loss": 0.03889026120305061, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00014598268899135292, + "grad_norm": 4.850558757781982, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8610712885856628, + "num_tokens": 143745664.0, + "step": 3768 + }, + { + "epoch": 0.47945554000763263, + "ewc_loss": 0.03878828510642052, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014618363638874143, + "grad_norm": 5.343461990356445, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.846703052520752, + "num_tokens": 143780761.0, + "step": 3769 + }, + { + "epoch": 0.4795827502862231, + "ewc_loss": 0.03904060274362564, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014870679297018796, + "grad_norm": 4.7670979499816895, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8611807823181152, + "num_tokens": 143825298.0, + "step": 3770 + }, + { + "epoch": 0.47970996056481363, + "ewc_loss": 0.038667645305395126, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014497723896056414, + "grad_norm": 4.926849842071533, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8747942447662354, + "num_tokens": 143859855.0, + "step": 3771 + }, + { + "epoch": 0.47983717084340416, + "ewc_loss": 0.03907286003232002, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00014780867786612362, + "grad_norm": 4.891674041748047, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.869624137878418, + "num_tokens": 143898553.0, + "step": 3772 + }, + { + "epoch": 0.47996438112199463, + "ewc_loss": 0.038752973079681396, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001458305341657251, + "grad_norm": 4.892956733703613, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8539884090423584, + "num_tokens": 143935362.0, + "step": 3773 + }, + { + "epoch": 0.48009159140058516, + "ewc_loss": 0.03888659551739693, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014716673467773944, + "grad_norm": 4.888367176055908, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8568177223205566, + "num_tokens": 143978692.0, + "step": 3774 + }, + { + "epoch": 0.4802188016791757, + "ewc_loss": 0.03884554281830788, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014675619604531676, + "grad_norm": 4.827160835266113, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8635280132293701, + "num_tokens": 144020951.0, + "step": 3775 + }, + { + "epoch": 0.48034601195776616, + "ewc_loss": 0.038885779678821564, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014715857105329633, + "grad_norm": 4.91581916809082, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8649895191192627, + "num_tokens": 144058341.0, + "step": 3776 + }, + { + "epoch": 0.4804732222363567, + "ewc_loss": 0.03888518363237381, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001471526047680527, + "grad_norm": 4.9323649406433105, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8499437570571899, + "num_tokens": 144098232.0, + "step": 3777 + }, + { + "epoch": 0.4806004325149472, + "ewc_loss": 0.038847796618938446, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001467787369620055, + "grad_norm": 4.868683815002441, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8518216013908386, + "num_tokens": 144137537.0, + "step": 3778 + }, + { + "epoch": 0.4807276427935377, + "ewc_loss": 0.03898464888334274, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001469265844207257, + "grad_norm": 4.867013454437256, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8621687293052673, + "num_tokens": 144178459.0, + "step": 3779 + }, + { + "epoch": 0.4808548530721282, + "ewc_loss": 0.03886974975466728, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014699828170705587, + "grad_norm": 4.917744159698486, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8508234024047852, + "num_tokens": 144219257.0, + "step": 3780 + }, + { + "epoch": 0.48098206335071875, + "ewc_loss": 0.0388653427362442, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014695418940391392, + "grad_norm": 4.841121196746826, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8615899085998535, + "num_tokens": 144262232.0, + "step": 3781 + }, + { + "epoch": 0.4811092736293093, + "ewc_loss": 0.03883785754442215, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014667936193291098, + "grad_norm": 4.9619245529174805, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.83411705493927, + "num_tokens": 144297144.0, + "step": 3782 + }, + { + "epoch": 0.48123648390789975, + "ewc_loss": 0.038868460804224014, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014698538871016353, + "grad_norm": 4.806212425231934, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8571584224700928, + "num_tokens": 144336078.0, + "step": 3783 + }, + { + "epoch": 0.4813636941864903, + "ewc_loss": 0.03883352875709534, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014663605543319136, + "grad_norm": 4.8904194831848145, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8485895991325378, + "num_tokens": 144380162.0, + "step": 3784 + }, + { + "epoch": 0.4814909044650808, + "ewc_loss": 0.03888549283146858, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014715571887791157, + "grad_norm": 4.923427581787109, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8425285816192627, + "num_tokens": 144416735.0, + "step": 3785 + }, + { + "epoch": 0.4816181147436713, + "ewc_loss": 0.03887148201465607, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014701559848617762, + "grad_norm": 4.910103797912598, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8455261588096619, + "num_tokens": 144452780.0, + "step": 3786 + }, + { + "epoch": 0.4817453250222618, + "ewc_loss": 0.03884696215391159, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014677041326649487, + "grad_norm": 4.922731876373291, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8616867065429688, + "num_tokens": 144494155.0, + "step": 3787 + }, + { + "epoch": 0.48187253530085233, + "ewc_loss": 0.03881983831524849, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014649915101472288, + "grad_norm": 4.84370756149292, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8654448390007019, + "num_tokens": 144533989.0, + "step": 3788 + }, + { + "epoch": 0.4819997455794428, + "ewc_loss": 0.03877463936805725, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014604718307964504, + "grad_norm": 4.876986026763916, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8522343039512634, + "num_tokens": 144577409.0, + "step": 3789 + }, + { + "epoch": 0.48212695585803333, + "ewc_loss": 0.03885062783956528, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014680706954095513, + "grad_norm": 4.923197269439697, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8485144376754761, + "num_tokens": 144618501.0, + "step": 3790 + }, + { + "epoch": 0.48225416613662386, + "ewc_loss": 0.03881584107875824, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001464591914555058, + "grad_norm": 4.848592281341553, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8636366724967957, + "num_tokens": 144659454.0, + "step": 3791 + }, + { + "epoch": 0.48238137641521434, + "ewc_loss": 0.038788795471191406, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014618875866290182, + "grad_norm": 4.896555423736572, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8626339435577393, + "num_tokens": 144695580.0, + "step": 3792 + }, + { + "epoch": 0.48250858669380486, + "ewc_loss": 0.03884129226207733, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014671370445284992, + "grad_norm": 4.914090633392334, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8390640616416931, + "num_tokens": 144735586.0, + "step": 3793 + }, + { + "epoch": 0.4826357969723954, + "ewc_loss": 0.03884254768490791, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014672624820377678, + "grad_norm": 4.845788478851318, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8571230173110962, + "num_tokens": 144776371.0, + "step": 3794 + }, + { + "epoch": 0.48276300725098586, + "ewc_loss": 0.03886768966913223, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014697769074700773, + "grad_norm": 4.980401039123535, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8504965901374817, + "num_tokens": 144816724.0, + "step": 3795 + }, + { + "epoch": 0.4828902175295764, + "ewc_loss": 0.0389305017888546, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014760579506400973, + "grad_norm": 4.9027791023254395, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8618794083595276, + "num_tokens": 144854591.0, + "step": 3796 + }, + { + "epoch": 0.4830174278081669, + "ewc_loss": 0.03878328576683998, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.000146133650559932, + "grad_norm": 4.882019519805908, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8575267195701599, + "num_tokens": 144893694.0, + "step": 3797 + }, + { + "epoch": 0.4831446380867574, + "ewc_loss": 0.03885624557733536, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014686325448565185, + "grad_norm": 4.929953098297119, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8530537486076355, + "num_tokens": 144932985.0, + "step": 3798 + }, + { + "epoch": 0.4832718483653479, + "ewc_loss": 0.03882955014705658, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014659629960078746, + "grad_norm": 4.904418468475342, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8568265438079834, + "num_tokens": 144967823.0, + "step": 3799 + }, + { + "epoch": 0.48339905864393845, + "ewc_loss": 0.038853537291288376, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014683615881949663, + "grad_norm": 4.837968349456787, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8765733242034912, + "num_tokens": 145007750.0, + "step": 3800 + }, + { + "epoch": 0.4835262689225289, + "ewc_loss": 0.03885223716497421, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014682313485536724, + "grad_norm": 4.913517475128174, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8506550788879395, + "num_tokens": 145048596.0, + "step": 3801 + }, + { + "epoch": 0.48365347920111945, + "ewc_loss": 0.038934122771024704, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001476420002290979, + "grad_norm": 4.9712934494018555, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8458131551742554, + "num_tokens": 145080868.0, + "step": 3802 + }, + { + "epoch": 0.48378068947971, + "ewc_loss": 0.03885379433631897, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001468387054046616, + "grad_norm": 4.9174909591674805, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8653546571731567, + "num_tokens": 145115865.0, + "step": 3803 + }, + { + "epoch": 0.48390789975830045, + "ewc_loss": 0.03884562849998474, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014675705460831523, + "grad_norm": 4.877311706542969, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.868564248085022, + "num_tokens": 145151748.0, + "step": 3804 + }, + { + "epoch": 0.484035110036891, + "ewc_loss": 0.038834333419799805, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014664411719422787, + "grad_norm": 4.867997169494629, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8552378416061401, + "num_tokens": 145187074.0, + "step": 3805 + }, + { + "epoch": 0.4841623203154815, + "ewc_loss": 0.038840651512145996, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014670728705823421, + "grad_norm": 4.877817153930664, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8747323751449585, + "num_tokens": 145224855.0, + "step": 3806 + }, + { + "epoch": 0.484289530594072, + "ewc_loss": 0.038873422890901566, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014703501074109226, + "grad_norm": 4.868473529815674, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8655813336372375, + "num_tokens": 145266538.0, + "step": 3807 + }, + { + "epoch": 0.4844167408726625, + "ewc_loss": 0.03886399418115616, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001469407434342429, + "grad_norm": 5.054356575012207, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8624266982078552, + "num_tokens": 145303560.0, + "step": 3808 + }, + { + "epoch": 0.48454395115125304, + "ewc_loss": 0.03894234448671341, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014772424765396863, + "grad_norm": 4.880270004272461, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8578786253929138, + "num_tokens": 145345985.0, + "step": 3809 + }, + { + "epoch": 0.4846711614298435, + "ewc_loss": 0.03883016109466553, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014660241140518337, + "grad_norm": 4.948724269866943, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8459442853927612, + "num_tokens": 145382132.0, + "step": 3810 + }, + { + "epoch": 0.48479837170843404, + "ewc_loss": 0.03894723579287529, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014777314208913594, + "grad_norm": 4.92579460144043, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.847149133682251, + "num_tokens": 145418992.0, + "step": 3811 + }, + { + "epoch": 0.48492558198702457, + "ewc_loss": 0.03883831575512886, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014668393123429269, + "grad_norm": 4.948277950286865, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8559010028839111, + "num_tokens": 145455892.0, + "step": 3812 + }, + { + "epoch": 0.48505279226561504, + "ewc_loss": 0.03894122689962387, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001477130426792428, + "grad_norm": 4.915505409240723, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.854282021522522, + "num_tokens": 145497217.0, + "step": 3813 + }, + { + "epoch": 0.48518000254420557, + "ewc_loss": 0.038854386657476425, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014684464258607477, + "grad_norm": 4.874146938323975, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8632402420043945, + "num_tokens": 145533822.0, + "step": 3814 + }, + { + "epoch": 0.4853072128227961, + "ewc_loss": 0.038891613483428955, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014721690968144685, + "grad_norm": 4.875735282897949, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.876188337802887, + "num_tokens": 145572967.0, + "step": 3815 + }, + { + "epoch": 0.48543442310138657, + "ewc_loss": 0.038898494094610214, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014728571113664657, + "grad_norm": 4.883853912353516, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.870945930480957, + "num_tokens": 145609436.0, + "step": 3816 + }, + { + "epoch": 0.4855616333799771, + "ewc_loss": 0.03890395537018776, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014734032447449863, + "grad_norm": 4.9261016845703125, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8686143159866333, + "num_tokens": 145645083.0, + "step": 3817 + }, + { + "epoch": 0.4856888436585676, + "ewc_loss": 0.038898225873708725, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014728303358424455, + "grad_norm": 4.9561872482299805, + "learning_rate": 1e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.8389551043510437, + "num_tokens": 145680249.0, + "step": 3818 + }, + { + "epoch": 0.4858160539371581, + "ewc_loss": 0.03903225436806679, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00014740262122359127, + "grad_norm": 4.892472267150879, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8625282645225525, + "num_tokens": 145718569.0, + "step": 3819 + }, + { + "epoch": 0.4859432642157486, + "ewc_loss": 0.03887627646327019, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014706353249493986, + "grad_norm": 4.955776214599609, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8416839838027954, + "num_tokens": 145751868.0, + "step": 3820 + }, + { + "epoch": 0.48607047449433916, + "ewc_loss": 0.038916196674108505, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014746274973731488, + "grad_norm": 4.895814418792725, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8617086410522461, + "num_tokens": 145787881.0, + "step": 3821 + }, + { + "epoch": 0.48619768477292963, + "ewc_loss": 0.0389128252863884, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014742901839781553, + "grad_norm": 4.911758899688721, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8630272150039673, + "num_tokens": 145824010.0, + "step": 3822 + }, + { + "epoch": 0.48632489505152016, + "ewc_loss": 0.03893977776169777, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014769854897167534, + "grad_norm": 4.880654811859131, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8639707565307617, + "num_tokens": 145860981.0, + "step": 3823 + }, + { + "epoch": 0.4864521053301107, + "ewc_loss": 0.0388944037258625, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014724480570293963, + "grad_norm": 4.903532981872559, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8615831136703491, + "num_tokens": 145900687.0, + "step": 3824 + }, + { + "epoch": 0.48657931560870116, + "ewc_loss": 0.03896588459610939, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014795962488278747, + "grad_norm": 4.932367324829102, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.847658634185791, + "num_tokens": 145938888.0, + "step": 3825 + }, + { + "epoch": 0.4867065258872917, + "ewc_loss": 0.038919273763895035, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014749351248610765, + "grad_norm": 4.837735652923584, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8532043695449829, + "num_tokens": 145987060.0, + "step": 3826 + }, + { + "epoch": 0.4868337361658822, + "ewc_loss": 0.0389728918671608, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014802969235461205, + "grad_norm": 4.961212158203125, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8520050644874573, + "num_tokens": 146026447.0, + "step": 3827 + }, + { + "epoch": 0.4869609464444727, + "ewc_loss": 0.03898119181394577, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014811269647907466, + "grad_norm": 4.845722198486328, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8597049117088318, + "num_tokens": 146067099.0, + "step": 3828 + }, + { + "epoch": 0.4870881567230632, + "ewc_loss": 0.03896670788526535, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014796786126680672, + "grad_norm": 4.913503170013428, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8444997668266296, + "num_tokens": 146110767.0, + "step": 3829 + }, + { + "epoch": 0.48721536700165374, + "ewc_loss": 0.03897816687822342, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014808242849539965, + "grad_norm": 4.927469730377197, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8490574359893799, + "num_tokens": 146147603.0, + "step": 3830 + }, + { + "epoch": 0.48734257728024427, + "ewc_loss": 0.038949958980083466, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.000147800354170613, + "grad_norm": 6.576099872589111, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8677407503128052, + "num_tokens": 146188821.0, + "step": 3831 + }, + { + "epoch": 0.48746978755883474, + "ewc_loss": 0.040292851626873016, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00016122929810080677, + "grad_norm": 5.002132892608643, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8540641665458679, + "num_tokens": 146231886.0, + "step": 3832 + }, + { + "epoch": 0.48759699783742527, + "ewc_loss": 0.03847230598330498, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014302383351605386, + "grad_norm": 4.947723388671875, + "learning_rate": 1e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8336154818534851, + "num_tokens": 146271965.0, + "step": 3833 + }, + { + "epoch": 0.4877242081160158, + "ewc_loss": 0.0391959547996521, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015026031178422272, + "grad_norm": 4.988358020782471, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8441451191902161, + "num_tokens": 146316686.0, + "step": 3834 + }, + { + "epoch": 0.4878514183946063, + "ewc_loss": 0.03886638581752777, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001469646522309631, + "grad_norm": 4.924571514129639, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8466443419456482, + "num_tokens": 146355098.0, + "step": 3835 + }, + { + "epoch": 0.4879786286731968, + "ewc_loss": 0.03888508304953575, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014715160068590194, + "grad_norm": 4.972362041473389, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8622068166732788, + "num_tokens": 146390666.0, + "step": 3836 + }, + { + "epoch": 0.48810583895178733, + "ewc_loss": 0.038924265652894974, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014754344010725617, + "grad_norm": 4.915757179260254, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.863939642906189, + "num_tokens": 146426740.0, + "step": 3837 + }, + { + "epoch": 0.4882330492303778, + "ewc_loss": 0.03877058997750282, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014722738706041127, + "grad_norm": 4.964740753173828, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.847466766834259, + "num_tokens": 146462668.0, + "step": 3838 + }, + { + "epoch": 0.48836025950896833, + "ewc_loss": 0.038836922496557236, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014789070701226592, + "grad_norm": 4.9559783935546875, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8493889570236206, + "num_tokens": 146500160.0, + "step": 3839 + }, + { + "epoch": 0.48848746978755886, + "ewc_loss": 0.038767363876104355, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014719512546434999, + "grad_norm": 4.9044413566589355, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8518143892288208, + "num_tokens": 146542963.0, + "step": 3840 + }, + { + "epoch": 0.48861468006614933, + "ewc_loss": 0.03880491852760315, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014757065218873322, + "grad_norm": 4.9470109939575195, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8624507188796997, + "num_tokens": 146580649.0, + "step": 3841 + }, + { + "epoch": 0.48874189034473986, + "ewc_loss": 0.03892011195421219, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001475018943892792, + "grad_norm": 6.566589832305908, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8638231754302979, + "num_tokens": 146620823.0, + "step": 3842 + }, + { + "epoch": 0.4888691006233304, + "ewc_loss": 0.040145620703697205, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015975700807757676, + "grad_norm": 4.978963375091553, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8675775527954102, + "num_tokens": 146663700.0, + "step": 3843 + }, + { + "epoch": 0.48899631090192086, + "ewc_loss": 0.038328126072883606, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014280273171607405, + "grad_norm": 4.9696269035339355, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8519169092178345, + "num_tokens": 146698211.0, + "step": 3844 + }, + { + "epoch": 0.4891235211805114, + "ewc_loss": 0.03897504508495331, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014927194570191205, + "grad_norm": 5.096059799194336, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.855175256729126, + "num_tokens": 146738380.0, + "step": 3845 + }, + { + "epoch": 0.4892507314591019, + "ewc_loss": 0.038728080689907074, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014680229651276022, + "grad_norm": 4.952956199645996, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8592054843902588, + "num_tokens": 146775413.0, + "step": 3846 + }, + { + "epoch": 0.4893779417376924, + "ewc_loss": 0.03867102414369583, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014623173046857119, + "grad_norm": 4.9175920486450195, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.858968198299408, + "num_tokens": 146814774.0, + "step": 3847 + }, + { + "epoch": 0.4895051520162829, + "ewc_loss": 0.038755595684051514, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014707744412589818, + "grad_norm": 4.916170597076416, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8671834468841553, + "num_tokens": 146853957.0, + "step": 3848 + }, + { + "epoch": 0.48963236229487345, + "ewc_loss": 0.038737956434488297, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014690104580949992, + "grad_norm": 5.003209114074707, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8483453392982483, + "num_tokens": 146889060.0, + "step": 3849 + }, + { + "epoch": 0.4897595725734639, + "ewc_loss": 0.03879055753350258, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014742706844117492, + "grad_norm": 4.869640827178955, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8669060468673706, + "num_tokens": 146929849.0, + "step": 3850 + }, + { + "epoch": 0.48988678285205445, + "ewc_loss": 0.03879659250378609, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014748741523362696, + "grad_norm": 4.971161365509033, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8584097027778625, + "num_tokens": 146968376.0, + "step": 3851 + }, + { + "epoch": 0.490013993130645, + "ewc_loss": 0.038847655057907104, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014799804193899035, + "grad_norm": 4.955628395080566, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8407755494117737, + "num_tokens": 147002871.0, + "step": 3852 + }, + { + "epoch": 0.49014120340923545, + "ewc_loss": 0.0389426164329052, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001477269543102011, + "grad_norm": 4.943394660949707, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8531358242034912, + "num_tokens": 147039412.0, + "step": 3853 + }, + { + "epoch": 0.490268413687826, + "ewc_loss": 0.0389723926782608, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014802470104768872, + "grad_norm": 4.873876571655273, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8606616258621216, + "num_tokens": 147080307.0, + "step": 3854 + }, + { + "epoch": 0.4903956239664165, + "ewc_loss": 0.0391041561961174, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00014812166045885533, + "grad_norm": 6.088198184967041, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8639328479766846, + "num_tokens": 147120131.0, + "step": 3855 + }, + { + "epoch": 0.490522834245007, + "ewc_loss": 0.03981657698750496, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001564665581099689, + "grad_norm": 4.993508338928223, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8506409525871277, + "num_tokens": 147155760.0, + "step": 3856 + }, + { + "epoch": 0.4906500445235975, + "ewc_loss": 0.0386761911213398, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014506270235870034, + "grad_norm": 4.902724266052246, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.861642062664032, + "num_tokens": 147195406.0, + "step": 3857 + }, + { + "epoch": 0.49077725480218803, + "ewc_loss": 0.03912036865949631, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014950448530726135, + "grad_norm": 4.9580278396606445, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8698751330375671, + "num_tokens": 147235688.0, + "step": 3858 + }, + { + "epoch": 0.4909044650807785, + "ewc_loss": 0.0389786958694458, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014808775449637324, + "grad_norm": 4.9526214599609375, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8647674322128296, + "num_tokens": 147271575.0, + "step": 3859 + }, + { + "epoch": 0.49103167535936904, + "ewc_loss": 0.039023078978061676, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001485315733589232, + "grad_norm": 4.872398853302002, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8534165024757385, + "num_tokens": 147315473.0, + "step": 3860 + }, + { + "epoch": 0.49115888563795956, + "ewc_loss": 0.03900034725666046, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014830425789114088, + "grad_norm": 4.917003154754639, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8610593676567078, + "num_tokens": 147359058.0, + "step": 3861 + }, + { + "epoch": 0.49128609591655004, + "ewc_loss": 0.039091549813747406, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014921627007424831, + "grad_norm": 4.9861578941345215, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8619735836982727, + "num_tokens": 147394443.0, + "step": 3862 + }, + { + "epoch": 0.49141330619514056, + "ewc_loss": 0.03904832899570465, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014878404908813536, + "grad_norm": 4.881914138793945, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8521156311035156, + "num_tokens": 147436225.0, + "step": 3863 + }, + { + "epoch": 0.4915405164737311, + "ewc_loss": 0.039053671061992645, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001488374691689387, + "grad_norm": 4.964796543121338, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.866236686706543, + "num_tokens": 147473449.0, + "step": 3864 + }, + { + "epoch": 0.49166772675232157, + "ewc_loss": 0.03906909003853798, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014899169036652893, + "grad_norm": 4.938346862792969, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8551907539367676, + "num_tokens": 147510330.0, + "step": 3865 + }, + { + "epoch": 0.4917949370309121, + "ewc_loss": 0.03895494341850281, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014907094009686261, + "grad_norm": 4.989339351654053, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8694687485694885, + "num_tokens": 147544683.0, + "step": 3866 + }, + { + "epoch": 0.4919221473095026, + "ewc_loss": 0.03893962502479553, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014891772298142314, + "grad_norm": 4.8828020095825195, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8750362396240234, + "num_tokens": 147589330.0, + "step": 3867 + }, + { + "epoch": 0.4920493575880931, + "ewc_loss": 0.038902368396520615, + "ewc_loss_diag": 2.4080276489257812e-05, + "ewc_loss_parallel": 0.00014854517939966172, + "grad_norm": 4.984978199005127, + "learning_rate": 1e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8258808255195618, + "num_tokens": 147626864.0, + "step": 3868 + }, + { + "epoch": 0.4921765678666836, + "ewc_loss": 0.039110369980335236, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014940446999389678, + "grad_norm": 4.961553573608398, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8392404317855835, + "num_tokens": 147672006.0, + "step": 3869 + }, + { + "epoch": 0.49230377814527415, + "ewc_loss": 0.03902287036180496, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014852949243504554, + "grad_norm": 4.88584566116333, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8477089405059814, + "num_tokens": 147717388.0, + "step": 3870 + }, + { + "epoch": 0.4924309884238646, + "ewc_loss": 0.03908052295446396, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001491060247644782, + "grad_norm": 4.960875034332275, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8458026051521301, + "num_tokens": 147759827.0, + "step": 3871 + }, + { + "epoch": 0.49255819870245515, + "ewc_loss": 0.03909062594175339, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001492070296080783, + "grad_norm": 5.02761697769165, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8377135396003723, + "num_tokens": 147793742.0, + "step": 3872 + }, + { + "epoch": 0.4926854089810457, + "ewc_loss": 0.03911309689283371, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014943174028303474, + "grad_norm": 4.947309494018555, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.867240846157074, + "num_tokens": 147829881.0, + "step": 3873 + }, + { + "epoch": 0.49281261925963615, + "ewc_loss": 0.039045944809913635, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001487602130509913, + "grad_norm": 4.95416259765625, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8419222831726074, + "num_tokens": 147875051.0, + "step": 3874 + }, + { + "epoch": 0.4929398295382267, + "ewc_loss": 0.039063964039087296, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001489404239691794, + "grad_norm": 4.946969032287598, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8496556878089905, + "num_tokens": 147912786.0, + "step": 3875 + }, + { + "epoch": 0.4930670398168172, + "ewc_loss": 0.0390634648501873, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014893541811034083, + "grad_norm": 5.000242710113525, + "learning_rate": 1e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8361554145812988, + "num_tokens": 147954981.0, + "step": 3876 + }, + { + "epoch": 0.4931942500954077, + "ewc_loss": 0.039086103439331055, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.000149161831359379, + "grad_norm": 4.96238374710083, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8497727513313293, + "num_tokens": 147992179.0, + "step": 3877 + }, + { + "epoch": 0.4933214603739982, + "ewc_loss": 0.039113372564315796, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001494344905950129, + "grad_norm": 4.947878837585449, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8615508675575256, + "num_tokens": 148029866.0, + "step": 3878 + }, + { + "epoch": 0.49344867065258874, + "ewc_loss": 0.03911827877163887, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014948357420507818, + "grad_norm": 4.971359729766846, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8563289642333984, + "num_tokens": 148070057.0, + "step": 3879 + }, + { + "epoch": 0.4935758809311792, + "ewc_loss": 0.0391383059322834, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001496838522143662, + "grad_norm": 4.959507942199707, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8614859580993652, + "num_tokens": 148108461.0, + "step": 3880 + }, + { + "epoch": 0.49370309120976974, + "ewc_loss": 0.03913357853889465, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014963657304178923, + "grad_norm": 4.946068286895752, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8499854207038879, + "num_tokens": 148147609.0, + "step": 3881 + }, + { + "epoch": 0.49383030148836027, + "ewc_loss": 0.03913813456892967, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014968213508836925, + "grad_norm": 4.936351776123047, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8515953421592712, + "num_tokens": 148190475.0, + "step": 3882 + }, + { + "epoch": 0.4939575117669508, + "ewc_loss": 0.03914234787225723, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014972426288295537, + "grad_norm": 5.006076335906982, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8689566254615784, + "num_tokens": 148221187.0, + "step": 3883 + }, + { + "epoch": 0.49408472204554127, + "ewc_loss": 0.03920967876911163, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001503975800005719, + "grad_norm": 4.940131664276123, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8652843236923218, + "num_tokens": 148260916.0, + "step": 3884 + }, + { + "epoch": 0.4942119323241318, + "ewc_loss": 0.03916262462735176, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001499270147178322, + "grad_norm": 5.027210235595703, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.863015353679657, + "num_tokens": 148295591.0, + "step": 3885 + }, + { + "epoch": 0.4943391426027223, + "ewc_loss": 0.039298735558986664, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015006745525170118, + "grad_norm": 4.967231750488281, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8509056568145752, + "num_tokens": 148333445.0, + "step": 3886 + }, + { + "epoch": 0.4944663528813128, + "ewc_loss": 0.03915330022573471, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001498337951488793, + "grad_norm": 4.996988296508789, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8650773763656616, + "num_tokens": 148367951.0, + "step": 3887 + }, + { + "epoch": 0.4945935631599033, + "ewc_loss": 0.03917597606778145, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015006052854005247, + "grad_norm": 4.957164764404297, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8486216068267822, + "num_tokens": 148408491.0, + "step": 3888 + }, + { + "epoch": 0.49472077343849385, + "ewc_loss": 0.039171162992715836, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015001240535639226, + "grad_norm": 5.032464504241943, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8566077947616577, + "num_tokens": 148443883.0, + "step": 3889 + }, + { + "epoch": 0.4948479837170843, + "ewc_loss": 0.0392022430896759, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001503231906099245, + "grad_norm": 5.023646354675293, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8589867353439331, + "num_tokens": 148480511.0, + "step": 3890 + }, + { + "epoch": 0.49497519399567486, + "ewc_loss": 0.03914483264088631, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001497491030022502, + "grad_norm": 4.964334487915039, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8506381511688232, + "num_tokens": 148514759.0, + "step": 3891 + }, + { + "epoch": 0.4951024042742654, + "ewc_loss": 0.039196062833070755, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015026141772978008, + "grad_norm": 4.998199939727783, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.858705997467041, + "num_tokens": 148553209.0, + "step": 3892 + }, + { + "epoch": 0.49522961455285586, + "ewc_loss": 0.03917192667722702, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015002005966380239, + "grad_norm": 4.939537048339844, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8482879400253296, + "num_tokens": 148590387.0, + "step": 3893 + }, + { + "epoch": 0.4953568248314464, + "ewc_loss": 0.039142854511737823, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00014972932694945484, + "grad_norm": 4.99746561050415, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8541513085365295, + "num_tokens": 148625201.0, + "step": 3894 + }, + { + "epoch": 0.4954840351100369, + "ewc_loss": 0.039222005754709244, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015052083472255617, + "grad_norm": 5.013847351074219, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8595742583274841, + "num_tokens": 148665989.0, + "step": 3895 + }, + { + "epoch": 0.4956112453886274, + "ewc_loss": 0.039312057197093964, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001502006343798712, + "grad_norm": 4.972004413604736, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8596275448799133, + "num_tokens": 148704839.0, + "step": 3896 + }, + { + "epoch": 0.4957384556672179, + "ewc_loss": 0.03931203857064247, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015020045975688845, + "grad_norm": 4.953073501586914, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8668917417526245, + "num_tokens": 148740523.0, + "step": 3897 + }, + { + "epoch": 0.49586566594580844, + "ewc_loss": 0.039332516491413116, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001504052197560668, + "grad_norm": 4.967543601989746, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8713909387588501, + "num_tokens": 148779103.0, + "step": 3898 + }, + { + "epoch": 0.4959928762243989, + "ewc_loss": 0.03930762782692909, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015015635290183127, + "grad_norm": 5.013731956481934, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8676531910896301, + "num_tokens": 148814659.0, + "step": 3899 + }, + { + "epoch": 0.49612008650298944, + "ewc_loss": 0.03933970630168915, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001504771353211254, + "grad_norm": 5.027218818664551, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8772537112236023, + "num_tokens": 148856864.0, + "step": 3900 + }, + { + "epoch": 0.49624729678157997, + "ewc_loss": 0.03931156173348427, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015019568672869354, + "grad_norm": 4.982359409332275, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8636068105697632, + "num_tokens": 148890309.0, + "step": 3901 + }, + { + "epoch": 0.49637450706017044, + "ewc_loss": 0.03924758359789848, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00014955592632759362, + "grad_norm": 4.949827194213867, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.842349648475647, + "num_tokens": 148926662.0, + "step": 3902 + }, + { + "epoch": 0.496501717338761, + "ewc_loss": 0.039315320551395416, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015023328887764364, + "grad_norm": 4.9749016761779785, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.844165563583374, + "num_tokens": 148963032.0, + "step": 3903 + }, + { + "epoch": 0.4966289276173515, + "ewc_loss": 0.039337001740932465, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001504501124145463, + "grad_norm": 5.02259635925293, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.863227367401123, + "num_tokens": 148996684.0, + "step": 3904 + }, + { + "epoch": 0.496756137895942, + "ewc_loss": 0.03932838886976242, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015036395052447915, + "grad_norm": 4.931013584136963, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8568245768547058, + "num_tokens": 149032735.0, + "step": 3905 + }, + { + "epoch": 0.4968833481745325, + "ewc_loss": 0.03930993750691414, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001501794467912987, + "grad_norm": 4.959293365478516, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8588657379150391, + "num_tokens": 149069415.0, + "step": 3906 + }, + { + "epoch": 0.49701055845312303, + "ewc_loss": 0.03936566412448883, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015073672693688422, + "grad_norm": 4.933187484741211, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8431163430213928, + "num_tokens": 149115042.0, + "step": 3907 + }, + { + "epoch": 0.4971377687317135, + "ewc_loss": 0.03933176398277283, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015039769641589373, + "grad_norm": 5.034974575042725, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8660778403282166, + "num_tokens": 149149767.0, + "step": 3908 + }, + { + "epoch": 0.49726497901030403, + "ewc_loss": 0.03946956247091293, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015055498806759715, + "grad_norm": 4.919536113739014, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8506511449813843, + "num_tokens": 149193000.0, + "step": 3909 + }, + { + "epoch": 0.49739218928889456, + "ewc_loss": 0.03929027169942856, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001499827776569873, + "grad_norm": 4.959747791290283, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8546496033668518, + "num_tokens": 149226963.0, + "step": 3910 + }, + { + "epoch": 0.49751939956748503, + "ewc_loss": 0.0393666997551918, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001507470733486116, + "grad_norm": 4.948400020599365, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.852641224861145, + "num_tokens": 149268139.0, + "step": 3911 + }, + { + "epoch": 0.49764660984607556, + "ewc_loss": 0.03934735804796219, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015055366384331137, + "grad_norm": 5.032769203186035, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8640503287315369, + "num_tokens": 149305882.0, + "step": 3912 + }, + { + "epoch": 0.4977738201246661, + "ewc_loss": 0.0393468514084816, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015054857067298144, + "grad_norm": 4.9470744132995605, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8532828092575073, + "num_tokens": 149344119.0, + "step": 3913 + }, + { + "epoch": 0.49790103040325656, + "ewc_loss": 0.03941432014107704, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.0001500025682616979, + "grad_norm": 4.951924800872803, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.848152756690979, + "num_tokens": 149384102.0, + "step": 3914 + }, + { + "epoch": 0.4980282406818471, + "ewc_loss": 0.03930896520614624, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001501697115600109, + "grad_norm": 4.915297985076904, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8536919951438904, + "num_tokens": 149425677.0, + "step": 3915 + }, + { + "epoch": 0.4981554509604376, + "ewc_loss": 0.03934628516435623, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015054293908178806, + "grad_norm": 5.013503074645996, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8529238700866699, + "num_tokens": 149463275.0, + "step": 3916 + }, + { + "epoch": 0.4982826612390281, + "ewc_loss": 0.039330948144197464, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015038954734336585, + "grad_norm": 4.889771461486816, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.871751070022583, + "num_tokens": 149503950.0, + "step": 3917 + }, + { + "epoch": 0.4984098715176186, + "ewc_loss": 0.039327919483184814, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015035927935969085, + "grad_norm": 4.974922180175781, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8490409851074219, + "num_tokens": 149541192.0, + "step": 3918 + }, + { + "epoch": 0.49853708179620915, + "ewc_loss": 0.03938639909029007, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015094407717697322, + "grad_norm": 4.8896660804748535, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8541589379310608, + "num_tokens": 149581784.0, + "step": 3919 + }, + { + "epoch": 0.4986642920747996, + "ewc_loss": 0.03932454064488411, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015032548981253058, + "grad_norm": 4.971618175506592, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8618398904800415, + "num_tokens": 149619312.0, + "step": 3920 + }, + { + "epoch": 0.49879150235339015, + "ewc_loss": 0.03940728306770325, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015115291171241552, + "grad_norm": 4.9497480392456055, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8536806702613831, + "num_tokens": 149657443.0, + "step": 3921 + }, + { + "epoch": 0.4989187126319807, + "ewc_loss": 0.03935004025697708, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015058046847116202, + "grad_norm": 4.979781150817871, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8690903782844543, + "num_tokens": 149694992.0, + "step": 3922 + }, + { + "epoch": 0.49904592291057115, + "ewc_loss": 0.03936762362718582, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015075632836669683, + "grad_norm": 5.030912399291992, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8527011871337891, + "num_tokens": 149729554.0, + "step": 3923 + }, + { + "epoch": 0.4991731331891617, + "ewc_loss": 0.039489567279815674, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015075504779815674, + "grad_norm": 4.93665885925293, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.862744927406311, + "num_tokens": 149770051.0, + "step": 3924 + }, + { + "epoch": 0.4993003434677522, + "ewc_loss": 0.039425671100616455, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015011607320047915, + "grad_norm": 4.982684135437012, + "learning_rate": 1e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8259543180465698, + "num_tokens": 149806724.0, + "step": 3925 + }, + { + "epoch": 0.4994275537463427, + "ewc_loss": 0.039386436343193054, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015094444097485393, + "grad_norm": 4.918379306793213, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8489580750465393, + "num_tokens": 149849874.0, + "step": 3926 + }, + { + "epoch": 0.4995547640249332, + "ewc_loss": 0.03931530565023422, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015023312880657613, + "grad_norm": 4.920563220977783, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8566089272499084, + "num_tokens": 149888076.0, + "step": 3927 + }, + { + "epoch": 0.49968197430352373, + "ewc_loss": 0.0394107885658741, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015118795272428542, + "grad_norm": 4.960864543914795, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8570274114608765, + "num_tokens": 149923750.0, + "step": 3928 + }, + { + "epoch": 0.4998091845821142, + "ewc_loss": 0.03938232734799385, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015090334636624902, + "grad_norm": 4.987866401672363, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8447891473770142, + "num_tokens": 149960657.0, + "step": 3929 + }, + { + "epoch": 0.49993639486070474, + "ewc_loss": 0.03942358121275902, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.000151315878611058, + "grad_norm": 4.958763599395752, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8718642592430115, + "num_tokens": 149995983.0, + "step": 3930 + }, + { + "epoch": 0.5000636051392953, + "ewc_loss": 0.039385318756103516, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015093327965587378, + "grad_norm": 4.916769981384277, + "learning_rate": 1e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8357940316200256, + "num_tokens": 150037508.0, + "step": 3931 + }, + { + "epoch": 0.5001908154178858, + "ewc_loss": 0.039393335580825806, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015101343160495162, + "grad_norm": 4.951231002807617, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8521028161048889, + "num_tokens": 150077806.0, + "step": 3932 + }, + { + "epoch": 0.5003180256964763, + "ewc_loss": 0.039404015988111496, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015112024266272783, + "grad_norm": 4.955969333648682, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.854617178440094, + "num_tokens": 150117589.0, + "step": 3933 + }, + { + "epoch": 0.5004452359750667, + "ewc_loss": 0.039360590279102325, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015068599896039814, + "grad_norm": 4.8742852210998535, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8498848676681519, + "num_tokens": 150161579.0, + "step": 3934 + }, + { + "epoch": 0.5005724462536573, + "ewc_loss": 0.03952226787805557, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015108207298908383, + "grad_norm": 4.984164714813232, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8433477878570557, + "num_tokens": 150208018.0, + "step": 3935 + }, + { + "epoch": 0.5006996565322478, + "ewc_loss": 0.03953287750482559, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.0001511881418991834, + "grad_norm": 4.889009952545166, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8568785190582275, + "num_tokens": 150250164.0, + "step": 3936 + }, + { + "epoch": 0.5008268668108383, + "ewc_loss": 0.03940152749419212, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015109535888768733, + "grad_norm": 4.901693344116211, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8754907250404358, + "num_tokens": 150289972.0, + "step": 3937 + }, + { + "epoch": 0.5009540770894289, + "ewc_loss": 0.03931821510195732, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015148293459787965, + "grad_norm": 4.924627780914307, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.854540228843689, + "num_tokens": 150333656.0, + "step": 3938 + }, + { + "epoch": 0.5010812873680194, + "ewc_loss": 0.039471548050642014, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015179555339273065, + "grad_norm": 4.924286842346191, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8700773119926453, + "num_tokens": 150374086.0, + "step": 3939 + }, + { + "epoch": 0.5012084976466098, + "ewc_loss": 0.039499714970588684, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015085651830304414, + "grad_norm": 4.966697692871094, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8610039353370667, + "num_tokens": 150407472.0, + "step": 3940 + }, + { + "epoch": 0.5013357079252003, + "ewc_loss": 0.03944418951869011, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015152197738643736, + "grad_norm": 4.967555999755859, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8619969487190247, + "num_tokens": 150444560.0, + "step": 3941 + }, + { + "epoch": 0.5014629182037909, + "ewc_loss": 0.03926878795027733, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001509886496933177, + "grad_norm": 4.97149133682251, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.861011266708374, + "num_tokens": 150483621.0, + "step": 3942 + }, + { + "epoch": 0.5015901284823814, + "ewc_loss": 0.03924456983804703, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015074647672008723, + "grad_norm": 4.94354248046875, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8568848967552185, + "num_tokens": 150520414.0, + "step": 3943 + }, + { + "epoch": 0.5017173387609719, + "ewc_loss": 0.039278559386730194, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015108639490790665, + "grad_norm": 4.971264839172363, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8486452698707581, + "num_tokens": 150556589.0, + "step": 3944 + }, + { + "epoch": 0.5018445490395624, + "ewc_loss": 0.0392504520714283, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.0001508052955614403, + "grad_norm": 4.9325971603393555, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.854458212852478, + "num_tokens": 150592393.0, + "step": 3945 + }, + { + "epoch": 0.5019717593181529, + "ewc_loss": 0.0394098162651062, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015117826114874333, + "grad_norm": 4.957830429077148, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8434115052223206, + "num_tokens": 150630457.0, + "step": 3946 + }, + { + "epoch": 0.5020989695967434, + "ewc_loss": 0.039573200047016144, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.0001515913609182462, + "grad_norm": 5.008955001831055, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8639549612998962, + "num_tokens": 150659482.0, + "step": 3947 + }, + { + "epoch": 0.5022261798753339, + "ewc_loss": 0.03943655639886856, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015144563803914934, + "grad_norm": 4.953297138214111, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8563200831413269, + "num_tokens": 150700920.0, + "step": 3948 + }, + { + "epoch": 0.5023533901539244, + "ewc_loss": 0.03943122178316116, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015139227616600692, + "grad_norm": 4.956469535827637, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.867024302482605, + "num_tokens": 150736393.0, + "step": 3949 + }, + { + "epoch": 0.502480600432515, + "ewc_loss": 0.039574407041072845, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015160342445597053, + "grad_norm": 4.934463024139404, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8532940149307251, + "num_tokens": 150783848.0, + "step": 3950 + }, + { + "epoch": 0.5026078107111055, + "ewc_loss": 0.039462216198444366, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015170224651228637, + "grad_norm": 4.993369102478027, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.853206992149353, + "num_tokens": 150824179.0, + "step": 3951 + }, + { + "epoch": 0.5027350209896959, + "ewc_loss": 0.03942864388227463, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001513665192760527, + "grad_norm": 4.941128253936768, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.85684734582901, + "num_tokens": 150862775.0, + "step": 3952 + }, + { + "epoch": 0.5028622312682864, + "ewc_loss": 0.03957191854715347, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015157855523284525, + "grad_norm": 4.938687324523926, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8555498719215393, + "num_tokens": 150900005.0, + "step": 3953 + }, + { + "epoch": 0.502989441546877, + "ewc_loss": 0.039482228457927704, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015190234989859164, + "grad_norm": 4.975062847137451, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8535727858543396, + "num_tokens": 150936148.0, + "step": 3954 + }, + { + "epoch": 0.5031166518254675, + "ewc_loss": 0.03947710990905762, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015185115626081824, + "grad_norm": 4.980134963989258, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8535265922546387, + "num_tokens": 150971661.0, + "step": 3955 + }, + { + "epoch": 0.503243862104058, + "ewc_loss": 0.03949592262506485, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015203932707663625, + "grad_norm": 4.945784568786621, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8678557872772217, + "num_tokens": 151009521.0, + "step": 3956 + }, + { + "epoch": 0.5033710723826486, + "ewc_loss": 0.03959007188677788, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015176010492723435, + "grad_norm": 5.0382161140441895, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8530241847038269, + "num_tokens": 151053641.0, + "step": 3957 + }, + { + "epoch": 0.5034982826612391, + "ewc_loss": 0.03952045738697052, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001522846578154713, + "grad_norm": 4.961154937744141, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8611224889755249, + "num_tokens": 151094476.0, + "step": 3958 + }, + { + "epoch": 0.5036254929398295, + "ewc_loss": 0.03933098912239075, + "ewc_loss_diag": 2.4199485778808594e-05, + "ewc_loss_parallel": 0.00015161068586166948, + "grad_norm": 5.0417938232421875, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8499512672424316, + "num_tokens": 151126381.0, + "step": 3959 + }, + { + "epoch": 0.50375270321842, + "ewc_loss": 0.039509400725364685, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001521740632597357, + "grad_norm": 5.078193187713623, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8611100912094116, + "num_tokens": 151156875.0, + "step": 3960 + }, + { + "epoch": 0.5038799134970106, + "ewc_loss": 0.03957565128803253, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015161586634349078, + "grad_norm": 5.1608195304870605, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8523054718971252, + "num_tokens": 151194394.0, + "step": 3961 + }, + { + "epoch": 0.5040071237756011, + "ewc_loss": 0.039414677768945694, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015122686454560608, + "grad_norm": 4.9292168617248535, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8423396944999695, + "num_tokens": 151234472.0, + "step": 3962 + }, + { + "epoch": 0.5041343340541916, + "ewc_loss": 0.03934231400489807, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001505031978012994, + "grad_norm": 5.007771968841553, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8517464399337769, + "num_tokens": 151270426.0, + "step": 3963 + }, + { + "epoch": 0.5042615443327821, + "ewc_loss": 0.039399027824401855, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.000151070358697325, + "grad_norm": 4.976545333862305, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8545461297035217, + "num_tokens": 151305793.0, + "step": 3964 + }, + { + "epoch": 0.5043887546113726, + "ewc_loss": 0.039422206580638885, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015130215615499765, + "grad_norm": 5.077957630157471, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8446930050849915, + "num_tokens": 151345116.0, + "step": 3965 + }, + { + "epoch": 0.5045159648899631, + "ewc_loss": 0.03947402536869049, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015182032075244933, + "grad_norm": 4.959963798522949, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8620797991752625, + "num_tokens": 151381901.0, + "step": 3966 + }, + { + "epoch": 0.5046431751685536, + "ewc_loss": 0.039372142404317856, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015080151206348091, + "grad_norm": 4.927147388458252, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8584157824516296, + "num_tokens": 151426884.0, + "step": 3967 + }, + { + "epoch": 0.5047703854471441, + "ewc_loss": 0.03941825032234192, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001512625749455765, + "grad_norm": 5.0007758140563965, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8494377136230469, + "num_tokens": 151467596.0, + "step": 3968 + }, + { + "epoch": 0.5048975957257347, + "ewc_loss": 0.03946353867650032, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015171547420322895, + "grad_norm": 4.946700572967529, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8759146928787231, + "num_tokens": 151502131.0, + "step": 3969 + }, + { + "epoch": 0.5050248060043252, + "ewc_loss": 0.03943241387605667, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015140423784032464, + "grad_norm": 4.994009971618652, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8521795868873596, + "num_tokens": 151538932.0, + "step": 3970 + }, + { + "epoch": 0.5051520162829156, + "ewc_loss": 0.03954887390136719, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.0001513480965513736, + "grad_norm": 4.957365036010742, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8555448055267334, + "num_tokens": 151577839.0, + "step": 3971 + }, + { + "epoch": 0.5052792265615061, + "ewc_loss": 0.0395696684718132, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015155604341998696, + "grad_norm": 4.936337471008301, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8561031818389893, + "num_tokens": 151618984.0, + "step": 3972 + }, + { + "epoch": 0.5054064368400967, + "ewc_loss": 0.03945077955722809, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015158785390667617, + "grad_norm": 4.947983741760254, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8564702868461609, + "num_tokens": 151658392.0, + "step": 3973 + }, + { + "epoch": 0.5055336471186872, + "ewc_loss": 0.03942447528243065, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015132482803892344, + "grad_norm": 5.024632453918457, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8664433360099792, + "num_tokens": 151689781.0, + "step": 3974 + }, + { + "epoch": 0.5056608573972777, + "ewc_loss": 0.03950457274913788, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015212582366075367, + "grad_norm": 5.019479751586914, + "learning_rate": 1e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8411712646484375, + "num_tokens": 151729612.0, + "step": 3975 + }, + { + "epoch": 0.5057880676758683, + "ewc_loss": 0.03946655988693237, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015174569853115827, + "grad_norm": 4.993529319763184, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8437182307243347, + "num_tokens": 151769270.0, + "step": 3976 + }, + { + "epoch": 0.5059152779544587, + "ewc_loss": 0.039463259279727936, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001517126802355051, + "grad_norm": 5.0139617919921875, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8388558030128479, + "num_tokens": 151803936.0, + "step": 3977 + }, + { + "epoch": 0.5060424882330492, + "ewc_loss": 0.039570264518260956, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015156202425714582, + "grad_norm": 12.354912757873535, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8442946076393127, + "num_tokens": 151840429.0, + "step": 3978 + }, + { + "epoch": 0.5061696985116397, + "ewc_loss": 0.04707580432295799, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00022661742696072906, + "grad_norm": 6.253968715667725, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8572715520858765, + "num_tokens": 151883296.0, + "step": 3979 + }, + { + "epoch": 0.5062969087902303, + "ewc_loss": 0.038706451654434204, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00014170317444950342, + "grad_norm": 4.40038537979126, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.872028112411499, + "num_tokens": 151917658.0, + "step": 3980 + }, + { + "epoch": 0.5064241190688208, + "ewc_loss": 0.04136701673269272, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00016952956502791494, + "grad_norm": 5.600911617279053, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8558051586151123, + "num_tokens": 151958822.0, + "step": 3981 + }, + { + "epoch": 0.5065513293474113, + "ewc_loss": 0.041507624089717865, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00017215630214195698, + "grad_norm": 5.073422908782959, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8515142202377319, + "num_tokens": 151994964.0, + "step": 3982 + }, + { + "epoch": 0.5066785396260017, + "ewc_loss": 0.0400497131049633, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015757721848785877, + "grad_norm": 5.1858229637146, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8508296012878418, + "num_tokens": 152037013.0, + "step": 3983 + }, + { + "epoch": 0.5068057499045923, + "ewc_loss": 0.04086051136255264, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00016446446534246206, + "grad_norm": 5.143978118896484, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8519800901412964, + "num_tokens": 152074821.0, + "step": 3984 + }, + { + "epoch": 0.5069329601831828, + "ewc_loss": 0.04006151854991913, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015769526362419128, + "grad_norm": 5.07424259185791, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8489513993263245, + "num_tokens": 152116667.0, + "step": 3985 + }, + { + "epoch": 0.5070601704617733, + "ewc_loss": 0.04017600417137146, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015884012100286782, + "grad_norm": 5.141599655151367, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8596646189689636, + "num_tokens": 152152714.0, + "step": 3986 + }, + { + "epoch": 0.5071873807403638, + "ewc_loss": 0.040004417300224304, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015712427557446063, + "grad_norm": 5.0553507804870605, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8429262638092041, + "num_tokens": 152190230.0, + "step": 3987 + }, + { + "epoch": 0.5073145910189544, + "ewc_loss": 0.03999123349785805, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.0001557717187097296, + "grad_norm": 5.090488910675049, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8552730083465576, + "num_tokens": 152228887.0, + "step": 3988 + }, + { + "epoch": 0.5074418012975448, + "ewc_loss": 0.03983960673213005, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015547614020761102, + "grad_norm": 5.000594139099121, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8649541139602661, + "num_tokens": 152271863.0, + "step": 3989 + }, + { + "epoch": 0.5075690115761353, + "ewc_loss": 0.039735279977321625, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015443288430105895, + "grad_norm": 5.052361965179443, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8510146141052246, + "num_tokens": 152310765.0, + "step": 3990 + }, + { + "epoch": 0.5076962218547258, + "ewc_loss": 0.03975089266896248, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001545890117995441, + "grad_norm": 5.056303977966309, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8788930177688599, + "num_tokens": 152342545.0, + "step": 3991 + }, + { + "epoch": 0.5078234321333164, + "ewc_loss": 0.03965708613395691, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015365095168817788, + "grad_norm": 5.287057876586914, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8396522998809814, + "num_tokens": 152386223.0, + "step": 3992 + }, + { + "epoch": 0.5079506424119069, + "ewc_loss": 0.039775848388671875, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.0001536178751848638, + "grad_norm": 12.42166519165039, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.861934244632721, + "num_tokens": 152421811.0, + "step": 3993 + }, + { + "epoch": 0.5080778526904974, + "ewc_loss": 0.046686023473739624, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.0002227195946034044, + "grad_norm": 6.239504814147949, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8441764116287231, + "num_tokens": 152454581.0, + "step": 3994 + }, + { + "epoch": 0.5082050629690879, + "ewc_loss": 0.0383547879755497, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00014062794798519462, + "grad_norm": 4.434129238128662, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8465178608894348, + "num_tokens": 152490591.0, + "step": 3995 + }, + { + "epoch": 0.5083322732476784, + "ewc_loss": 0.0411221869289875, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00016830195090733469, + "grad_norm": 5.569872856140137, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8786373734474182, + "num_tokens": 152532290.0, + "step": 3996 + }, + { + "epoch": 0.5084594835262689, + "ewc_loss": 0.04133296012878418, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00017040966486092657, + "grad_norm": 5.105298042297363, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8615316152572632, + "num_tokens": 152571984.0, + "step": 3997 + }, + { + "epoch": 0.5085866938048594, + "ewc_loss": 0.039943791925907135, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015651797002647072, + "grad_norm": 5.13958215713501, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8566235899925232, + "num_tokens": 152611082.0, + "step": 3998 + }, + { + "epoch": 0.50871390408345, + "ewc_loss": 0.040580909699201584, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00016288917686324567, + "grad_norm": 5.116955757141113, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8641799092292786, + "num_tokens": 152655303.0, + "step": 3999 + }, + { + "epoch": 0.5088411143620405, + "ewc_loss": 0.04005903750658035, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001576704380568117, + "grad_norm": 5.141840934753418, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8552309274673462, + "num_tokens": 152688432.0, + "step": 4000 + }, + { + "epoch": 0.5089683246406309, + "ewc_loss": 0.040142305195331573, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015850314230192453, + "grad_norm": 5.163708209991455, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8521890640258789, + "num_tokens": 152725637.0, + "step": 4001 + }, + { + "epoch": 0.5090955349192214, + "ewc_loss": 0.039952024817466736, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015660030476283282, + "grad_norm": 5.105592727661133, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8540521860122681, + "num_tokens": 152763108.0, + "step": 4002 + }, + { + "epoch": 0.509222745197812, + "ewc_loss": 0.039883971214294434, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015591979899909347, + "grad_norm": 5.054643630981445, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8629072904586792, + "num_tokens": 152800659.0, + "step": 4003 + }, + { + "epoch": 0.5093499554764025, + "ewc_loss": 0.039789535105228424, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001549754524603486, + "grad_norm": 5.008675575256348, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8464728593826294, + "num_tokens": 152842797.0, + "step": 4004 + }, + { + "epoch": 0.509477165754993, + "ewc_loss": 0.039770856499671936, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001547886640764773, + "grad_norm": 5.0649590492248535, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8471546769142151, + "num_tokens": 152888257.0, + "step": 4005 + }, + { + "epoch": 0.5096043760335836, + "ewc_loss": 0.03976760804653168, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001547561405459419, + "grad_norm": 5.019713878631592, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8543311953544617, + "num_tokens": 152929089.0, + "step": 4006 + }, + { + "epoch": 0.5097315863121741, + "ewc_loss": 0.039692118763923645, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015400124539155513, + "grad_norm": 5.0478196144104, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8498342037200928, + "num_tokens": 152972710.0, + "step": 4007 + }, + { + "epoch": 0.5098587965907645, + "ewc_loss": 0.039741627871990204, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001544963743072003, + "grad_norm": 5.02360200881958, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8563718795776367, + "num_tokens": 153008867.0, + "step": 4008 + }, + { + "epoch": 0.509986006869355, + "ewc_loss": 0.03967377543449402, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015381784760393202, + "grad_norm": 5.03143835067749, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8506789207458496, + "num_tokens": 153051497.0, + "step": 4009 + }, + { + "epoch": 0.5101132171479456, + "ewc_loss": 0.0396837443113327, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015391749911941588, + "grad_norm": 5.03431510925293, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8715516328811646, + "num_tokens": 153089295.0, + "step": 4010 + }, + { + "epoch": 0.5102404274265361, + "ewc_loss": 0.03978755697607994, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015373494534287602, + "grad_norm": 5.031844615936279, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8607661724090576, + "num_tokens": 153123957.0, + "step": 4011 + }, + { + "epoch": 0.5103676377051266, + "ewc_loss": 0.039668355137109756, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015376362716779113, + "grad_norm": 5.061185836791992, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8534432649612427, + "num_tokens": 153165248.0, + "step": 4012 + }, + { + "epoch": 0.5104948479837171, + "ewc_loss": 0.039677079766988754, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001538508658995852, + "grad_norm": 4.955785751342773, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8711467385292053, + "num_tokens": 153206163.0, + "step": 4013 + }, + { + "epoch": 0.5106220582623076, + "ewc_loss": 0.03963436186313629, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015342367987614125, + "grad_norm": 5.033138751983643, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8477703332901001, + "num_tokens": 153248666.0, + "step": 4014 + }, + { + "epoch": 0.5107492685408981, + "ewc_loss": 0.03968828171491623, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001539629156468436, + "grad_norm": 5.012416839599609, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.879500150680542, + "num_tokens": 153289009.0, + "step": 4015 + }, + { + "epoch": 0.5108764788194886, + "ewc_loss": 0.03963332250714302, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001534132898086682, + "grad_norm": 5.0433220863342285, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8609174489974976, + "num_tokens": 153321354.0, + "step": 4016 + }, + { + "epoch": 0.5110036890980791, + "ewc_loss": 0.039670467376708984, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001537847419967875, + "grad_norm": 5.081086158752441, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8562877178192139, + "num_tokens": 153361061.0, + "step": 4017 + }, + { + "epoch": 0.5111308993766697, + "ewc_loss": 0.03963628038764, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015344287385232747, + "grad_norm": 5.020884990692139, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8494333028793335, + "num_tokens": 153397002.0, + "step": 4018 + }, + { + "epoch": 0.5112581096552602, + "ewc_loss": 0.039645835757255554, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015353845083154738, + "grad_norm": 5.034057140350342, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8418365716934204, + "num_tokens": 153435264.0, + "step": 4019 + }, + { + "epoch": 0.5113853199338506, + "ewc_loss": 0.039653971791267395, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015361978148575872, + "grad_norm": 4.990089416503906, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8545359373092651, + "num_tokens": 153474751.0, + "step": 4020 + }, + { + "epoch": 0.5115125302124411, + "ewc_loss": 0.03967345505952835, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015381463163066655, + "grad_norm": 5.017404079437256, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8651437759399414, + "num_tokens": 153510084.0, + "step": 4021 + }, + { + "epoch": 0.5116397404910317, + "ewc_loss": 0.039705488830804825, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015413497749250382, + "grad_norm": 5.007789611816406, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8422527313232422, + "num_tokens": 153550387.0, + "step": 4022 + }, + { + "epoch": 0.5117669507696222, + "ewc_loss": 0.039701078087091446, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015409085608553141, + "grad_norm": 4.982141017913818, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8781842589378357, + "num_tokens": 153592441.0, + "step": 4023 + }, + { + "epoch": 0.5118941610482127, + "ewc_loss": 0.03972165659070015, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015429663471877575, + "grad_norm": 5.0218505859375, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8629002571105957, + "num_tokens": 153629202.0, + "step": 4024 + }, + { + "epoch": 0.5120213713268033, + "ewc_loss": 0.03969410061836243, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015402109420392662, + "grad_norm": 5.024658203125, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8511655330657959, + "num_tokens": 153669243.0, + "step": 4025 + }, + { + "epoch": 0.5121485816053937, + "ewc_loss": 0.039718128740787506, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001542613608762622, + "grad_norm": 5.023528099060059, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8507116436958313, + "num_tokens": 153709206.0, + "step": 4026 + }, + { + "epoch": 0.5122757918839842, + "ewc_loss": 0.03967105224728584, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001537905918667093, + "grad_norm": 5.043911457061768, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8630884885787964, + "num_tokens": 153741251.0, + "step": 4027 + }, + { + "epoch": 0.5124030021625747, + "ewc_loss": 0.03967810422182083, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015386113955173641, + "grad_norm": 5.046634197235107, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8424485921859741, + "num_tokens": 153775559.0, + "step": 4028 + }, + { + "epoch": 0.5125302124411653, + "ewc_loss": 0.0397871695458889, + "ewc_loss_diag": 2.4437904357910156e-05, + "ewc_loss_parallel": 0.00015373107453342527, + "grad_norm": 4.94227933883667, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8605133891105652, + "num_tokens": 153819158.0, + "step": 4029 + }, + { + "epoch": 0.5126574227197558, + "ewc_loss": 0.03966015577316284, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015368161257356405, + "grad_norm": 4.981248378753662, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8499977588653564, + "num_tokens": 153856231.0, + "step": 4030 + }, + { + "epoch": 0.5127846329983463, + "ewc_loss": 0.03970979526638985, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015417803660966456, + "grad_norm": 4.950451374053955, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8667411804199219, + "num_tokens": 153897125.0, + "step": 4031 + }, + { + "epoch": 0.5129118432769367, + "ewc_loss": 0.039711423218250275, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001541943202028051, + "grad_norm": 5.01969051361084, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8632557392120361, + "num_tokens": 153933552.0, + "step": 4032 + }, + { + "epoch": 0.5130390535555273, + "ewc_loss": 0.0397181361913681, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015426143363583833, + "grad_norm": 4.983279228210449, + "learning_rate": 1e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8420979976654053, + "num_tokens": 153974337.0, + "step": 4033 + }, + { + "epoch": 0.5131662638341178, + "ewc_loss": 0.03973918408155441, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015447192708961666, + "grad_norm": 4.952386379241943, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8513730764389038, + "num_tokens": 154018911.0, + "step": 4034 + }, + { + "epoch": 0.5132934741127083, + "ewc_loss": 0.0397513322532177, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015459339192602783, + "grad_norm": 5.023747444152832, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8668038845062256, + "num_tokens": 154056441.0, + "step": 4035 + }, + { + "epoch": 0.5134206843912988, + "ewc_loss": 0.03973580524325371, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015443812299054116, + "grad_norm": 5.0128560066223145, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8463726043701172, + "num_tokens": 154096326.0, + "step": 4036 + }, + { + "epoch": 0.5135478946698894, + "ewc_loss": 0.039716292172670364, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.000154242996359244, + "grad_norm": 4.948456287384033, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8605872988700867, + "num_tokens": 154137223.0, + "step": 4037 + }, + { + "epoch": 0.5136751049484798, + "ewc_loss": 0.03971115127205849, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015419158444274217, + "grad_norm": 5.152123928070068, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8541463613510132, + "num_tokens": 154168312.0, + "step": 4038 + }, + { + "epoch": 0.5138023152270703, + "ewc_loss": 0.039777971804142, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.00015485977928619832, + "grad_norm": 4.916816234588623, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8693194389343262, + "num_tokens": 154203125.0, + "step": 4039 + }, + { + "epoch": 0.5139295255056608, + "ewc_loss": 0.039923593401908875, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015387458552140743, + "grad_norm": 4.967459678649902, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.850470781326294, + "num_tokens": 154248437.0, + "step": 4040 + }, + { + "epoch": 0.5140567357842514, + "ewc_loss": 0.04003581777215004, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015499685832764953, + "grad_norm": 4.95533561706543, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8598222732543945, + "num_tokens": 154286303.0, + "step": 4041 + }, + { + "epoch": 0.5141839460628419, + "ewc_loss": 0.03972962126135826, + "ewc_loss_diag": 2.4318695068359375e-05, + "ewc_loss_parallel": 0.0001543762773508206, + "grad_norm": 4.984689712524414, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8461278080940247, + "num_tokens": 154328638.0, + "step": 4042 + }, + { + "epoch": 0.5143111563414324, + "ewc_loss": 0.04011136665940285, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015453163359779865, + "grad_norm": 5.031861305236816, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.854792058467865, + "num_tokens": 154362953.0, + "step": 4043 + }, + { + "epoch": 0.5144383666200228, + "ewc_loss": 0.040152110159397125, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015493908722419292, + "grad_norm": 4.981755256652832, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8439562320709229, + "num_tokens": 154401289.0, + "step": 4044 + }, + { + "epoch": 0.5145655768986134, + "ewc_loss": 0.04014383628964424, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001548563304822892, + "grad_norm": 4.976718425750732, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8595269918441772, + "num_tokens": 154438779.0, + "step": 4045 + }, + { + "epoch": 0.5146927871772039, + "ewc_loss": 0.04018544405698776, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001552724133944139, + "grad_norm": 5.08268928527832, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8501708507537842, + "num_tokens": 154467032.0, + "step": 4046 + }, + { + "epoch": 0.5148199974557944, + "ewc_loss": 0.040174227207899094, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001551602326799184, + "grad_norm": 4.943838596343994, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8652903437614441, + "num_tokens": 154505222.0, + "step": 4047 + }, + { + "epoch": 0.514947207734385, + "ewc_loss": 0.040174126625061035, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015515924314968288, + "grad_norm": 5.008325099945068, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8427198529243469, + "num_tokens": 154544953.0, + "step": 4048 + }, + { + "epoch": 0.5150744180129755, + "ewc_loss": 0.04024062305688858, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015582422201987356, + "grad_norm": 4.961950302124023, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8682953119277954, + "num_tokens": 154586193.0, + "step": 4049 + }, + { + "epoch": 0.5152016282915659, + "ewc_loss": 0.040291089564561844, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.0001551081659272313, + "grad_norm": 4.970734596252441, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8600985407829285, + "num_tokens": 154629531.0, + "step": 4050 + }, + { + "epoch": 0.5153288385701564, + "ewc_loss": 0.04019924998283386, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015541048196610063, + "grad_norm": 5.036850929260254, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8715494871139526, + "num_tokens": 154659482.0, + "step": 4051 + }, + { + "epoch": 0.515456048848747, + "ewc_loss": 0.040172822773456573, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001551461755298078, + "grad_norm": 4.982200622558594, + "learning_rate": 1e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8226144313812256, + "num_tokens": 154702645.0, + "step": 4052 + }, + { + "epoch": 0.5155832591273375, + "ewc_loss": 0.04013541340827942, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001547721039969474, + "grad_norm": 4.969078063964844, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.865536093711853, + "num_tokens": 154743329.0, + "step": 4053 + }, + { + "epoch": 0.515710469405928, + "ewc_loss": 0.040297456085681915, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015517184510827065, + "grad_norm": 4.981043338775635, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8525257110595703, + "num_tokens": 154783460.0, + "step": 4054 + }, + { + "epoch": 0.5158376796845185, + "ewc_loss": 0.04027269780635834, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015492424427065998, + "grad_norm": 5.0299072265625, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.850155234336853, + "num_tokens": 154827209.0, + "step": 4055 + }, + { + "epoch": 0.5159648899631091, + "ewc_loss": 0.04032742604613304, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015547152725048363, + "grad_norm": 5.0189032554626465, + "learning_rate": 1e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8318493366241455, + "num_tokens": 154866577.0, + "step": 4056 + }, + { + "epoch": 0.5160921002416995, + "ewc_loss": 0.04023202881217003, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015451756189577281, + "grad_norm": 4.944960594177246, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8586467504501343, + "num_tokens": 154912191.0, + "step": 4057 + }, + { + "epoch": 0.51621931052029, + "ewc_loss": 0.04017360508441925, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001551540190121159, + "grad_norm": 5.025081634521484, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8431351184844971, + "num_tokens": 154953412.0, + "step": 4058 + }, + { + "epoch": 0.5163465207988805, + "ewc_loss": 0.04020373523235321, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015545533096883446, + "grad_norm": 4.953619003295898, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.867017924785614, + "num_tokens": 154989913.0, + "step": 4059 + }, + { + "epoch": 0.5164737310774711, + "ewc_loss": 0.04012985900044441, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015471655933652073, + "grad_norm": 5.015284061431885, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8431550860404968, + "num_tokens": 155038094.0, + "step": 4060 + }, + { + "epoch": 0.5166009413560616, + "ewc_loss": 0.04021460562944412, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015556403377559036, + "grad_norm": 5.008162498474121, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8464588522911072, + "num_tokens": 155075900.0, + "step": 4061 + }, + { + "epoch": 0.5167281516346521, + "ewc_loss": 0.040150489658117294, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001549228618387133, + "grad_norm": 4.991257190704346, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8559826016426086, + "num_tokens": 155117038.0, + "step": 4062 + }, + { + "epoch": 0.5168553619132426, + "ewc_loss": 0.04015365242958069, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015495448315050453, + "grad_norm": 5.0218634605407715, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8704706430435181, + "num_tokens": 155151442.0, + "step": 4063 + }, + { + "epoch": 0.5169825721918331, + "ewc_loss": 0.0401475727558136, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015489371435251087, + "grad_norm": 5.022003173828125, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8638004064559937, + "num_tokens": 155192288.0, + "step": 4064 + }, + { + "epoch": 0.5171097824704236, + "ewc_loss": 0.04014100506901741, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001548280124552548, + "grad_norm": 4.922054290771484, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8584276437759399, + "num_tokens": 155238931.0, + "step": 4065 + }, + { + "epoch": 0.5172369927490141, + "ewc_loss": 0.03997175395488739, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015435621025972068, + "grad_norm": 5.014475345611572, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8694589138031006, + "num_tokens": 155276644.0, + "step": 4066 + }, + { + "epoch": 0.5173642030276047, + "ewc_loss": 0.0400429293513298, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015506795898545533, + "grad_norm": 5.066701889038086, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8684845566749573, + "num_tokens": 155307638.0, + "step": 4067 + }, + { + "epoch": 0.5174914133061952, + "ewc_loss": 0.040096793323755264, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015438589616678655, + "grad_norm": 5.02772855758667, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8496485948562622, + "num_tokens": 155343255.0, + "step": 4068 + }, + { + "epoch": 0.5176186235847856, + "ewc_loss": 0.04010468348860741, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015446481120306998, + "grad_norm": 5.041264057159424, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8574182391166687, + "num_tokens": 155382283.0, + "step": 4069 + }, + { + "epoch": 0.5177458338633761, + "ewc_loss": 0.040122874081134796, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015464669559150934, + "grad_norm": 4.992302894592285, + "learning_rate": 1e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.8342194557189941, + "num_tokens": 155427096.0, + "step": 4070 + }, + { + "epoch": 0.5178730441419667, + "ewc_loss": 0.0399695485830307, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015433414955623448, + "grad_norm": 5.06835412979126, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8442214727401733, + "num_tokens": 155467472.0, + "step": 4071 + }, + { + "epoch": 0.5180002544205572, + "ewc_loss": 0.03998241946101189, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.0001544628757983446, + "grad_norm": 5.010058403015137, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8505373001098633, + "num_tokens": 155508350.0, + "step": 4072 + }, + { + "epoch": 0.5181274646991477, + "ewc_loss": 0.03996606171131134, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.0001542992831673473, + "grad_norm": 5.019473075866699, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8789154291152954, + "num_tokens": 155547215.0, + "step": 4073 + }, + { + "epoch": 0.5182546749777382, + "ewc_loss": 0.0399777851998806, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015441651339642704, + "grad_norm": 5.090369701385498, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8610948324203491, + "num_tokens": 155586658.0, + "step": 4074 + }, + { + "epoch": 0.5183818852563287, + "ewc_loss": 0.03994886204600334, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015412727952934802, + "grad_norm": 5.002167701721191, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8594182729721069, + "num_tokens": 155620944.0, + "step": 4075 + }, + { + "epoch": 0.5185090955349192, + "ewc_loss": 0.039960067719221115, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015423934382852167, + "grad_norm": 5.089686393737793, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8534795641899109, + "num_tokens": 155662699.0, + "step": 4076 + }, + { + "epoch": 0.5186363058135097, + "ewc_loss": 0.03998074308037758, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.0001544460974400863, + "grad_norm": 4.9682440757751465, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8437625169754028, + "num_tokens": 155707783.0, + "step": 4077 + }, + { + "epoch": 0.5187635160921003, + "ewc_loss": 0.0399128720164299, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015376738156192005, + "grad_norm": 5.059669017791748, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8620101809501648, + "num_tokens": 155745227.0, + "step": 4078 + }, + { + "epoch": 0.5188907263706908, + "ewc_loss": 0.0399877168238163, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015451584476977587, + "grad_norm": 4.993104457855225, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8581897020339966, + "num_tokens": 155786466.0, + "step": 4079 + }, + { + "epoch": 0.5190179366492813, + "ewc_loss": 0.03993653133511543, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.0001540039957035333, + "grad_norm": 5.000569820404053, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8585869073867798, + "num_tokens": 155833145.0, + "step": 4080 + }, + { + "epoch": 0.5191451469278717, + "ewc_loss": 0.03998231142759323, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.0001544617989566177, + "grad_norm": 5.041201591491699, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8462707996368408, + "num_tokens": 155869533.0, + "step": 4081 + }, + { + "epoch": 0.5192723572064623, + "ewc_loss": 0.03998982161283493, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015453687228728086, + "grad_norm": 5.032033443450928, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8547683954238892, + "num_tokens": 155909329.0, + "step": 4082 + }, + { + "epoch": 0.5193995674850528, + "ewc_loss": 0.04003042355179787, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015494291437789798, + "grad_norm": 5.020242214202881, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8648151159286499, + "num_tokens": 155947848.0, + "step": 4083 + }, + { + "epoch": 0.5195267777636433, + "ewc_loss": 0.039960429072380066, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.0001542429527034983, + "grad_norm": 5.0084991455078125, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8610700368881226, + "num_tokens": 155982693.0, + "step": 4084 + }, + { + "epoch": 0.5196539880422338, + "ewc_loss": 0.04016361013054848, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015505407645832747, + "grad_norm": 5.027733325958252, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8540953397750854, + "num_tokens": 156023414.0, + "step": 4085 + }, + { + "epoch": 0.5197811983208244, + "ewc_loss": 0.04003840684890747, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015502271708101034, + "grad_norm": 4.977755069732666, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.856166660785675, + "num_tokens": 156063488.0, + "step": 4086 + }, + { + "epoch": 0.5199084085994148, + "ewc_loss": 0.04004264622926712, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015506515046581626, + "grad_norm": 5.0523271560668945, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8563116192817688, + "num_tokens": 156103081.0, + "step": 4087 + }, + { + "epoch": 0.5200356188780053, + "ewc_loss": 0.040115177631378174, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015579046157654375, + "grad_norm": 5.025825023651123, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8476477265357971, + "num_tokens": 156139453.0, + "step": 4088 + }, + { + "epoch": 0.5201628291565958, + "ewc_loss": 0.0400383397936821, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015502206224482507, + "grad_norm": 5.066784381866455, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8504536151885986, + "num_tokens": 156176564.0, + "step": 4089 + }, + { + "epoch": 0.5202900394351864, + "ewc_loss": 0.040144193917512894, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015608061221428216, + "grad_norm": 5.046045303344727, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.859779953956604, + "num_tokens": 156221278.0, + "step": 4090 + }, + { + "epoch": 0.5204172497137769, + "ewc_loss": 0.04007679596543312, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015540662570856512, + "grad_norm": 5.0601806640625, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.860395073890686, + "num_tokens": 156259124.0, + "step": 4091 + }, + { + "epoch": 0.5205444599923674, + "ewc_loss": 0.040053561329841614, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015517427527811378, + "grad_norm": 5.046606063842773, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8460054397583008, + "num_tokens": 156297832.0, + "step": 4092 + }, + { + "epoch": 0.5206716702709578, + "ewc_loss": 0.04007677361369133, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.0001554064074298367, + "grad_norm": 5.1245036125183105, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8628953099250793, + "num_tokens": 156329745.0, + "step": 4093 + }, + { + "epoch": 0.5207988805495484, + "ewc_loss": 0.040105704218149185, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015569571405649185, + "grad_norm": 5.070393085479736, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8566163778305054, + "num_tokens": 156360154.0, + "step": 4094 + }, + { + "epoch": 0.5209260908281389, + "ewc_loss": 0.040101367980241776, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015565236390102655, + "grad_norm": 5.0183610916137695, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8591762781143188, + "num_tokens": 156399360.0, + "step": 4095 + }, + { + "epoch": 0.5210533011067294, + "ewc_loss": 0.04009902477264404, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015562892076559365, + "grad_norm": 5.0631866455078125, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8544949293136597, + "num_tokens": 156438782.0, + "step": 4096 + }, + { + "epoch": 0.52118051138532, + "ewc_loss": 0.04022590070962906, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015567698574159294, + "grad_norm": 5.131521224975586, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8631086349487305, + "num_tokens": 156470261.0, + "step": 4097 + }, + { + "epoch": 0.5213077216639105, + "ewc_loss": 0.04033936187624931, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.0001555908820591867, + "grad_norm": 5.007220268249512, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8388941287994385, + "num_tokens": 156510493.0, + "step": 4098 + }, + { + "epoch": 0.5214349319425009, + "ewc_loss": 0.04020322486758232, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015545020869467407, + "grad_norm": 5.0786614418029785, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8591791391372681, + "num_tokens": 156545040.0, + "step": 4099 + }, + { + "epoch": 0.5215621422210914, + "ewc_loss": 0.04037848487496376, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015598212485201657, + "grad_norm": 5.022784233093262, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.854706883430481, + "num_tokens": 156583474.0, + "step": 4100 + }, + { + "epoch": 0.521689352499682, + "ewc_loss": 0.04033683240413666, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.0001555655908305198, + "grad_norm": 5.008020401000977, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8563463687896729, + "num_tokens": 156627137.0, + "step": 4101 + }, + { + "epoch": 0.5218165627782725, + "ewc_loss": 0.040362704545259476, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015582430933136493, + "grad_norm": 5.083738803863525, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8452972769737244, + "num_tokens": 156664726.0, + "step": 4102 + }, + { + "epoch": 0.521943773056863, + "ewc_loss": 0.040406834334135056, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015626561071258038, + "grad_norm": 5.0033860206604, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.873921811580658, + "num_tokens": 156707014.0, + "step": 4103 + }, + { + "epoch": 0.5220709833354535, + "ewc_loss": 0.04037272185087204, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.000155924484715797, + "grad_norm": 5.052432537078857, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.853410005569458, + "num_tokens": 156747784.0, + "step": 4104 + }, + { + "epoch": 0.522198193614044, + "ewc_loss": 0.04039411246776581, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.000156138397869654, + "grad_norm": 5.028789043426514, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8532568216323853, + "num_tokens": 156784760.0, + "step": 4105 + }, + { + "epoch": 0.5223254038926345, + "ewc_loss": 0.040390968322753906, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015610696573276073, + "grad_norm": 5.061925411224365, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8535418510437012, + "num_tokens": 156822899.0, + "step": 4106 + }, + { + "epoch": 0.522452614171225, + "ewc_loss": 0.040390823036432266, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015610549598932266, + "grad_norm": 4.986546516418457, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.863103985786438, + "num_tokens": 156861308.0, + "step": 4107 + }, + { + "epoch": 0.5225798244498155, + "ewc_loss": 0.040391355752944946, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015611080743838102, + "grad_norm": 5.050009250640869, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8619307279586792, + "num_tokens": 156901981.0, + "step": 4108 + }, + { + "epoch": 0.5227070347284061, + "ewc_loss": 0.04030059278011322, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015642390644643456, + "grad_norm": 5.039556503295898, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8619590997695923, + "num_tokens": 156942215.0, + "step": 4109 + }, + { + "epoch": 0.5228342450069966, + "ewc_loss": 0.040425024926662445, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015644750965293497, + "grad_norm": 5.058382987976074, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8634524941444397, + "num_tokens": 156983820.0, + "step": 4110 + }, + { + "epoch": 0.5229614552855871, + "ewc_loss": 0.04038519412279129, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.0001560491946293041, + "grad_norm": 5.061813831329346, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8684463500976562, + "num_tokens": 157016215.0, + "step": 4111 + }, + { + "epoch": 0.5230886655641775, + "ewc_loss": 0.04024754837155342, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.0001558934454806149, + "grad_norm": 5.0657243728637695, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.855830192565918, + "num_tokens": 157052071.0, + "step": 4112 + }, + { + "epoch": 0.5232158758427681, + "ewc_loss": 0.040317635983228683, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015537362196482718, + "grad_norm": 5.0339035987854, + "learning_rate": 1e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.835127055644989, + "num_tokens": 157097711.0, + "step": 4113 + }, + { + "epoch": 0.5233430861213586, + "ewc_loss": 0.040142208337783813, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015606076340191066, + "grad_norm": 5.081133842468262, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8548529744148254, + "num_tokens": 157134801.0, + "step": 4114 + }, + { + "epoch": 0.5234702963999491, + "ewc_loss": 0.040224336087703705, + "ewc_loss_diag": 2.467632293701172e-05, + "ewc_loss_parallel": 0.00015566134243272245, + "grad_norm": 5.038017272949219, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8565894365310669, + "num_tokens": 157171885.0, + "step": 4115 + }, + { + "epoch": 0.5235975066785397, + "ewc_loss": 0.040090031921863556, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015553900448139757, + "grad_norm": 5.045807838439941, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8552982807159424, + "num_tokens": 157211915.0, + "step": 4116 + }, + { + "epoch": 0.5237247169571302, + "ewc_loss": 0.04011283814907074, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015576706209685653, + "grad_norm": 5.013766765594482, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8500304818153381, + "num_tokens": 157260564.0, + "step": 4117 + }, + { + "epoch": 0.5238519272357206, + "ewc_loss": 0.04010453820228577, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015568407252430916, + "grad_norm": 5.098581314086914, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8584964871406555, + "num_tokens": 157294676.0, + "step": 4118 + }, + { + "epoch": 0.5239791375143111, + "ewc_loss": 0.04013296216726303, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015596830053254962, + "grad_norm": 5.019639015197754, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8663321137428284, + "num_tokens": 157334607.0, + "step": 4119 + }, + { + "epoch": 0.5241063477929017, + "ewc_loss": 0.04012675583362579, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015590623661410064, + "grad_norm": 5.0844855308532715, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8575557470321655, + "num_tokens": 157372021.0, + "step": 4120 + }, + { + "epoch": 0.5242335580714922, + "ewc_loss": 0.04039883613586426, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015618561883457005, + "grad_norm": 5.013772487640381, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8611062169075012, + "num_tokens": 157410092.0, + "step": 4121 + }, + { + "epoch": 0.5243607683500827, + "ewc_loss": 0.04036326706409454, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015582992637064308, + "grad_norm": 5.056985378265381, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.853131890296936, + "num_tokens": 157447979.0, + "step": 4122 + }, + { + "epoch": 0.5244879786286732, + "ewc_loss": 0.04013892263174057, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015602787607349455, + "grad_norm": 5.111764430999756, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8509640693664551, + "num_tokens": 157481376.0, + "step": 4123 + }, + { + "epoch": 0.5246151889072637, + "ewc_loss": 0.04013928025960922, + "ewc_loss_diag": 2.4557113647460938e-05, + "ewc_loss_parallel": 0.00015603145584464073, + "grad_norm": 5.092555522918701, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8630117177963257, + "num_tokens": 157519555.0, + "step": 4124 + }, + { + "epoch": 0.5247423991858542, + "ewc_loss": 0.040447037667036057, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015666763647459447, + "grad_norm": 5.091957092285156, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8552588820457458, + "num_tokens": 157551849.0, + "step": 4125 + }, + { + "epoch": 0.5248696094644447, + "ewc_loss": 0.04041813313961029, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.0001563785772304982, + "grad_norm": 5.046073913574219, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.856089174747467, + "num_tokens": 157590823.0, + "step": 4126 + }, + { + "epoch": 0.5249968197430352, + "ewc_loss": 0.04043366760015488, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.000156533918925561, + "grad_norm": 5.052764892578125, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8663797378540039, + "num_tokens": 157633771.0, + "step": 4127 + }, + { + "epoch": 0.5251240300216258, + "ewc_loss": 0.04044348746538162, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015663212980143726, + "grad_norm": 5.101183891296387, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.870750904083252, + "num_tokens": 157667796.0, + "step": 4128 + }, + { + "epoch": 0.5252512403002163, + "ewc_loss": 0.04043198004364967, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015651706780772656, + "grad_norm": 5.055301189422607, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8670101165771484, + "num_tokens": 157709452.0, + "step": 4129 + }, + { + "epoch": 0.5253784505788067, + "ewc_loss": 0.04037170857191086, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015591435658279806, + "grad_norm": 5.107452869415283, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.84761643409729, + "num_tokens": 157741407.0, + "step": 4130 + }, + { + "epoch": 0.5255056608573972, + "ewc_loss": 0.040388792753219604, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.00015608521061949432, + "grad_norm": 5.023185729980469, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8673681616783142, + "num_tokens": 157777693.0, + "step": 4131 + }, + { + "epoch": 0.5256328711359878, + "ewc_loss": 0.04038694500923157, + "ewc_loss_diag": 2.47955322265625e-05, + "ewc_loss_parallel": 0.0001560667296871543, + "grad_norm": 5.087944030761719, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8544372320175171, + "num_tokens": 157814729.0, + "step": 4132 + }, + { + "epoch": 0.5257600814145783, + "ewc_loss": 0.040707193315029144, + "ewc_loss_diag": 2.5033950805664062e-05, + "ewc_loss_parallel": 0.0001568278094055131, + "grad_norm": 5.057812213897705, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8660566210746765, + "num_tokens": 157853698.0, + "step": 4133 + }, + { + "epoch": 0.5258872916931688, + "ewc_loss": 0.0405818410217762, + "ewc_loss_diag": 2.5033950805664062e-05, + "ewc_loss_parallel": 0.00015557427832391113, + "grad_norm": 5.070505619049072, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.862076997756958, + "num_tokens": 157888580.0, + "step": 4134 + }, + { + "epoch": 0.5260145019717594, + "ewc_loss": 0.040817663073539734, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001567117724334821, + "grad_norm": 5.06494140625, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8549082279205322, + "num_tokens": 157928434.0, + "step": 4135 + }, + { + "epoch": 0.5261417122503498, + "ewc_loss": 0.040785014629364014, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001563853002153337, + "grad_norm": 5.060370445251465, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8525243401527405, + "num_tokens": 157968936.0, + "step": 4136 + }, + { + "epoch": 0.5262689225289403, + "ewc_loss": 0.040791742503643036, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001564526028232649, + "grad_norm": 5.156436920166016, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8457897901535034, + "num_tokens": 158005496.0, + "step": 4137 + }, + { + "epoch": 0.5263961328075308, + "ewc_loss": 0.04084669053554535, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015700205403845757, + "grad_norm": 5.083172798156738, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8645890951156616, + "num_tokens": 158047569.0, + "step": 4138 + }, + { + "epoch": 0.5265233430861214, + "ewc_loss": 0.040760815143585205, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001561433164170012, + "grad_norm": 5.079718589782715, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8558200597763062, + "num_tokens": 158085733.0, + "step": 4139 + }, + { + "epoch": 0.5266505533647119, + "ewc_loss": 0.040777333080768585, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015630846610292792, + "grad_norm": 5.079354286193848, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8463586568832397, + "num_tokens": 158123906.0, + "step": 4140 + }, + { + "epoch": 0.5267777636433024, + "ewc_loss": 0.04081635922193527, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015669876302126795, + "grad_norm": 5.060928821563721, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8629047274589539, + "num_tokens": 158163663.0, + "step": 4141 + }, + { + "epoch": 0.5269049739218928, + "ewc_loss": 0.040538907051086426, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015636562602594495, + "grad_norm": 5.290950775146484, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8445067405700684, + "num_tokens": 158205878.0, + "step": 4142 + }, + { + "epoch": 0.5270321842004834, + "ewc_loss": 0.04061385616660118, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015711512241978198, + "grad_norm": 5.092207431793213, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8608232736587524, + "num_tokens": 158241608.0, + "step": 4143 + }, + { + "epoch": 0.5271593944790739, + "ewc_loss": 0.04046668857336044, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015564345812890679, + "grad_norm": 5.080659866333008, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8695243000984192, + "num_tokens": 158274259.0, + "step": 4144 + }, + { + "epoch": 0.5272866047576644, + "ewc_loss": 0.040808744728565216, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001566226128488779, + "grad_norm": 5.196969985961914, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8631656765937805, + "num_tokens": 158304793.0, + "step": 4145 + }, + { + "epoch": 0.527413815036255, + "ewc_loss": 0.040527597069740295, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015625252854079008, + "grad_norm": 4.972718238830566, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8697525262832642, + "num_tokens": 158349420.0, + "step": 4146 + }, + { + "epoch": 0.5275410253148455, + "ewc_loss": 0.04051315039396286, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015610805712640285, + "grad_norm": 5.0113654136657715, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8559495210647583, + "num_tokens": 158395834.0, + "step": 4147 + }, + { + "epoch": 0.5276682355934359, + "ewc_loss": 0.040829308331012726, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015682824596296996, + "grad_norm": 5.061614513397217, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8583014607429504, + "num_tokens": 158435568.0, + "step": 4148 + }, + { + "epoch": 0.5277954458720264, + "ewc_loss": 0.040555570274591446, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001565322745591402, + "grad_norm": 5.045917510986328, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8663143515586853, + "num_tokens": 158473584.0, + "step": 4149 + }, + { + "epoch": 0.527922656150617, + "ewc_loss": 0.040591612458229065, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001568926963955164, + "grad_norm": 5.015283584594727, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8585301637649536, + "num_tokens": 158513435.0, + "step": 4150 + }, + { + "epoch": 0.5280498664292075, + "ewc_loss": 0.04058085381984711, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015678508498240262, + "grad_norm": 5.044283866882324, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8586152791976929, + "num_tokens": 158549478.0, + "step": 4151 + }, + { + "epoch": 0.528177076707798, + "ewc_loss": 0.040578778833150864, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001567643485032022, + "grad_norm": 4.973728179931641, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8613636493682861, + "num_tokens": 158598742.0, + "step": 4152 + }, + { + "epoch": 0.5283042869863885, + "ewc_loss": 0.04059984162449837, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001569749874761328, + "grad_norm": 5.107848644256592, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8455109596252441, + "num_tokens": 158636806.0, + "step": 4153 + }, + { + "epoch": 0.528431497264979, + "ewc_loss": 0.04064822196960449, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015745880955364555, + "grad_norm": 5.04501485824585, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8501198887825012, + "num_tokens": 158676043.0, + "step": 4154 + }, + { + "epoch": 0.5285587075435695, + "ewc_loss": 0.04059450328350067, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001569215819472447, + "grad_norm": 5.103725433349609, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8417827486991882, + "num_tokens": 158711772.0, + "step": 4155 + }, + { + "epoch": 0.52868591782216, + "ewc_loss": 0.04064404219388962, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015741700190119445, + "grad_norm": 5.066506385803223, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8556855916976929, + "num_tokens": 158749608.0, + "step": 4156 + }, + { + "epoch": 0.5288131281007505, + "ewc_loss": 0.04064888879656792, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015746545977890491, + "grad_norm": 5.126696586608887, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8576307892799377, + "num_tokens": 158790502.0, + "step": 4157 + }, + { + "epoch": 0.5289403383793411, + "ewc_loss": 0.040610119700431824, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015707775310147554, + "grad_norm": 5.030754566192627, + "learning_rate": 1e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.8333625197410583, + "num_tokens": 158828415.0, + "step": 4158 + }, + { + "epoch": 0.5290675486579316, + "ewc_loss": 0.040577810257673264, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001567546569276601, + "grad_norm": 5.054033279418945, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8469748497009277, + "num_tokens": 158866972.0, + "step": 4159 + }, + { + "epoch": 0.5291947589365221, + "ewc_loss": 0.040612008422613144, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015709665603935719, + "grad_norm": 5.063960075378418, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.854625940322876, + "num_tokens": 158902991.0, + "step": 4160 + }, + { + "epoch": 0.5293219692151125, + "ewc_loss": 0.04063526540994644, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001573291956447065, + "grad_norm": 4.974842071533203, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.865057110786438, + "num_tokens": 158945963.0, + "step": 4161 + }, + { + "epoch": 0.5294491794937031, + "ewc_loss": 0.04077073186635971, + "ewc_loss_diag": 2.5033950805664062e-05, + "ewc_loss_parallel": 0.0001574631896801293, + "grad_norm": 5.080398082733154, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8557270765304565, + "num_tokens": 158985407.0, + "step": 4162 + }, + { + "epoch": 0.5295763897722936, + "ewc_loss": 0.04066857323050499, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001576622889842838, + "grad_norm": 5.009962558746338, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8492149114608765, + "num_tokens": 159024387.0, + "step": 4163 + }, + { + "epoch": 0.5297036000508841, + "ewc_loss": 0.04066392779350281, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015761583927087486, + "grad_norm": 5.075034141540527, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8480035662651062, + "num_tokens": 159061430.0, + "step": 4164 + }, + { + "epoch": 0.5298308103294747, + "ewc_loss": 0.04070692136883736, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015804577560629696, + "grad_norm": 5.0382280349731445, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8582955598831177, + "num_tokens": 159099350.0, + "step": 4165 + }, + { + "epoch": 0.5299580206080652, + "ewc_loss": 0.04092927277088165, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015782788977958262, + "grad_norm": 5.119643688201904, + "learning_rate": 1e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.8310661315917969, + "num_tokens": 159136276.0, + "step": 4166 + }, + { + "epoch": 0.5300852308866556, + "ewc_loss": 0.04067597910761833, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001577363582327962, + "grad_norm": 4.97323751449585, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.878609836101532, + "num_tokens": 159177102.0, + "step": 4167 + }, + { + "epoch": 0.5302124411652461, + "ewc_loss": 0.04093070328235626, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015784220886416733, + "grad_norm": 5.078506946563721, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8678321838378906, + "num_tokens": 159213582.0, + "step": 4168 + }, + { + "epoch": 0.5303396514438367, + "ewc_loss": 0.04100751504302025, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015861030260566622, + "grad_norm": 5.051489353179932, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8555250763893127, + "num_tokens": 159254283.0, + "step": 4169 + }, + { + "epoch": 0.5304668617224272, + "ewc_loss": 0.04095091298222542, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015804427675902843, + "grad_norm": 5.114438056945801, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8589009046554565, + "num_tokens": 159288557.0, + "step": 4170 + }, + { + "epoch": 0.5305940720010177, + "ewc_loss": 0.04097699001431465, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.000158305061631836, + "grad_norm": 5.046531677246094, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8658213019371033, + "num_tokens": 159325047.0, + "step": 4171 + }, + { + "epoch": 0.5307212822796082, + "ewc_loss": 0.040919940918684006, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001577345683472231, + "grad_norm": 5.058919429779053, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8608010411262512, + "num_tokens": 159366291.0, + "step": 4172 + }, + { + "epoch": 0.5308484925581987, + "ewc_loss": 0.04093819856643677, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015791716577950865, + "grad_norm": 5.118147850036621, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8510209321975708, + "num_tokens": 159401634.0, + "step": 4173 + }, + { + "epoch": 0.5309757028367892, + "ewc_loss": 0.04068514704704285, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.0001578280352987349, + "grad_norm": 5.1321187019348145, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8483880758285522, + "num_tokens": 159435316.0, + "step": 4174 + }, + { + "epoch": 0.5311029131153797, + "ewc_loss": 0.04065876081585884, + "ewc_loss_diag": 2.491474151611328e-05, + "ewc_loss_parallel": 0.00015756416541989893, + "grad_norm": 5.036098957061768, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8497757911682129, + "num_tokens": 159472192.0, + "step": 4175 + }, + { + "epoch": 0.5312301233939702, + "ewc_loss": 0.040785014629364014, + "ewc_loss_diag": 2.5033950805664062e-05, + "ewc_loss_parallel": 0.00015760598762426525, + "grad_norm": 5.01295280456543, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8635001182556152, + "num_tokens": 159515342.0, + "step": 4176 + }, + { + "epoch": 0.5313573336725608, + "ewc_loss": 0.04090443253517151, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015757945948280394, + "grad_norm": 5.047389984130859, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8608502745628357, + "num_tokens": 159553932.0, + "step": 4177 + }, + { + "epoch": 0.5314845439511513, + "ewc_loss": 0.04097567871212959, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001582919358043, + "grad_norm": 5.126951694488525, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8578519821166992, + "num_tokens": 159584711.0, + "step": 4178 + }, + { + "epoch": 0.5316117542297417, + "ewc_loss": 0.04096536710858345, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015818883548490703, + "grad_norm": 5.0810747146606445, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8441126942634583, + "num_tokens": 159618784.0, + "step": 4179 + }, + { + "epoch": 0.5317389645083322, + "ewc_loss": 0.04094882681965828, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015802342386450619, + "grad_norm": 5.06088924407959, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8545042872428894, + "num_tokens": 159661288.0, + "step": 4180 + }, + { + "epoch": 0.5318661747869228, + "ewc_loss": 0.04095930606126785, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015812822675798088, + "grad_norm": 5.034186363220215, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.867750883102417, + "num_tokens": 159701286.0, + "step": 4181 + }, + { + "epoch": 0.5319933850655133, + "ewc_loss": 0.04095310717821121, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015806620649527758, + "grad_norm": 5.0489349365234375, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8451142907142639, + "num_tokens": 159746631.0, + "step": 4182 + }, + { + "epoch": 0.5321205953441038, + "ewc_loss": 0.04098459333181381, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015838106628507376, + "grad_norm": 5.052452087402344, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8597040176391602, + "num_tokens": 159787753.0, + "step": 4183 + }, + { + "epoch": 0.5322478056226944, + "ewc_loss": 0.04091537743806839, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015768891898915172, + "grad_norm": 5.11115026473999, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8494079113006592, + "num_tokens": 159823092.0, + "step": 4184 + }, + { + "epoch": 0.5323750159012848, + "ewc_loss": 0.04097048193216324, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001582399709150195, + "grad_norm": 5.0770182609558105, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8472409248352051, + "num_tokens": 159858480.0, + "step": 4185 + }, + { + "epoch": 0.5325022261798753, + "ewc_loss": 0.04092460125684738, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015778117813169956, + "grad_norm": 5.081913948059082, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8476280570030212, + "num_tokens": 159896220.0, + "step": 4186 + }, + { + "epoch": 0.5326294364584658, + "ewc_loss": 0.04110928624868393, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015840728883631527, + "grad_norm": 5.08357048034668, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8549676537513733, + "num_tokens": 159930995.0, + "step": 4187 + }, + { + "epoch": 0.5327566467370564, + "ewc_loss": 0.04094678908586502, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015800305118318647, + "grad_norm": 5.0303144454956055, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.851672887802124, + "num_tokens": 159968013.0, + "step": 4188 + }, + { + "epoch": 0.5328838570156469, + "ewc_loss": 0.04100184142589569, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001585535501362756, + "grad_norm": 5.0988850593566895, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8499904870986938, + "num_tokens": 160004085.0, + "step": 4189 + }, + { + "epoch": 0.5330110672942374, + "ewc_loss": 0.041099898517131805, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015831341443117708, + "grad_norm": 5.006499290466309, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8676517605781555, + "num_tokens": 160040606.0, + "step": 4190 + }, + { + "epoch": 0.5331382775728278, + "ewc_loss": 0.040995292365550995, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015848808106966317, + "grad_norm": 5.088129997253418, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8677937984466553, + "num_tokens": 160080792.0, + "step": 4191 + }, + { + "epoch": 0.5332654878514184, + "ewc_loss": 0.04100348800420761, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001585700229043141, + "grad_norm": 5.087783336639404, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8452059626579285, + "num_tokens": 160117055.0, + "step": 4192 + }, + { + "epoch": 0.5333926981300089, + "ewc_loss": 0.040917735546827316, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001577125076437369, + "grad_norm": 5.05733585357666, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8621416687965393, + "num_tokens": 160153223.0, + "step": 4193 + }, + { + "epoch": 0.5335199084085994, + "ewc_loss": 0.04093554988503456, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015789065218996257, + "grad_norm": 5.128321647644043, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8393141627311707, + "num_tokens": 160190920.0, + "step": 4194 + }, + { + "epoch": 0.53364711868719, + "ewc_loss": 0.04094592109322548, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015799436368979514, + "grad_norm": 5.0900397300720215, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8548711538314819, + "num_tokens": 160221898.0, + "step": 4195 + }, + { + "epoch": 0.5337743289657805, + "ewc_loss": 0.04089261591434479, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001574612979311496, + "grad_norm": 5.005542278289795, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8591511249542236, + "num_tokens": 160267102.0, + "step": 4196 + }, + { + "epoch": 0.5339015392443709, + "ewc_loss": 0.040891699492931366, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015745213022455573, + "grad_norm": 5.075113773345947, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8619875907897949, + "num_tokens": 160304422.0, + "step": 4197 + }, + { + "epoch": 0.5340287495229614, + "ewc_loss": 0.04089750722050667, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015751022147014737, + "grad_norm": 5.071922779083252, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8537901043891907, + "num_tokens": 160339006.0, + "step": 4198 + }, + { + "epoch": 0.534155959801552, + "ewc_loss": 0.04087522625923157, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001572874461999163, + "grad_norm": 5.009125709533691, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8772788643836975, + "num_tokens": 160377504.0, + "step": 4199 + }, + { + "epoch": 0.5342831700801425, + "ewc_loss": 0.040853407233953476, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015706922567915171, + "grad_norm": 5.097355842590332, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8360612392425537, + "num_tokens": 160413315.0, + "step": 4200 + }, + { + "epoch": 0.534410380358733, + "ewc_loss": 0.04093928635120392, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015792800695635378, + "grad_norm": 5.009261131286621, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8710216283798218, + "num_tokens": 160455034.0, + "step": 4201 + }, + { + "epoch": 0.5345375906373235, + "ewc_loss": 0.040879324078559875, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015732840984128416, + "grad_norm": 5.076621055603027, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8521913290023804, + "num_tokens": 160490505.0, + "step": 4202 + }, + { + "epoch": 0.534664800915914, + "ewc_loss": 0.04095827788114548, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015811793855391443, + "grad_norm": 5.106203556060791, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8665511608123779, + "num_tokens": 160528712.0, + "step": 4203 + }, + { + "epoch": 0.5347920111945045, + "ewc_loss": 0.04089631885290146, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015749836165923625, + "grad_norm": 5.041658401489258, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8489052057266235, + "num_tokens": 160567945.0, + "step": 4204 + }, + { + "epoch": 0.534919221473095, + "ewc_loss": 0.040944308042526245, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015797821106389165, + "grad_norm": 5.231429576873779, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8633482456207275, + "num_tokens": 160596767.0, + "step": 4205 + }, + { + "epoch": 0.5350464317516855, + "ewc_loss": 0.04099692404270172, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015850442287046462, + "grad_norm": 5.040527820587158, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8538455963134766, + "num_tokens": 160636912.0, + "step": 4206 + }, + { + "epoch": 0.5351736420302761, + "ewc_loss": 0.04087522625923157, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015728741709608585, + "grad_norm": 5.103610992431641, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8504712581634521, + "num_tokens": 160674588.0, + "step": 4207 + }, + { + "epoch": 0.5353008523088666, + "ewc_loss": 0.04098492115736008, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015838434046600014, + "grad_norm": 5.09556770324707, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8462076187133789, + "num_tokens": 160713441.0, + "step": 4208 + }, + { + "epoch": 0.5354280625874571, + "ewc_loss": 0.04097605124115944, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.0001570749591337517, + "grad_norm": 5.065529823303223, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.866186261177063, + "num_tokens": 160752716.0, + "step": 4209 + }, + { + "epoch": 0.5355552728660475, + "ewc_loss": 0.04090384766459465, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015757362416479737, + "grad_norm": 5.0578155517578125, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8668361902236938, + "num_tokens": 160798771.0, + "step": 4210 + }, + { + "epoch": 0.5356824831446381, + "ewc_loss": 0.04087115824222565, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015724672994110733, + "grad_norm": 5.008111000061035, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8738834261894226, + "num_tokens": 160843085.0, + "step": 4211 + }, + { + "epoch": 0.5358096934232286, + "ewc_loss": 0.04085122048854828, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015704738325439394, + "grad_norm": 5.09257698059082, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8493639230728149, + "num_tokens": 160883073.0, + "step": 4212 + }, + { + "epoch": 0.5359369037018191, + "ewc_loss": 0.04090588539838791, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001575940113980323, + "grad_norm": 5.015997409820557, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8631207346916199, + "num_tokens": 160923734.0, + "step": 4213 + }, + { + "epoch": 0.5360641139804097, + "ewc_loss": 0.04089867323637009, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001575218775542453, + "grad_norm": 5.116364479064941, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8540463447570801, + "num_tokens": 160963460.0, + "step": 4214 + }, + { + "epoch": 0.5361913242590002, + "ewc_loss": 0.04094168543815613, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015795200306456536, + "grad_norm": 5.058810234069824, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8556324243545532, + "num_tokens": 161003691.0, + "step": 4215 + }, + { + "epoch": 0.5363185345375906, + "ewc_loss": 0.040900107473134995, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001575362402945757, + "grad_norm": 5.0470991134643555, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.857974112033844, + "num_tokens": 161042056.0, + "step": 4216 + }, + { + "epoch": 0.5364457448161811, + "ewc_loss": 0.04090920090675354, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001576271461090073, + "grad_norm": 5.045098781585693, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8802323937416077, + "num_tokens": 161078670.0, + "step": 4217 + }, + { + "epoch": 0.5365729550947717, + "ewc_loss": 0.040960557758808136, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015814074140507728, + "grad_norm": 5.125208377838135, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8706158399581909, + "num_tokens": 161113992.0, + "step": 4218 + }, + { + "epoch": 0.5367001653733622, + "ewc_loss": 0.04088972508907318, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015743239782750607, + "grad_norm": 5.05003547668457, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8620253801345825, + "num_tokens": 161153763.0, + "step": 4219 + }, + { + "epoch": 0.5368273756519527, + "ewc_loss": 0.04088151454925537, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015735031047370285, + "grad_norm": 5.148502349853516, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8411052227020264, + "num_tokens": 161184186.0, + "step": 4220 + }, + { + "epoch": 0.5369545859305432, + "ewc_loss": 0.04103546589612961, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015766909928061068, + "grad_norm": 5.073966026306152, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8539182543754578, + "num_tokens": 161219298.0, + "step": 4221 + }, + { + "epoch": 0.5370817962091337, + "ewc_loss": 0.04100728780031204, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.0001573873305460438, + "grad_norm": 5.14275598526001, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.873102068901062, + "num_tokens": 161255252.0, + "step": 4222 + }, + { + "epoch": 0.5372090064877242, + "ewc_loss": 0.04086815565824509, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015721672389190644, + "grad_norm": 5.098699569702148, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8614081144332886, + "num_tokens": 161295890.0, + "step": 4223 + }, + { + "epoch": 0.5373362167663147, + "ewc_loss": 0.040993474423885345, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015724921831861138, + "grad_norm": 5.1645612716674805, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8537572026252747, + "num_tokens": 161330808.0, + "step": 4224 + }, + { + "epoch": 0.5374634270449052, + "ewc_loss": 0.0408906564116478, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015744174015708268, + "grad_norm": 5.026486873626709, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8651890754699707, + "num_tokens": 161370515.0, + "step": 4225 + }, + { + "epoch": 0.5375906373234958, + "ewc_loss": 0.04109770059585571, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.00015707076818216592, + "grad_norm": 5.122153282165527, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8704237937927246, + "num_tokens": 161405257.0, + "step": 4226 + }, + { + "epoch": 0.5377178476020863, + "ewc_loss": 0.040908083319664, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.00015761599934194237, + "grad_norm": 5.054081439971924, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8573073148727417, + "num_tokens": 161443526.0, + "step": 4227 + }, + { + "epoch": 0.5378450578806767, + "ewc_loss": 0.04111597314476967, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.0001572534820297733, + "grad_norm": 5.156248569488525, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8508381843566895, + "num_tokens": 161481262.0, + "step": 4228 + }, + { + "epoch": 0.5379722681592672, + "ewc_loss": 0.041080281138420105, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015811724006198347, + "grad_norm": 5.09603214263916, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8424539566040039, + "num_tokens": 161517756.0, + "step": 4229 + }, + { + "epoch": 0.5380994784378578, + "ewc_loss": 0.04096811264753342, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015699556388426572, + "grad_norm": 5.066222667694092, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8658669590950012, + "num_tokens": 161552680.0, + "step": 4230 + }, + { + "epoch": 0.5382266887164483, + "ewc_loss": 0.041017547249794006, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.0001574899215484038, + "grad_norm": 5.123679161071777, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8544530868530273, + "num_tokens": 161588607.0, + "step": 4231 + }, + { + "epoch": 0.5383538989950388, + "ewc_loss": 0.040961530059576035, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001581504475325346, + "grad_norm": 5.1160783767700195, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8400304317474365, + "num_tokens": 161626427.0, + "step": 4232 + }, + { + "epoch": 0.5384811092736294, + "ewc_loss": 0.04090922325849533, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001576274080434814, + "grad_norm": 5.053335189819336, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8632356524467468, + "num_tokens": 161665182.0, + "step": 4233 + }, + { + "epoch": 0.5386083195522198, + "ewc_loss": 0.04090007394552231, + "ewc_loss_diag": 2.5153160095214844e-05, + "ewc_loss_parallel": 0.0001575358910486102, + "grad_norm": 5.083569526672363, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8573538064956665, + "num_tokens": 161704685.0, + "step": 4234 + }, + { + "epoch": 0.5387355298308103, + "ewc_loss": 0.04103713110089302, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015768576122354716, + "grad_norm": 5.029673099517822, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8625924587249756, + "num_tokens": 161742960.0, + "step": 4235 + }, + { + "epoch": 0.5388627401094008, + "ewc_loss": 0.04101695865392685, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015748401347082108, + "grad_norm": 5.0652971267700195, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8566612601280212, + "num_tokens": 161778199.0, + "step": 4236 + }, + { + "epoch": 0.5389899503879914, + "ewc_loss": 0.04129473865032196, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.0001578204391989857, + "grad_norm": 5.0665459632873535, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8536710739135742, + "num_tokens": 161811235.0, + "step": 4237 + }, + { + "epoch": 0.5391171606665819, + "ewc_loss": 0.04105468839406967, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015786134463269264, + "grad_norm": 5.055288791656494, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8572230935096741, + "num_tokens": 161847669.0, + "step": 4238 + }, + { + "epoch": 0.5392443709451724, + "ewc_loss": 0.04135372117161751, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015841025742702186, + "grad_norm": 5.106259346008301, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8433043956756592, + "num_tokens": 161888470.0, + "step": 4239 + }, + { + "epoch": 0.5393715812237628, + "ewc_loss": 0.04136013984680176, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015847443137317896, + "grad_norm": 5.143229961395264, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8375509977340698, + "num_tokens": 161924563.0, + "step": 4240 + }, + { + "epoch": 0.5394987915023534, + "ewc_loss": 0.04108399897813797, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015815442020539194, + "grad_norm": 5.074530124664307, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8437970280647278, + "num_tokens": 161959062.0, + "step": 4241 + }, + { + "epoch": 0.5396260017809439, + "ewc_loss": 0.04106912761926651, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.0001580057287355885, + "grad_norm": 5.086234092712402, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8616445064544678, + "num_tokens": 161994755.0, + "step": 4242 + }, + { + "epoch": 0.5397532120595344, + "ewc_loss": 0.04109097644686699, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.0001582242111908272, + "grad_norm": 5.054283618927002, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8667799234390259, + "num_tokens": 162032812.0, + "step": 4243 + }, + { + "epoch": 0.5398804223381249, + "ewc_loss": 0.04109352082014084, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015824967704247683, + "grad_norm": 5.084807872772217, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8603254556655884, + "num_tokens": 162067130.0, + "step": 4244 + }, + { + "epoch": 0.5400076326167155, + "ewc_loss": 0.04134821146726608, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015835514932405204, + "grad_norm": 5.038002967834473, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8594534397125244, + "num_tokens": 162107452.0, + "step": 4245 + }, + { + "epoch": 0.5401348428953059, + "ewc_loss": 0.041099533438682556, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015830979100428522, + "grad_norm": 5.033344268798828, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8576110601425171, + "num_tokens": 162144220.0, + "step": 4246 + }, + { + "epoch": 0.5402620531738964, + "ewc_loss": 0.0411548987030983, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015886346227489412, + "grad_norm": 5.597728729248047, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8661658763885498, + "num_tokens": 162181798.0, + "step": 4247 + }, + { + "epoch": 0.540389263452487, + "ewc_loss": 0.04159976541996002, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.0001608706807019189, + "grad_norm": 5.036742210388184, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8509869575500488, + "num_tokens": 162222501.0, + "step": 4248 + }, + { + "epoch": 0.5405164737310775, + "ewc_loss": 0.04121556878089905, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015702875680290163, + "grad_norm": 5.077953815460205, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8539596199989319, + "num_tokens": 162265533.0, + "step": 4249 + }, + { + "epoch": 0.540643684009668, + "ewc_loss": 0.04139681160449982, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015884116874076426, + "grad_norm": 5.069182872772217, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8728057146072388, + "num_tokens": 162304984.0, + "step": 4250 + }, + { + "epoch": 0.5407708942882585, + "ewc_loss": 0.04134203493595123, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015829337644390762, + "grad_norm": 5.111566543579102, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8480880260467529, + "num_tokens": 162342358.0, + "step": 4251 + }, + { + "epoch": 0.540898104566849, + "ewc_loss": 0.04112142324447632, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015852869546506554, + "grad_norm": 5.093936443328857, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8589508533477783, + "num_tokens": 162377343.0, + "step": 4252 + }, + { + "epoch": 0.5410253148454395, + "ewc_loss": 0.04115741327404976, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.0001588885934324935, + "grad_norm": 5.121346473693848, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8577874302864075, + "num_tokens": 162416416.0, + "step": 4253 + }, + { + "epoch": 0.54115252512403, + "ewc_loss": 0.04113145172595978, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015862898726481944, + "grad_norm": 5.110411167144775, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8837024569511414, + "num_tokens": 162447108.0, + "step": 4254 + }, + { + "epoch": 0.5412797354026205, + "ewc_loss": 0.04110819846391678, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015839641855563968, + "grad_norm": 5.092203140258789, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8594991564750671, + "num_tokens": 162489659.0, + "step": 4255 + }, + { + "epoch": 0.5414069456812111, + "ewc_loss": 0.04116763174533844, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015899074787739664, + "grad_norm": 5.1197309494018555, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8472821116447449, + "num_tokens": 162525816.0, + "step": 4256 + }, + { + "epoch": 0.5415341559598016, + "ewc_loss": 0.04114878177642822, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015880227147135884, + "grad_norm": 5.134956359863281, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.854485273361206, + "num_tokens": 162564154.0, + "step": 4257 + }, + { + "epoch": 0.5416613662383921, + "ewc_loss": 0.04147060960531235, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015957916912157089, + "grad_norm": 5.1185150146484375, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8518663644790649, + "num_tokens": 162602728.0, + "step": 4258 + }, + { + "epoch": 0.5417885765169825, + "ewc_loss": 0.041188374161720276, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015919821453280747, + "grad_norm": 5.0944414138793945, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8616842031478882, + "num_tokens": 162636659.0, + "step": 4259 + }, + { + "epoch": 0.5419157867955731, + "ewc_loss": 0.0411984920501709, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015929937944747508, + "grad_norm": 5.079361438751221, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8737137317657471, + "num_tokens": 162676639.0, + "step": 4260 + }, + { + "epoch": 0.5420429970741636, + "ewc_loss": 0.04130970686674118, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.000159190793056041, + "grad_norm": 5.113952159881592, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8391088247299194, + "num_tokens": 162716763.0, + "step": 4261 + }, + { + "epoch": 0.5421702073527541, + "ewc_loss": 0.04157335311174393, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015938586147967726, + "grad_norm": 5.130792617797852, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8473765254020691, + "num_tokens": 162753943.0, + "step": 4262 + }, + { + "epoch": 0.5422974176313446, + "ewc_loss": 0.04143352434039116, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.0001592082844581455, + "grad_norm": 5.128977298736572, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8530928492546082, + "num_tokens": 162792938.0, + "step": 4263 + }, + { + "epoch": 0.5424246279099352, + "ewc_loss": 0.041435521095991135, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015922824968583882, + "grad_norm": 5.084421157836914, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8554254174232483, + "num_tokens": 162838938.0, + "step": 4264 + }, + { + "epoch": 0.5425518381885256, + "ewc_loss": 0.04138551279902458, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.000158728173119016, + "grad_norm": 5.092037677764893, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8642747402191162, + "num_tokens": 162875026.0, + "step": 4265 + }, + { + "epoch": 0.5426790484671161, + "ewc_loss": 0.041165728121995926, + "ewc_loss_diag": 2.5272369384765625e-05, + "ewc_loss_parallel": 0.00015897172852419317, + "grad_norm": 5.143672943115234, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8605113625526428, + "num_tokens": 162910724.0, + "step": 4266 + }, + { + "epoch": 0.5428062587457066, + "ewc_loss": 0.04150613397359848, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015871367941144854, + "grad_norm": 5.199368953704834, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8604007959365845, + "num_tokens": 162941602.0, + "step": 4267 + }, + { + "epoch": 0.5429334690242972, + "ewc_loss": 0.04140913113951683, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015896435070317239, + "grad_norm": 5.2363409996032715, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8456348180770874, + "num_tokens": 162980760.0, + "step": 4268 + }, + { + "epoch": 0.5430606793028877, + "ewc_loss": 0.04149019345641136, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.000158554277732037, + "grad_norm": 5.081958770751953, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8689776659011841, + "num_tokens": 163016155.0, + "step": 4269 + }, + { + "epoch": 0.5431878895814782, + "ewc_loss": 0.041324466466903687, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.000158117720275186, + "grad_norm": 5.268583297729492, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8497058749198914, + "num_tokens": 163052519.0, + "step": 4270 + }, + { + "epoch": 0.5433150998600687, + "ewc_loss": 0.04149423912167549, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015859473205637187, + "grad_norm": 5.023260116577148, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8665865659713745, + "num_tokens": 163091806.0, + "step": 4271 + }, + { + "epoch": 0.5434423101386592, + "ewc_loss": 0.041495516896247864, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015860749408602715, + "grad_norm": 5.315657138824463, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8732373714447021, + "num_tokens": 163128830.0, + "step": 4272 + }, + { + "epoch": 0.5435695204172497, + "ewc_loss": 0.04141072556376457, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015898029960226268, + "grad_norm": 5.061883926391602, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8526749610900879, + "num_tokens": 163169567.0, + "step": 4273 + }, + { + "epoch": 0.5436967306958402, + "ewc_loss": 0.041264064610004425, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.00015751369937788695, + "grad_norm": 5.330670356750488, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8510116338729858, + "num_tokens": 163202913.0, + "step": 4274 + }, + { + "epoch": 0.5438239409744308, + "ewc_loss": 0.04147125780582428, + "ewc_loss_diag": 2.5510787963867188e-05, + "ewc_loss_parallel": 0.0001595856447238475, + "grad_norm": 5.100968360900879, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8657374382019043, + "num_tokens": 163236741.0, + "step": 4275 + }, + { + "epoch": 0.5439511512530213, + "ewc_loss": 0.041380081325769424, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015745316341053694, + "grad_norm": 5.10603666305542, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8518844842910767, + "num_tokens": 163281402.0, + "step": 4276 + }, + { + "epoch": 0.5440783615316117, + "ewc_loss": 0.04151192680001259, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015877161058597267, + "grad_norm": 5.121306419372559, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8666648864746094, + "num_tokens": 163325288.0, + "step": 4277 + }, + { + "epoch": 0.5442055718102022, + "ewc_loss": 0.04147012531757355, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001583535922691226, + "grad_norm": 5.192340850830078, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8478460907936096, + "num_tokens": 163367471.0, + "step": 4278 + }, + { + "epoch": 0.5443327820887928, + "ewc_loss": 0.041287314146757126, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.00015896689728833735, + "grad_norm": 5.225597381591797, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8583403825759888, + "num_tokens": 163400077.0, + "step": 4279 + }, + { + "epoch": 0.5444599923673833, + "ewc_loss": 0.041166167706251144, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.00015775543579366058, + "grad_norm": 5.071349143981934, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8611764907836914, + "num_tokens": 163433739.0, + "step": 4280 + }, + { + "epoch": 0.5445872026459738, + "ewc_loss": 0.04114707186818123, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.00015756447101011872, + "grad_norm": 5.104365825653076, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.857444703578949, + "num_tokens": 163467447.0, + "step": 4281 + }, + { + "epoch": 0.5447144129245644, + "ewc_loss": 0.04149467498064041, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015859906852710992, + "grad_norm": 5.190202713012695, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.854301393032074, + "num_tokens": 163506633.0, + "step": 4282 + }, + { + "epoch": 0.5448416232031548, + "ewc_loss": 0.04122054576873779, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.00015829918265808374, + "grad_norm": 5.081470012664795, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8413364887237549, + "num_tokens": 163542773.0, + "step": 4283 + }, + { + "epoch": 0.5449688334817453, + "ewc_loss": 0.04144898056983948, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015814215294085443, + "grad_norm": 5.097916603088379, + "learning_rate": 1e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8245299458503723, + "num_tokens": 163585247.0, + "step": 4284 + }, + { + "epoch": 0.5450960437603358, + "ewc_loss": 0.041497040539979935, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015862274449318647, + "grad_norm": 5.14747428894043, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8461421132087708, + "num_tokens": 163622802.0, + "step": 4285 + }, + { + "epoch": 0.5452232540389264, + "ewc_loss": 0.04147116839885712, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001583640114404261, + "grad_norm": 5.094213008880615, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8815404176712036, + "num_tokens": 163655980.0, + "step": 4286 + }, + { + "epoch": 0.5453504643175169, + "ewc_loss": 0.0412091463804245, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.00015818521205801517, + "grad_norm": 5.071591377258301, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8635224103927612, + "num_tokens": 163691494.0, + "step": 4287 + }, + { + "epoch": 0.5454776745961074, + "ewc_loss": 0.04127328842878342, + "ewc_loss_diag": 2.5391578674316406e-05, + "ewc_loss_parallel": 0.0001588266168255359, + "grad_norm": 5.141809940338135, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8694618940353394, + "num_tokens": 163725169.0, + "step": 4288 + }, + { + "epoch": 0.5456048848746978, + "ewc_loss": 0.041506849229335785, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015872082440182567, + "grad_norm": 5.04961633682251, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8609675168991089, + "num_tokens": 163766050.0, + "step": 4289 + }, + { + "epoch": 0.5457320951532884, + "ewc_loss": 0.041499316692352295, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001586455327924341, + "grad_norm": 5.132709980010986, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8516217470169067, + "num_tokens": 163799559.0, + "step": 4290 + }, + { + "epoch": 0.5458593054318789, + "ewc_loss": 0.04155892878770828, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015924160834401846, + "grad_norm": 5.189516067504883, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8518484830856323, + "num_tokens": 163837052.0, + "step": 4291 + }, + { + "epoch": 0.5459865157104694, + "ewc_loss": 0.04158245399594307, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001594768837094307, + "grad_norm": 5.119508743286133, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8548314571380615, + "num_tokens": 163872494.0, + "step": 4292 + }, + { + "epoch": 0.5461137259890599, + "ewc_loss": 0.04154253751039505, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015907769557088614, + "grad_norm": 5.068618297576904, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8504511713981628, + "num_tokens": 163913499.0, + "step": 4293 + }, + { + "epoch": 0.5462409362676505, + "ewc_loss": 0.041578762233257294, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015943998005241156, + "grad_norm": 5.08082389831543, + "learning_rate": 1e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8381337523460388, + "num_tokens": 163957965.0, + "step": 4294 + }, + { + "epoch": 0.5463681465462409, + "ewc_loss": 0.041610635817050934, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001597587252035737, + "grad_norm": 5.09462308883667, + "learning_rate": 1e-06, + "loss": 0.596, + "mean_token_accuracy": 0.8170071244239807, + "num_tokens": 164005661.0, + "step": 4295 + }, + { + "epoch": 0.5464953568248314, + "ewc_loss": 0.0415818989276886, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015947131032589823, + "grad_norm": 5.089183330535889, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8731863498687744, + "num_tokens": 164044717.0, + "step": 4296 + }, + { + "epoch": 0.5466225671034219, + "ewc_loss": 0.041579969227313995, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001594520581420511, + "grad_norm": 5.147379398345947, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8568960428237915, + "num_tokens": 164080022.0, + "step": 4297 + }, + { + "epoch": 0.5467497773820125, + "ewc_loss": 0.041585132479667664, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015950364468153566, + "grad_norm": 5.181695461273193, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8640521764755249, + "num_tokens": 164112749.0, + "step": 4298 + }, + { + "epoch": 0.546876987660603, + "ewc_loss": 0.04156205803155899, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015927290951367468, + "grad_norm": 5.131906986236572, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8526450991630554, + "num_tokens": 164152517.0, + "step": 4299 + }, + { + "epoch": 0.5470041979391935, + "ewc_loss": 0.04153925180435181, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001590448518982157, + "grad_norm": 5.039022922515869, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8640661239624023, + "num_tokens": 164195245.0, + "step": 4300 + }, + { + "epoch": 0.5471314082177839, + "ewc_loss": 0.041505612432956696, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015870844072196633, + "grad_norm": 5.119926929473877, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8523076772689819, + "num_tokens": 164230643.0, + "step": 4301 + }, + { + "epoch": 0.5472586184963745, + "ewc_loss": 0.04159650206565857, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015961735334713012, + "grad_norm": 5.110787391662598, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8542548418045044, + "num_tokens": 164271089.0, + "step": 4302 + }, + { + "epoch": 0.547385828774965, + "ewc_loss": 0.041567735373973846, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.000159329705638811, + "grad_norm": 5.056582450866699, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8558831214904785, + "num_tokens": 164310618.0, + "step": 4303 + }, + { + "epoch": 0.5475130390535555, + "ewc_loss": 0.04158814996480942, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015953386900946498, + "grad_norm": 5.200121879577637, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8446124792098999, + "num_tokens": 164348630.0, + "step": 4304 + }, + { + "epoch": 0.5476402493321461, + "ewc_loss": 0.04163428768515587, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015999522292986512, + "grad_norm": 5.148601055145264, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8643803596496582, + "num_tokens": 164383735.0, + "step": 4305 + }, + { + "epoch": 0.5477674596107366, + "ewc_loss": 0.041551925241947174, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015917158452793956, + "grad_norm": 5.090551853179932, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8578206300735474, + "num_tokens": 164426115.0, + "step": 4306 + }, + { + "epoch": 0.5478946698893271, + "ewc_loss": 0.04158990830183029, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015955143317114562, + "grad_norm": 5.221938610076904, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8504036664962769, + "num_tokens": 164462119.0, + "step": 4307 + }, + { + "epoch": 0.5480218801679175, + "ewc_loss": 0.0415835939347744, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001594882778590545, + "grad_norm": 5.1202778816223145, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8550381064414978, + "num_tokens": 164503030.0, + "step": 4308 + }, + { + "epoch": 0.5481490904465081, + "ewc_loss": 0.04150567203760147, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015870905190240592, + "grad_norm": 5.078586101531982, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8592950105667114, + "num_tokens": 164541091.0, + "step": 4309 + }, + { + "epoch": 0.5482763007250986, + "ewc_loss": 0.04158066213130951, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001594589848536998, + "grad_norm": 5.1334757804870605, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8513857126235962, + "num_tokens": 164577678.0, + "step": 4310 + }, + { + "epoch": 0.5484035110036891, + "ewc_loss": 0.04156051576137543, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015925749903544784, + "grad_norm": 5.0862531661987305, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8549661636352539, + "num_tokens": 164612779.0, + "step": 4311 + }, + { + "epoch": 0.5485307212822796, + "ewc_loss": 0.04160144180059433, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015966675709933043, + "grad_norm": 5.143177509307861, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8623547554016113, + "num_tokens": 164649849.0, + "step": 4312 + }, + { + "epoch": 0.5486579315608702, + "ewc_loss": 0.041629940271377563, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015995172725524753, + "grad_norm": 5.062713623046875, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8539701700210571, + "num_tokens": 164690538.0, + "step": 4313 + }, + { + "epoch": 0.5487851418394606, + "ewc_loss": 0.041541218757629395, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001590645406395197, + "grad_norm": 5.089707851409912, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8581099510192871, + "num_tokens": 164729338.0, + "step": 4314 + }, + { + "epoch": 0.5489123521180511, + "ewc_loss": 0.041639089584350586, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016004324425011873, + "grad_norm": 5.182905673980713, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8687567710876465, + "num_tokens": 164766648.0, + "step": 4315 + }, + { + "epoch": 0.5490395623966416, + "ewc_loss": 0.041601017117500305, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001596625370439142, + "grad_norm": 5.0580339431762695, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8559463024139404, + "num_tokens": 164804254.0, + "step": 4316 + }, + { + "epoch": 0.5491667726752322, + "ewc_loss": 0.04158814251422882, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015953379624988884, + "grad_norm": 5.095590114593506, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8707741498947144, + "num_tokens": 164839943.0, + "step": 4317 + }, + { + "epoch": 0.5492939829538227, + "ewc_loss": 0.041700899600982666, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016066136595327407, + "grad_norm": 5.091753959655762, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8502789735794067, + "num_tokens": 164887057.0, + "step": 4318 + }, + { + "epoch": 0.5494211932324132, + "ewc_loss": 0.04165218397974968, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016017418238334358, + "grad_norm": 5.1128740310668945, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8571321964263916, + "num_tokens": 164928515.0, + "step": 4319 + }, + { + "epoch": 0.5495484035110036, + "ewc_loss": 0.04164590686559677, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016011143452487886, + "grad_norm": 5.122934818267822, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8567644357681274, + "num_tokens": 164963818.0, + "step": 4320 + }, + { + "epoch": 0.5496756137895942, + "ewc_loss": 0.041641850024461746, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016007084923330694, + "grad_norm": 5.089322566986084, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8709703683853149, + "num_tokens": 165001091.0, + "step": 4321 + }, + { + "epoch": 0.5498028240681847, + "ewc_loss": 0.04168497771024704, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016050212434493005, + "grad_norm": 5.129513263702393, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8402297496795654, + "num_tokens": 165046009.0, + "step": 4322 + }, + { + "epoch": 0.5499300343467752, + "ewc_loss": 0.041669536381959915, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016034771397244185, + "grad_norm": 5.088333606719971, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8547254800796509, + "num_tokens": 165090729.0, + "step": 4323 + }, + { + "epoch": 0.5500572446253658, + "ewc_loss": 0.04164006933569908, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016005303768906742, + "grad_norm": 5.110128879547119, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8743537664413452, + "num_tokens": 165129771.0, + "step": 4324 + }, + { + "epoch": 0.5501844549039563, + "ewc_loss": 0.04168844223022461, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016053678700700402, + "grad_norm": 5.142951965332031, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8721607327461243, + "num_tokens": 165164310.0, + "step": 4325 + }, + { + "epoch": 0.5503116651825467, + "ewc_loss": 0.04166827350854874, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016033505380619317, + "grad_norm": 5.125344753265381, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8549480438232422, + "num_tokens": 165202842.0, + "step": 4326 + }, + { + "epoch": 0.5504388754611372, + "ewc_loss": 0.04163585603237152, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016001092444639653, + "grad_norm": 5.16071081161499, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8616670370101929, + "num_tokens": 165235998.0, + "step": 4327 + }, + { + "epoch": 0.5505660857397278, + "ewc_loss": 0.04166409373283386, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016029327525757253, + "grad_norm": 5.082398414611816, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8588959574699402, + "num_tokens": 165271779.0, + "step": 4328 + }, + { + "epoch": 0.5506932960183183, + "ewc_loss": 0.04162609577178955, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00015991332475095987, + "grad_norm": 5.146859645843506, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8514966368675232, + "num_tokens": 165306136.0, + "step": 4329 + }, + { + "epoch": 0.5508205062969088, + "ewc_loss": 0.04169077426195145, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016056008462328464, + "grad_norm": 5.066555023193359, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8721035122871399, + "num_tokens": 165342401.0, + "step": 4330 + }, + { + "epoch": 0.5509477165754993, + "ewc_loss": 0.04165485501289368, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001602008705958724, + "grad_norm": 5.119933605194092, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8603605628013611, + "num_tokens": 165380120.0, + "step": 4331 + }, + { + "epoch": 0.5510749268540898, + "ewc_loss": 0.041694194078445435, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016059426707215607, + "grad_norm": 5.125982284545898, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.844963788986206, + "num_tokens": 165418210.0, + "step": 4332 + }, + { + "epoch": 0.5512021371326803, + "ewc_loss": 0.041711702942848206, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016076939937192947, + "grad_norm": 5.113580226898193, + "learning_rate": 1e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8381072282791138, + "num_tokens": 165460952.0, + "step": 4333 + }, + { + "epoch": 0.5513293474112708, + "ewc_loss": 0.04183288663625717, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016076050815172493, + "grad_norm": 5.068732261657715, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8704007267951965, + "num_tokens": 165500370.0, + "step": 4334 + }, + { + "epoch": 0.5514565576898613, + "ewc_loss": 0.04170847311615944, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016073707956820726, + "grad_norm": 5.219587326049805, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.857274055480957, + "num_tokens": 165534089.0, + "step": 4335 + }, + { + "epoch": 0.5515837679684519, + "ewc_loss": 0.04174958914518356, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016114824393298477, + "grad_norm": 5.093291759490967, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8509122729301453, + "num_tokens": 165573111.0, + "step": 4336 + }, + { + "epoch": 0.5517109782470424, + "ewc_loss": 0.04166078567504883, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001602602278580889, + "grad_norm": 5.118127822875977, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8475927114486694, + "num_tokens": 165611942.0, + "step": 4337 + }, + { + "epoch": 0.5518381885256328, + "ewc_loss": 0.04176327958703041, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016128514835145324, + "grad_norm": 5.159447193145752, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8650438785552979, + "num_tokens": 165651703.0, + "step": 4338 + }, + { + "epoch": 0.5519653988042234, + "ewc_loss": 0.041661933064460754, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016027165111154318, + "grad_norm": 5.064780235290527, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8548228740692139, + "num_tokens": 165691257.0, + "step": 4339 + }, + { + "epoch": 0.5520926090828139, + "ewc_loss": 0.041698817163705826, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016064051305875182, + "grad_norm": 5.130256175994873, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8732828497886658, + "num_tokens": 165727947.0, + "step": 4340 + }, + { + "epoch": 0.5522198193614044, + "ewc_loss": 0.04172009229660034, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016085324750747532, + "grad_norm": 5.130476474761963, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8669424653053284, + "num_tokens": 165763400.0, + "step": 4341 + }, + { + "epoch": 0.5523470296399949, + "ewc_loss": 0.041804857552051544, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001604802382644266, + "grad_norm": 5.053512096405029, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8585624694824219, + "num_tokens": 165809586.0, + "step": 4342 + }, + { + "epoch": 0.5524742399185855, + "ewc_loss": 0.041744064539670944, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016109299031086266, + "grad_norm": 5.216681957244873, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.860068142414093, + "num_tokens": 165839565.0, + "step": 4343 + }, + { + "epoch": 0.5526014501971759, + "ewc_loss": 0.04174834489822388, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016113580204546452, + "grad_norm": 5.1192402839660645, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8613386154174805, + "num_tokens": 165873539.0, + "step": 4344 + }, + { + "epoch": 0.5527286604757664, + "ewc_loss": 0.041695673018693924, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016060908092185855, + "grad_norm": 5.041757106781006, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8661770224571228, + "num_tokens": 165920987.0, + "step": 4345 + }, + { + "epoch": 0.5528558707543569, + "ewc_loss": 0.04174306243658066, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016108296404127032, + "grad_norm": 5.182951927185059, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8475607633590698, + "num_tokens": 165965661.0, + "step": 4346 + }, + { + "epoch": 0.5529830810329475, + "ewc_loss": 0.04175204038619995, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001611727348063141, + "grad_norm": 5.223614692687988, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8512347936630249, + "num_tokens": 165998239.0, + "step": 4347 + }, + { + "epoch": 0.553110291311538, + "ewc_loss": 0.041675008833408356, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016040241462178528, + "grad_norm": 5.095711708068848, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8552250862121582, + "num_tokens": 166036489.0, + "step": 4348 + }, + { + "epoch": 0.5532375015901285, + "ewc_loss": 0.041781701147556305, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001602486299816519, + "grad_norm": 12.47524356842041, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8539023399353027, + "num_tokens": 166078774.0, + "step": 4349 + }, + { + "epoch": 0.5533647118687189, + "ewc_loss": 0.04997619614005089, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00024219360784627497, + "grad_norm": 6.5930986404418945, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8643192648887634, + "num_tokens": 166119555.0, + "step": 4350 + }, + { + "epoch": 0.5534919221473095, + "ewc_loss": 0.041010141372680664, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00015009165508672595, + "grad_norm": 4.532923221588135, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8561293482780457, + "num_tokens": 166159788.0, + "step": 4351 + }, + { + "epoch": 0.5536191324259, + "ewc_loss": 0.04376713186502457, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00018132365948986262, + "grad_norm": 5.850710391998291, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8524283170700073, + "num_tokens": 166200230.0, + "step": 4352 + }, + { + "epoch": 0.5537463427044905, + "ewc_loss": 0.0440453365445137, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00018410570919513702, + "grad_norm": 5.302432060241699, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8617326021194458, + "num_tokens": 166237864.0, + "step": 4353 + }, + { + "epoch": 0.553873552983081, + "ewc_loss": 0.04233081638813019, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016696051170583814, + "grad_norm": 5.324735164642334, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8608508706092834, + "num_tokens": 166278347.0, + "step": 4354 + }, + { + "epoch": 0.5540007632616716, + "ewc_loss": 0.04300740733742714, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00017372641013935208, + "grad_norm": 5.4006242752075195, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8488004803657532, + "num_tokens": 166315744.0, + "step": 4355 + }, + { + "epoch": 0.554127973540262, + "ewc_loss": 0.04245683550834656, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001682207075646147, + "grad_norm": 5.278621673583984, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8586856126785278, + "num_tokens": 166352705.0, + "step": 4356 + }, + { + "epoch": 0.5542551838188525, + "ewc_loss": 0.04238352179527283, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016748756752349436, + "grad_norm": 5.251777648925781, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8623918294906616, + "num_tokens": 166393956.0, + "step": 4357 + }, + { + "epoch": 0.554382394097443, + "ewc_loss": 0.042252130806446075, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016617364599369466, + "grad_norm": 5.305246829986572, + "learning_rate": 1e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8345085382461548, + "num_tokens": 166430172.0, + "step": 4358 + }, + { + "epoch": 0.5545096043760336, + "ewc_loss": 0.04218095913529396, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016546192637179047, + "grad_norm": 5.191245079040527, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8576102256774902, + "num_tokens": 166468732.0, + "step": 4359 + }, + { + "epoch": 0.5546368146546241, + "ewc_loss": 0.04200921580195427, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001637445093365386, + "grad_norm": 5.141989231109619, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.852776825428009, + "num_tokens": 166511383.0, + "step": 4360 + }, + { + "epoch": 0.5547640249332146, + "ewc_loss": 0.04203768074512482, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016402913024649024, + "grad_norm": 5.2022809982299805, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8614614009857178, + "num_tokens": 166547674.0, + "step": 4361 + }, + { + "epoch": 0.5548912352118052, + "ewc_loss": 0.041998960077762604, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016364191833417863, + "grad_norm": 5.228781223297119, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8510026931762695, + "num_tokens": 166581763.0, + "step": 4362 + }, + { + "epoch": 0.5550184454903956, + "ewc_loss": 0.04197627678513527, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001634151121834293, + "grad_norm": 5.191699504852295, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8539841175079346, + "num_tokens": 166618604.0, + "step": 4363 + }, + { + "epoch": 0.5551456557689861, + "ewc_loss": 0.041858553886413574, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016223786224145442, + "grad_norm": 5.124639511108398, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8680567145347595, + "num_tokens": 166654812.0, + "step": 4364 + }, + { + "epoch": 0.5552728660475766, + "ewc_loss": 0.041860856115818024, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016226092702709138, + "grad_norm": 5.2498674392700195, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8451656103134155, + "num_tokens": 166693206.0, + "step": 4365 + }, + { + "epoch": 0.5554000763261672, + "ewc_loss": 0.041832756251096725, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001619799149921164, + "grad_norm": 5.185369491577148, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8551177978515625, + "num_tokens": 166734531.0, + "step": 4366 + }, + { + "epoch": 0.5555272866047577, + "ewc_loss": 0.04182030260562897, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016185537970159203, + "grad_norm": 5.156456470489502, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8546814918518066, + "num_tokens": 166771362.0, + "step": 4367 + }, + { + "epoch": 0.5556544968833482, + "ewc_loss": 0.04180800914764404, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016173244512174278, + "grad_norm": 5.2241740226745605, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.848256528377533, + "num_tokens": 166802784.0, + "step": 4368 + }, + { + "epoch": 0.5557817071619386, + "ewc_loss": 0.04179349169135094, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016158726066350937, + "grad_norm": 5.128501892089844, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8532698154449463, + "num_tokens": 166835590.0, + "step": 4369 + }, + { + "epoch": 0.5559089174405292, + "ewc_loss": 0.0418223962187767, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016187631990760565, + "grad_norm": 5.19988489151001, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8525183200836182, + "num_tokens": 166872145.0, + "step": 4370 + }, + { + "epoch": 0.5560361277191197, + "ewc_loss": 0.043669119477272034, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00016203298582695425, + "grad_norm": 53.490482330322266, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8534717559814453, + "num_tokens": 166912855.0, + "step": 4371 + }, + { + "epoch": 0.5561633379977102, + "ewc_loss": 0.05938388779759407, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0003338291135150939, + "grad_norm": 7.958643436431885, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8424896001815796, + "num_tokens": 166949482.0, + "step": 4372 + }, + { + "epoch": 0.5562905482763008, + "ewc_loss": 0.04546026885509491, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.0001921514922287315, + "grad_norm": 4.858233451843262, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.857223391532898, + "num_tokens": 166986727.0, + "step": 4373 + }, + { + "epoch": 0.5564177585548913, + "ewc_loss": 0.04774057865142822, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00021739599469583482, + "grad_norm": 6.820792198181152, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8489424586296082, + "num_tokens": 167025869.0, + "step": 4374 + }, + { + "epoch": 0.5565449688334817, + "ewc_loss": 0.052459776401519775, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00026458798674866557, + "grad_norm": 13.304657936096191, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8677828311920166, + "num_tokens": 167059825.0, + "step": 4375 + }, + { + "epoch": 0.5566721791120722, + "ewc_loss": 0.054350949823856354, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.00028105833916924894, + "grad_norm": 6.349548816680908, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8620964288711548, + "num_tokens": 167097375.0, + "step": 4376 + }, + { + "epoch": 0.5567993893906628, + "ewc_loss": 0.043410755693912506, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00017409780411981046, + "grad_norm": 5.185147285461426, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8483074903488159, + "num_tokens": 167136555.0, + "step": 4377 + }, + { + "epoch": 0.5569265996692533, + "ewc_loss": 0.046166956424713135, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00020410120487213135, + "grad_norm": 5.894023418426514, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8547619581222534, + "num_tokens": 167179870.0, + "step": 4378 + }, + { + "epoch": 0.5570538099478438, + "ewc_loss": 0.04681684821844101, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00020815871539525688, + "grad_norm": 5.635722637176514, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.856320858001709, + "num_tokens": 167215242.0, + "step": 4379 + }, + { + "epoch": 0.5571810202264343, + "ewc_loss": 0.044207602739334106, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001845076767494902, + "grad_norm": 5.511052131652832, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8648406267166138, + "num_tokens": 167250690.0, + "step": 4380 + }, + { + "epoch": 0.5573082305050248, + "ewc_loss": 0.04436977952718735, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00018612942949403077, + "grad_norm": 5.519103050231934, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8430823087692261, + "num_tokens": 167283677.0, + "step": 4381 + }, + { + "epoch": 0.5574354407836153, + "ewc_loss": 0.04413715377449989, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00018380317487753928, + "grad_norm": 5.3861565589904785, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8577015399932861, + "num_tokens": 167321355.0, + "step": 4382 + }, + { + "epoch": 0.5575626510622058, + "ewc_loss": 0.04349552094936371, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001773868571035564, + "grad_norm": 5.3120832443237305, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8494254350662231, + "num_tokens": 167363103.0, + "step": 4383 + }, + { + "epoch": 0.5576898613407963, + "ewc_loss": 0.043406788259744644, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00017649952496867627, + "grad_norm": 5.328271389007568, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8707689642906189, + "num_tokens": 167399732.0, + "step": 4384 + }, + { + "epoch": 0.5578170716193869, + "ewc_loss": 0.04316805303096771, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00017411216686014086, + "grad_norm": 5.332223415374756, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8563171029090881, + "num_tokens": 167434938.0, + "step": 4385 + }, + { + "epoch": 0.5579442818979774, + "ewc_loss": 0.04297594353556633, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00017219108121935278, + "grad_norm": 5.285484790802002, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8613000512123108, + "num_tokens": 167472293.0, + "step": 4386 + }, + { + "epoch": 0.5580714921765678, + "ewc_loss": 0.042625971138477325, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016991204756777734, + "grad_norm": 5.199333667755127, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.861315131187439, + "num_tokens": 167511015.0, + "step": 4387 + }, + { + "epoch": 0.5581987024551583, + "ewc_loss": 0.04263246804475784, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016875630535650998, + "grad_norm": 5.253140926361084, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.856037437915802, + "num_tokens": 167549777.0, + "step": 4388 + }, + { + "epoch": 0.5583259127337489, + "ewc_loss": 0.04256380349397659, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016806968778837472, + "grad_norm": 5.197143077850342, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8636854290962219, + "num_tokens": 167592342.0, + "step": 4389 + }, + { + "epoch": 0.5584531230123394, + "ewc_loss": 0.04227854311466217, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.0001664377487031743, + "grad_norm": 5.247544765472412, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8605835437774658, + "num_tokens": 167630829.0, + "step": 4390 + }, + { + "epoch": 0.5585803332909299, + "ewc_loss": 0.04241396486759186, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001665712770773098, + "grad_norm": 5.189719200134277, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8457579612731934, + "num_tokens": 167675938.0, + "step": 4391 + }, + { + "epoch": 0.5587075435695205, + "ewc_loss": 0.042128778994083405, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016494013834744692, + "grad_norm": 5.2049455642700195, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8618125915527344, + "num_tokens": 167716550.0, + "step": 4392 + }, + { + "epoch": 0.5588347538481109, + "ewc_loss": 0.042113225907087326, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016478460747748613, + "grad_norm": 5.182868003845215, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8435893058776855, + "num_tokens": 167754401.0, + "step": 4393 + }, + { + "epoch": 0.5589619641267014, + "ewc_loss": 0.04209215193986893, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016457388119306415, + "grad_norm": 5.194949626922607, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8617825508117676, + "num_tokens": 167792124.0, + "step": 4394 + }, + { + "epoch": 0.5590891744052919, + "ewc_loss": 0.04203806817531586, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016403301560785621, + "grad_norm": 5.144664764404297, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8761569261550903, + "num_tokens": 167829226.0, + "step": 4395 + }, + { + "epoch": 0.5592163846838825, + "ewc_loss": 0.04199044406414032, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016355677507817745, + "grad_norm": 5.192480087280273, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8676940202713013, + "num_tokens": 167863439.0, + "step": 4396 + }, + { + "epoch": 0.559343594962473, + "ewc_loss": 0.04211519658565521, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016358360880985856, + "grad_norm": 5.1672139167785645, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8580719232559204, + "num_tokens": 167907019.0, + "step": 4397 + }, + { + "epoch": 0.5594708052410635, + "ewc_loss": 0.04204072803258896, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016283894365187734, + "grad_norm": 5.1899333000183105, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8502342700958252, + "num_tokens": 167942539.0, + "step": 4398 + }, + { + "epoch": 0.5595980155196539, + "ewc_loss": 0.041951049119234085, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016316284018103033, + "grad_norm": 5.148359298706055, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8574084043502808, + "num_tokens": 167982848.0, + "step": 4399 + }, + { + "epoch": 0.5597252257982445, + "ewc_loss": 0.04192931577563286, + "ewc_loss_diag": 2.562999725341797e-05, + "ewc_loss_parallel": 0.00016294549277517945, + "grad_norm": 5.208834171295166, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.866304874420166, + "num_tokens": 168017500.0, + "step": 4400 + }, + { + "epoch": 0.559852436076835, + "ewc_loss": 0.042052753269672394, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016295915702357888, + "grad_norm": 5.179360866546631, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8432735204696655, + "num_tokens": 168052790.0, + "step": 4401 + }, + { + "epoch": 0.5599796463554255, + "ewc_loss": 0.04203593730926514, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001627910096431151, + "grad_norm": 5.21068811416626, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.864088773727417, + "num_tokens": 168082541.0, + "step": 4402 + }, + { + "epoch": 0.560106856634016, + "ewc_loss": 0.0420571006834507, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016300263814628124, + "grad_norm": 5.088900566101074, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8631443977355957, + "num_tokens": 168128218.0, + "step": 4403 + }, + { + "epoch": 0.5602340669126066, + "ewc_loss": 0.042021267116069794, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016264432633761317, + "grad_norm": 5.187147617340088, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8591135144233704, + "num_tokens": 168165496.0, + "step": 4404 + }, + { + "epoch": 0.560361277191197, + "ewc_loss": 0.04206103831529617, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001630420156288892, + "grad_norm": 5.148623466491699, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8456575274467468, + "num_tokens": 168204675.0, + "step": 4405 + }, + { + "epoch": 0.5604884874697875, + "ewc_loss": 0.04204419255256653, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016287359176203609, + "grad_norm": 5.163164138793945, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8531463742256165, + "num_tokens": 168244832.0, + "step": 4406 + }, + { + "epoch": 0.560615697748378, + "ewc_loss": 0.04205435886979103, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.000162975222337991, + "grad_norm": 5.138735771179199, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8528473377227783, + "num_tokens": 168282440.0, + "step": 4407 + }, + { + "epoch": 0.5607429080269686, + "ewc_loss": 0.04203450679779053, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016277671966236085, + "grad_norm": 5.1862287521362305, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8629324436187744, + "num_tokens": 168325303.0, + "step": 4408 + }, + { + "epoch": 0.5608701183055591, + "ewc_loss": 0.04207426682114601, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016317430709023029, + "grad_norm": 5.2469000816345215, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8639850616455078, + "num_tokens": 168360628.0, + "step": 4409 + }, + { + "epoch": 0.5609973285841496, + "ewc_loss": 0.04204411432147026, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016287279140669852, + "grad_norm": 5.168787002563477, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8443713188171387, + "num_tokens": 168398601.0, + "step": 4410 + }, + { + "epoch": 0.5611245388627402, + "ewc_loss": 0.042023226618766785, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016266389866359532, + "grad_norm": 5.150022506713867, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8433567881584167, + "num_tokens": 168436893.0, + "step": 4411 + }, + { + "epoch": 0.5612517491413306, + "ewc_loss": 0.04202335327863693, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016266517923213542, + "grad_norm": 5.125029563903809, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8525265455245972, + "num_tokens": 168478794.0, + "step": 4412 + }, + { + "epoch": 0.5613789594199211, + "ewc_loss": 0.04202933982014656, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016272504581138492, + "grad_norm": 5.212793827056885, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8551611304283142, + "num_tokens": 168520184.0, + "step": 4413 + }, + { + "epoch": 0.5615061696985116, + "ewc_loss": 0.04204468056559563, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016287843754980713, + "grad_norm": 5.177333831787109, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.853857159614563, + "num_tokens": 168554305.0, + "step": 4414 + }, + { + "epoch": 0.5616333799771022, + "ewc_loss": 0.04228002578020096, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016279048577416688, + "grad_norm": 5.143975257873535, + "learning_rate": 1e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8409452438354492, + "num_tokens": 168597752.0, + "step": 4415 + }, + { + "epoch": 0.5617605902556927, + "ewc_loss": 0.04203327000141144, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016276432143058628, + "grad_norm": 5.167169094085693, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8627824783325195, + "num_tokens": 168637764.0, + "step": 4416 + }, + { + "epoch": 0.5618878005342832, + "ewc_loss": 0.0420246347784996, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016267798491753638, + "grad_norm": 5.147497653961182, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.837841272354126, + "num_tokens": 168672260.0, + "step": 4417 + }, + { + "epoch": 0.5620150108128736, + "ewc_loss": 0.0423327311873436, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016331755614373833, + "grad_norm": 5.154469013214111, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8652278184890747, + "num_tokens": 168711478.0, + "step": 4418 + }, + { + "epoch": 0.5621422210914642, + "ewc_loss": 0.042314670979976654, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016313693777192384, + "grad_norm": 5.147281646728516, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8603928685188293, + "num_tokens": 168751773.0, + "step": 4419 + }, + { + "epoch": 0.5622694313700547, + "ewc_loss": 0.04233366996049881, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016332695668097585, + "grad_norm": 5.185556411743164, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8680082559585571, + "num_tokens": 168786364.0, + "step": 4420 + }, + { + "epoch": 0.5623966416486452, + "ewc_loss": 0.042112693190574646, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001635585504118353, + "grad_norm": 5.135072231292725, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8542421460151672, + "num_tokens": 168826819.0, + "step": 4421 + }, + { + "epoch": 0.5625238519272358, + "ewc_loss": 0.04208158329129219, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016324747411999851, + "grad_norm": 5.131171703338623, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8551132678985596, + "num_tokens": 168869405.0, + "step": 4422 + }, + { + "epoch": 0.5626510622058263, + "ewc_loss": 0.04236572980880737, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016364750626962632, + "grad_norm": 5.296348571777344, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8630977869033813, + "num_tokens": 168904323.0, + "step": 4423 + }, + { + "epoch": 0.5627782724844167, + "ewc_loss": 0.04234153404831886, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001634055661270395, + "grad_norm": 5.099763870239258, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8475970029830933, + "num_tokens": 168948654.0, + "step": 4424 + }, + { + "epoch": 0.5629054827630072, + "ewc_loss": 0.04207096993923187, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016314134700223804, + "grad_norm": 5.192346572875977, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8635333776473999, + "num_tokens": 168984333.0, + "step": 4425 + }, + { + "epoch": 0.5630326930415978, + "ewc_loss": 0.042364031076431274, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001636305678403005, + "grad_norm": 5.175036430358887, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8572738170623779, + "num_tokens": 169019843.0, + "step": 4426 + }, + { + "epoch": 0.5631599033201883, + "ewc_loss": 0.04231424629688263, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016313267406076193, + "grad_norm": 5.12883186340332, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8683742880821228, + "num_tokens": 169058177.0, + "step": 4427 + }, + { + "epoch": 0.5632871135987788, + "ewc_loss": 0.042125239968299866, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016368403157684952, + "grad_norm": 5.139284133911133, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8629003167152405, + "num_tokens": 169102268.0, + "step": 4428 + }, + { + "epoch": 0.5634143238773693, + "ewc_loss": 0.04234598949551582, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016345013864338398, + "grad_norm": 5.156906604766846, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8521808981895447, + "num_tokens": 169141709.0, + "step": 4429 + }, + { + "epoch": 0.5635415341559598, + "ewc_loss": 0.04212357476353645, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016366738418582827, + "grad_norm": 5.228331565856934, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8538618087768555, + "num_tokens": 169177107.0, + "step": 4430 + }, + { + "epoch": 0.5636687444345503, + "ewc_loss": 0.04213913530111313, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.00016382297326344997, + "grad_norm": 5.125145435333252, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8540887832641602, + "num_tokens": 169211531.0, + "step": 4431 + }, + { + "epoch": 0.5637959547131408, + "ewc_loss": 0.04237113147974014, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016370152297895402, + "grad_norm": 5.253485202789307, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8496156334877014, + "num_tokens": 169247507.0, + "step": 4432 + }, + { + "epoch": 0.5639231649917313, + "ewc_loss": 0.04240161180496216, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016400637105107307, + "grad_norm": 5.179923057556152, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8705644011497498, + "num_tokens": 169281007.0, + "step": 4433 + }, + { + "epoch": 0.5640503752703219, + "ewc_loss": 0.04212681204080582, + "ewc_loss_diag": 2.574920654296875e-05, + "ewc_loss_parallel": 0.0001636997767491266, + "grad_norm": 5.166689395904541, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8588632345199585, + "num_tokens": 169322295.0, + "step": 4434 + }, + { + "epoch": 0.5641775855489124, + "ewc_loss": 0.04240240156650543, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016401425818912685, + "grad_norm": 5.154686450958252, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8532266616821289, + "num_tokens": 169358184.0, + "step": 4435 + }, + { + "epoch": 0.5643047958275028, + "ewc_loss": 0.04235877841711044, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001635780354263261, + "grad_norm": 5.187109470367432, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8621565699577332, + "num_tokens": 169391518.0, + "step": 4436 + }, + { + "epoch": 0.5644320061060933, + "ewc_loss": 0.04241666942834854, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016415692516602576, + "grad_norm": 5.166099548339844, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8571349382400513, + "num_tokens": 169426743.0, + "step": 4437 + }, + { + "epoch": 0.5645592163846839, + "ewc_loss": 0.042426496744155884, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016425520880147815, + "grad_norm": 5.116808891296387, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8765143156051636, + "num_tokens": 169466452.0, + "step": 4438 + }, + { + "epoch": 0.5646864266632744, + "ewc_loss": 0.042414721101522446, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001641374547034502, + "grad_norm": 5.20697546005249, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.862251877784729, + "num_tokens": 169500295.0, + "step": 4439 + }, + { + "epoch": 0.5648136369418649, + "ewc_loss": 0.042470768094062805, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016469793627038598, + "grad_norm": 5.162588119506836, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8576815128326416, + "num_tokens": 169541829.0, + "step": 4440 + }, + { + "epoch": 0.5649408472204555, + "ewc_loss": 0.04241929203271866, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016418314771726727, + "grad_norm": 5.1833295822143555, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8417186737060547, + "num_tokens": 169581547.0, + "step": 4441 + }, + { + "epoch": 0.5650680574990459, + "ewc_loss": 0.042431920766830444, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001643094583414495, + "grad_norm": 5.159846305847168, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8617018461227417, + "num_tokens": 169622107.0, + "step": 4442 + }, + { + "epoch": 0.5651952677776364, + "ewc_loss": 0.04239467903971672, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016393701662309468, + "grad_norm": 5.1792473793029785, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8545740246772766, + "num_tokens": 169657258.0, + "step": 4443 + }, + { + "epoch": 0.5653224780562269, + "ewc_loss": 0.042442355304956436, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016441378102172166, + "grad_norm": 5.223237991333008, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8469102382659912, + "num_tokens": 169691568.0, + "step": 4444 + }, + { + "epoch": 0.5654496883348175, + "ewc_loss": 0.04244741052389145, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016446436347905546, + "grad_norm": 5.1900224685668945, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8636734485626221, + "num_tokens": 169732429.0, + "step": 4445 + }, + { + "epoch": 0.565576898613408, + "ewc_loss": 0.04240194708108902, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016400973254349083, + "grad_norm": 5.240932941436768, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8508428335189819, + "num_tokens": 169769224.0, + "step": 4446 + }, + { + "epoch": 0.5657041088919985, + "ewc_loss": 0.04236440360546112, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016363427857868373, + "grad_norm": 5.152748107910156, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8564326763153076, + "num_tokens": 169812830.0, + "step": 4447 + }, + { + "epoch": 0.5658313191705889, + "ewc_loss": 0.04233246669173241, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016331489314325154, + "grad_norm": 5.2212233543396, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8550928831100464, + "num_tokens": 169843823.0, + "step": 4448 + }, + { + "epoch": 0.5659585294491795, + "ewc_loss": 0.04237155243754387, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016370575758628547, + "grad_norm": 5.172889232635498, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8602133989334106, + "num_tokens": 169882847.0, + "step": 4449 + }, + { + "epoch": 0.56608573972777, + "ewc_loss": 0.04236485809087753, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016363880422431976, + "grad_norm": 5.20258903503418, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8391948938369751, + "num_tokens": 169921264.0, + "step": 4450 + }, + { + "epoch": 0.5662129500063605, + "ewc_loss": 0.04235929623246193, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016358320135623217, + "grad_norm": 5.205615997314453, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8633346557617188, + "num_tokens": 169949681.0, + "step": 4451 + }, + { + "epoch": 0.566340160284951, + "ewc_loss": 0.04235253483057022, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016351556405425072, + "grad_norm": 5.17385196685791, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8714155554771423, + "num_tokens": 169986254.0, + "step": 4452 + }, + { + "epoch": 0.5664673705635416, + "ewc_loss": 0.042397789657115936, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001639681140659377, + "grad_norm": 5.189995288848877, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8532218337059021, + "num_tokens": 170022745.0, + "step": 4453 + }, + { + "epoch": 0.566594580842132, + "ewc_loss": 0.04239598661661148, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001639501133468002, + "grad_norm": 5.189941883087158, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.868095874786377, + "num_tokens": 170057038.0, + "step": 4454 + }, + { + "epoch": 0.5667217911207225, + "ewc_loss": 0.04241508990526199, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001641411508899182, + "grad_norm": 5.213139533996582, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.875674307346344, + "num_tokens": 170098460.0, + "step": 4455 + }, + { + "epoch": 0.566849001399313, + "ewc_loss": 0.042402129620313644, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016401152242906392, + "grad_norm": 5.158637046813965, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8451263904571533, + "num_tokens": 170137689.0, + "step": 4456 + }, + { + "epoch": 0.5669762116779036, + "ewc_loss": 0.042400866746902466, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016399887681473047, + "grad_norm": 5.184074878692627, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8578809499740601, + "num_tokens": 170176716.0, + "step": 4457 + }, + { + "epoch": 0.5671034219564941, + "ewc_loss": 0.04238121211528778, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016380235319957137, + "grad_norm": 5.254995346069336, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.853183388710022, + "num_tokens": 170212621.0, + "step": 4458 + }, + { + "epoch": 0.5672306322350846, + "ewc_loss": 0.042385589331388474, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001638461253605783, + "grad_norm": 5.137824058532715, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8475063443183899, + "num_tokens": 170254493.0, + "step": 4459 + }, + { + "epoch": 0.5673578425136752, + "ewc_loss": 0.042350322008132935, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016349345969501883, + "grad_norm": 5.169836521148682, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8783422112464905, + "num_tokens": 170290559.0, + "step": 4460 + }, + { + "epoch": 0.5674850527922656, + "ewc_loss": 0.04262083023786545, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.00016375715495087206, + "grad_norm": 5.127281188964844, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8524816632270813, + "num_tokens": 170335079.0, + "step": 4461 + }, + { + "epoch": 0.5676122630708561, + "ewc_loss": 0.04259968921542168, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.0001635457156226039, + "grad_norm": 5.194275856018066, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8610929250717163, + "num_tokens": 170370665.0, + "step": 4462 + }, + { + "epoch": 0.5677394733494466, + "ewc_loss": 0.04243718832731247, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016436210717074573, + "grad_norm": 5.16520881652832, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8560702800750732, + "num_tokens": 170412585.0, + "step": 4463 + }, + { + "epoch": 0.5678666836280372, + "ewc_loss": 0.0424007773399353, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016399800369981676, + "grad_norm": 5.143548965454102, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8634195327758789, + "num_tokens": 170450806.0, + "step": 4464 + }, + { + "epoch": 0.5679938939066277, + "ewc_loss": 0.04242934286594391, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016428367234766483, + "grad_norm": 5.237889766693115, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8582526445388794, + "num_tokens": 170483265.0, + "step": 4465 + }, + { + "epoch": 0.5681211041852182, + "ewc_loss": 0.04249122738838196, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001649025216465816, + "grad_norm": 5.168440341949463, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8740138411521912, + "num_tokens": 170520910.0, + "step": 4466 + }, + { + "epoch": 0.5682483144638086, + "ewc_loss": 0.04255284368991852, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.0001642979623284191, + "grad_norm": 5.21803617477417, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8397724032402039, + "num_tokens": 170556516.0, + "step": 4467 + }, + { + "epoch": 0.5683755247423992, + "ewc_loss": 0.042619578540325165, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.00016496532771270722, + "grad_norm": 5.173681259155273, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8557877540588379, + "num_tokens": 170595571.0, + "step": 4468 + }, + { + "epoch": 0.5685027350209897, + "ewc_loss": 0.04256434738636017, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.0001644129806663841, + "grad_norm": 5.1592020988464355, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8767012357711792, + "num_tokens": 170634117.0, + "step": 4469 + }, + { + "epoch": 0.5686299452995802, + "ewc_loss": 0.04259687662124634, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.000164738274179399, + "grad_norm": 5.194253921508789, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8521497249603271, + "num_tokens": 170677624.0, + "step": 4470 + }, + { + "epoch": 0.5687571555781707, + "ewc_loss": 0.04243675619363785, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016435781435575336, + "grad_norm": 5.17172908782959, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8433014154434204, + "num_tokens": 170716732.0, + "step": 4471 + }, + { + "epoch": 0.5688843658567613, + "ewc_loss": 0.042584605515003204, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.00016461558698210865, + "grad_norm": 5.22672700881958, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8515549302101135, + "num_tokens": 170758521.0, + "step": 4472 + }, + { + "epoch": 0.5690115761353517, + "ewc_loss": 0.04280800372362137, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016440814943052828, + "grad_norm": 5.161569118499756, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8574809432029724, + "num_tokens": 170796799.0, + "step": 4473 + }, + { + "epoch": 0.5691387864139422, + "ewc_loss": 0.042526450008153915, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.0001640340342419222, + "grad_norm": 5.206350803375244, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8443039655685425, + "num_tokens": 170838569.0, + "step": 4474 + }, + { + "epoch": 0.5692659966925327, + "ewc_loss": 0.04284606873989105, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001647887984290719, + "grad_norm": 5.131618022918701, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8598795533180237, + "num_tokens": 170878206.0, + "step": 4475 + }, + { + "epoch": 0.5693932069711233, + "ewc_loss": 0.042527228593826294, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.0001640418340684846, + "grad_norm": 5.200143337249756, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8445727825164795, + "num_tokens": 170915506.0, + "step": 4476 + }, + { + "epoch": 0.5695204172497138, + "ewc_loss": 0.04259124770760536, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.00016468201647512615, + "grad_norm": 5.176873683929443, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8543005585670471, + "num_tokens": 170952303.0, + "step": 4477 + }, + { + "epoch": 0.5696476275283043, + "ewc_loss": 0.04266364127397537, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.0001641852140892297, + "grad_norm": 5.2545061111450195, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8502019643783569, + "num_tokens": 170993757.0, + "step": 4478 + }, + { + "epoch": 0.5697748378068948, + "ewc_loss": 0.04244440048933029, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001644342119107023, + "grad_norm": 5.182034969329834, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8547828197479248, + "num_tokens": 171030240.0, + "step": 4479 + }, + { + "epoch": 0.5699020480854853, + "ewc_loss": 0.0424104779958725, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016409499221481383, + "grad_norm": 5.235178470611572, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8336765766143799, + "num_tokens": 171069654.0, + "step": 4480 + }, + { + "epoch": 0.5700292583640758, + "ewc_loss": 0.042395975440740585, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016394999693147838, + "grad_norm": 5.188151836395264, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8411547541618347, + "num_tokens": 171107341.0, + "step": 4481 + }, + { + "epoch": 0.5701564686426663, + "ewc_loss": 0.04240579530596733, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016404817870352417, + "grad_norm": 5.2739644050598145, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8478443622589111, + "num_tokens": 171144063.0, + "step": 4482 + }, + { + "epoch": 0.5702836789212569, + "ewc_loss": 0.04241427034139633, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016413291450589895, + "grad_norm": 5.188293933868408, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8497211933135986, + "num_tokens": 171180448.0, + "step": 4483 + }, + { + "epoch": 0.5704108891998474, + "ewc_loss": 0.04235481098294258, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016353835235349834, + "grad_norm": 5.270208835601807, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8602004051208496, + "num_tokens": 171214220.0, + "step": 4484 + }, + { + "epoch": 0.5705380994784378, + "ewc_loss": 0.042470574378967285, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.00016469595720991492, + "grad_norm": 5.228914260864258, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8640379309654236, + "num_tokens": 171252798.0, + "step": 4485 + }, + { + "epoch": 0.5706653097570283, + "ewc_loss": 0.04250170290470123, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.00016378657892346382, + "grad_norm": 5.1277618408203125, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8625970482826233, + "num_tokens": 171295434.0, + "step": 4486 + }, + { + "epoch": 0.5707925200356189, + "ewc_loss": 0.04278844967484474, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016421262989751995, + "grad_norm": 5.238754749298096, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8587087988853455, + "num_tokens": 171334129.0, + "step": 4487 + }, + { + "epoch": 0.5709197303142094, + "ewc_loss": 0.0424504317343235, + "ewc_loss_diag": 2.5987625122070312e-05, + "ewc_loss_parallel": 0.0001644945441512391, + "grad_norm": 5.270852565765381, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8542747497558594, + "num_tokens": 171370239.0, + "step": 4488 + }, + { + "epoch": 0.5710469405927999, + "ewc_loss": 0.042542874813079834, + "ewc_loss_diag": 2.6106834411621094e-05, + "ewc_loss_parallel": 0.00016419825260527432, + "grad_norm": 5.234764099121094, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8719274997711182, + "num_tokens": 171402411.0, + "step": 4489 + }, + { + "epoch": 0.5711741508713905, + "ewc_loss": 0.04260015860199928, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.00016355041589122266, + "grad_norm": 5.157917499542236, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8670346736907959, + "num_tokens": 171438736.0, + "step": 4490 + }, + { + "epoch": 0.5713013611499809, + "ewc_loss": 0.04276243969798088, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016395252896472812, + "grad_norm": 5.212429523468018, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8622112274169922, + "num_tokens": 171477949.0, + "step": 4491 + }, + { + "epoch": 0.5714285714285714, + "ewc_loss": 0.042622923851013184, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.00016377808060497046, + "grad_norm": 5.22085428237915, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8523584604263306, + "num_tokens": 171520953.0, + "step": 4492 + }, + { + "epoch": 0.5715557817071619, + "ewc_loss": 0.04273506999015808, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016367882199119776, + "grad_norm": 5.16079044342041, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8628004193305969, + "num_tokens": 171559415.0, + "step": 4493 + }, + { + "epoch": 0.5716829919857525, + "ewc_loss": 0.04264296591281891, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.00016397848958149552, + "grad_norm": 5.2568278312683105, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8647865056991577, + "num_tokens": 171593118.0, + "step": 4494 + }, + { + "epoch": 0.571810202264343, + "ewc_loss": 0.04266311973333359, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.00016418004815932363, + "grad_norm": 5.228911399841309, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8442667722702026, + "num_tokens": 171634514.0, + "step": 4495 + }, + { + "epoch": 0.5719374125429335, + "ewc_loss": 0.04274344816803932, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016376259736716747, + "grad_norm": 5.164739608764648, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8600581288337708, + "num_tokens": 171675239.0, + "step": 4496 + }, + { + "epoch": 0.5720646228215239, + "ewc_loss": 0.042749956250190735, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016382768808398396, + "grad_norm": 5.25552225112915, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8649435043334961, + "num_tokens": 171713154.0, + "step": 4497 + }, + { + "epoch": 0.5721918331001145, + "ewc_loss": 0.042788565158843994, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.000164213779498823, + "grad_norm": 5.2064313888549805, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8499749898910522, + "num_tokens": 171750046.0, + "step": 4498 + }, + { + "epoch": 0.572319043378705, + "ewc_loss": 0.042759671807289124, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016392483667004853, + "grad_norm": 5.264726161956787, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8474342823028564, + "num_tokens": 171782708.0, + "step": 4499 + }, + { + "epoch": 0.5724462536572955, + "ewc_loss": 0.04269208014011383, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.0001644696167204529, + "grad_norm": 5.189109802246094, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8532521724700928, + "num_tokens": 171819421.0, + "step": 4500 + }, + { + "epoch": 0.572573463935886, + "ewc_loss": 0.042640410363674164, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.0001639529364183545, + "grad_norm": 5.210510730743408, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8565908670425415, + "num_tokens": 171860993.0, + "step": 4501 + }, + { + "epoch": 0.5727006742144766, + "ewc_loss": 0.042786989361047745, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016419801977463067, + "grad_norm": 5.176667213439941, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8570388555526733, + "num_tokens": 171896894.0, + "step": 4502 + }, + { + "epoch": 0.572827884493067, + "ewc_loss": 0.04262417182326317, + "ewc_loss_diag": 2.6226043701171875e-05, + "ewc_loss_parallel": 0.00016379055159632117, + "grad_norm": 5.162781238555908, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8549244403839111, + "num_tokens": 171938694.0, + "step": 4503 + }, + { + "epoch": 0.5729550947716575, + "ewc_loss": 0.04282016307115555, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016452974523417652, + "grad_norm": 5.22501802444458, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8817700147628784, + "num_tokens": 171972606.0, + "step": 4504 + }, + { + "epoch": 0.573082305050248, + "ewc_loss": 0.04281291738152504, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001644573058001697, + "grad_norm": 5.196593284606934, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8598178029060364, + "num_tokens": 172010861.0, + "step": 4505 + }, + { + "epoch": 0.5732095153288386, + "ewc_loss": 0.04278476536273956, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016417579900007695, + "grad_norm": 5.224727630615234, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8424054384231567, + "num_tokens": 172049867.0, + "step": 4506 + }, + { + "epoch": 0.5733367256074291, + "ewc_loss": 0.04283350706100464, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016466320084873587, + "grad_norm": 5.178157806396484, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8570702075958252, + "num_tokens": 172092176.0, + "step": 4507 + }, + { + "epoch": 0.5734639358860196, + "ewc_loss": 0.04282141476869583, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016454228898510337, + "grad_norm": 5.187209606170654, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8568949699401855, + "num_tokens": 172132789.0, + "step": 4508 + }, + { + "epoch": 0.5735911461646102, + "ewc_loss": 0.04282102733850479, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001645384036237374, + "grad_norm": 5.1809983253479, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8718260526657104, + "num_tokens": 172170536.0, + "step": 4509 + }, + { + "epoch": 0.5737183564432006, + "ewc_loss": 0.04283565282821655, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016468467947561294, + "grad_norm": 5.221890926361084, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8569883108139038, + "num_tokens": 172206828.0, + "step": 4510 + }, + { + "epoch": 0.5738455667217911, + "ewc_loss": 0.042857274413108826, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.000164900891832076, + "grad_norm": 5.226779937744141, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8498544692993164, + "num_tokens": 172242311.0, + "step": 4511 + }, + { + "epoch": 0.5739727770003816, + "ewc_loss": 0.04283064603805542, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016463456267956644, + "grad_norm": 5.239701747894287, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8529980182647705, + "num_tokens": 172275135.0, + "step": 4512 + }, + { + "epoch": 0.5740999872789722, + "ewc_loss": 0.04284237325191498, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001647518656682223, + "grad_norm": 5.211080551147461, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8656169176101685, + "num_tokens": 172308682.0, + "step": 4513 + }, + { + "epoch": 0.5742271975575627, + "ewc_loss": 0.04288274049758911, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001651555357966572, + "grad_norm": 5.234013557434082, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8526409864425659, + "num_tokens": 172345834.0, + "step": 4514 + }, + { + "epoch": 0.5743544078361532, + "ewc_loss": 0.04287473112344742, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016507544205524027, + "grad_norm": 5.226742744445801, + "learning_rate": 1e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8341237306594849, + "num_tokens": 172387976.0, + "step": 4515 + }, + { + "epoch": 0.5744816181147436, + "ewc_loss": 0.042893532663583755, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016526345279999077, + "grad_norm": 5.194212913513184, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.857347846031189, + "num_tokens": 172427447.0, + "step": 4516 + }, + { + "epoch": 0.5746088283933342, + "ewc_loss": 0.04284946620464325, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001648228062549606, + "grad_norm": 5.193500518798828, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.848416805267334, + "num_tokens": 172468686.0, + "step": 4517 + }, + { + "epoch": 0.5747360386719247, + "ewc_loss": 0.043026477098464966, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001653722138144076, + "grad_norm": 5.191167831420898, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.853164553642273, + "num_tokens": 172508190.0, + "step": 4518 + }, + { + "epoch": 0.5748632489505152, + "ewc_loss": 0.04289446771144867, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016527278057765216, + "grad_norm": 5.17779541015625, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8722350001335144, + "num_tokens": 172552295.0, + "step": 4519 + }, + { + "epoch": 0.5749904592291057, + "ewc_loss": 0.04299651086330414, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016507251712027937, + "grad_norm": 5.19661808013916, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8618625402450562, + "num_tokens": 172591065.0, + "step": 4520 + }, + { + "epoch": 0.5751176695076963, + "ewc_loss": 0.04296888783574104, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016601700917817652, + "grad_norm": 5.2053704261779785, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8604106307029724, + "num_tokens": 172633741.0, + "step": 4521 + }, + { + "epoch": 0.5752448797862867, + "ewc_loss": 0.04288368672132492, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016516499454155564, + "grad_norm": 5.2425007820129395, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8662008047103882, + "num_tokens": 172669081.0, + "step": 4522 + }, + { + "epoch": 0.5753720900648772, + "ewc_loss": 0.04291985556483269, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001655266823945567, + "grad_norm": 5.279539108276367, + "learning_rate": 1e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8350352048873901, + "num_tokens": 172704956.0, + "step": 4523 + }, + { + "epoch": 0.5754993003434677, + "ewc_loss": 0.04291186481714249, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001654467632761225, + "grad_norm": 5.234196186065674, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.86040198802948, + "num_tokens": 172742270.0, + "step": 4524 + }, + { + "epoch": 0.5756265106220583, + "ewc_loss": 0.04300890490412712, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001651964703341946, + "grad_norm": 5.257848262786865, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8451442122459412, + "num_tokens": 172778065.0, + "step": 4525 + }, + { + "epoch": 0.5757537209006488, + "ewc_loss": 0.04292196035385132, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001655477099120617, + "grad_norm": 5.196803569793701, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8652386665344238, + "num_tokens": 172815825.0, + "step": 4526 + }, + { + "epoch": 0.5758809311792393, + "ewc_loss": 0.042994704097509384, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016505445819348097, + "grad_norm": 5.239180088043213, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8557695150375366, + "num_tokens": 172849560.0, + "step": 4527 + }, + { + "epoch": 0.5760081414578297, + "ewc_loss": 0.04303529113531113, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016546034021303058, + "grad_norm": 5.189607620239258, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8585222959518433, + "num_tokens": 172885203.0, + "step": 4528 + }, + { + "epoch": 0.5761353517364203, + "ewc_loss": 0.042981479316949844, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016492221038788557, + "grad_norm": 5.2305426597595215, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8564471006393433, + "num_tokens": 172922341.0, + "step": 4529 + }, + { + "epoch": 0.5762625620150108, + "ewc_loss": 0.04304850846529007, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016559252981096506, + "grad_norm": 5.244720935821533, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8624249696731567, + "num_tokens": 172953363.0, + "step": 4530 + }, + { + "epoch": 0.5763897722936013, + "ewc_loss": 0.04302578419446945, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016536528710275888, + "grad_norm": 5.2842912673950195, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8635266423225403, + "num_tokens": 172986253.0, + "step": 4531 + }, + { + "epoch": 0.5765169825721919, + "ewc_loss": 0.04301006346940994, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001652080682106316, + "grad_norm": 5.200874328613281, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8683273792266846, + "num_tokens": 173023466.0, + "step": 4532 + }, + { + "epoch": 0.5766441928507824, + "ewc_loss": 0.042990975081920624, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016501714708283544, + "grad_norm": 5.238654136657715, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8511440753936768, + "num_tokens": 173063329.0, + "step": 4533 + }, + { + "epoch": 0.5767714031293728, + "ewc_loss": 0.043003544211387634, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016514289018232375, + "grad_norm": 5.201626777648926, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8640831112861633, + "num_tokens": 173100676.0, + "step": 4534 + }, + { + "epoch": 0.5768986134079633, + "ewc_loss": 0.043008383363485336, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016519124619662762, + "grad_norm": 5.181351184844971, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8682129979133606, + "num_tokens": 173135088.0, + "step": 4535 + }, + { + "epoch": 0.5770258236865539, + "ewc_loss": 0.04300980269908905, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016520543431397527, + "grad_norm": 5.203081130981445, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8495739698410034, + "num_tokens": 173175810.0, + "step": 4536 + }, + { + "epoch": 0.5771530339651444, + "ewc_loss": 0.043040018528699875, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001655076048336923, + "grad_norm": 5.187036991119385, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8579088449478149, + "num_tokens": 173220826.0, + "step": 4537 + }, + { + "epoch": 0.5772802442437349, + "ewc_loss": 0.04304070398211479, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016551445878576487, + "grad_norm": 5.195098400115967, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8686246275901794, + "num_tokens": 173258411.0, + "step": 4538 + }, + { + "epoch": 0.5774074545223254, + "ewc_loss": 0.043036311864852905, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001654705556575209, + "grad_norm": 5.247150897979736, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8407506346702576, + "num_tokens": 173297298.0, + "step": 4539 + }, + { + "epoch": 0.5775346648009159, + "ewc_loss": 0.04306366294622421, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016574404435232282, + "grad_norm": 5.265132904052734, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8393179774284363, + "num_tokens": 173330839.0, + "step": 4540 + }, + { + "epoch": 0.5776618750795064, + "ewc_loss": 0.04291103035211563, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016543843958061188, + "grad_norm": 5.2036614418029785, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8496180772781372, + "num_tokens": 173370219.0, + "step": 4541 + }, + { + "epoch": 0.5777890853580969, + "ewc_loss": 0.04300512745976448, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016515869356226176, + "grad_norm": 5.16868782043457, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8579040169715881, + "num_tokens": 173410425.0, + "step": 4542 + }, + { + "epoch": 0.5779162956366874, + "ewc_loss": 0.04304773360490799, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016558474453631788, + "grad_norm": 5.272279262542725, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8571783900260925, + "num_tokens": 173447109.0, + "step": 4543 + }, + { + "epoch": 0.578043505915278, + "ewc_loss": 0.04307684302330017, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001658758264966309, + "grad_norm": 5.17366361618042, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8613324761390686, + "num_tokens": 173490064.0, + "step": 4544 + }, + { + "epoch": 0.5781707161938685, + "ewc_loss": 0.04302910342812538, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016539846546947956, + "grad_norm": 5.229053497314453, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8600392937660217, + "num_tokens": 173523791.0, + "step": 4545 + }, + { + "epoch": 0.5782979264724589, + "ewc_loss": 0.04309116303920746, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016601903189439327, + "grad_norm": 5.187712669372559, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8659830093383789, + "num_tokens": 173564069.0, + "step": 4546 + }, + { + "epoch": 0.5784251367510495, + "ewc_loss": 0.04302334785461426, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001653408835409209, + "grad_norm": 5.252315521240234, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8556180000305176, + "num_tokens": 173601893.0, + "step": 4547 + }, + { + "epoch": 0.57855234702964, + "ewc_loss": 0.04307173192501068, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001658247347222641, + "grad_norm": 5.190812110900879, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.866658091545105, + "num_tokens": 173642299.0, + "step": 4548 + }, + { + "epoch": 0.5786795573082305, + "ewc_loss": 0.043020788580179214, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016531531582586467, + "grad_norm": 5.320746898651123, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8592103719711304, + "num_tokens": 173675824.0, + "step": 4549 + }, + { + "epoch": 0.578806767586821, + "ewc_loss": 0.04304634779691696, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016557087656110525, + "grad_norm": 5.203804016113281, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8586176037788391, + "num_tokens": 173712192.0, + "step": 4550 + }, + { + "epoch": 0.5789339778654116, + "ewc_loss": 0.04299164563417435, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001650238555157557, + "grad_norm": 5.22139835357666, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8649858236312866, + "num_tokens": 173746949.0, + "step": 4551 + }, + { + "epoch": 0.579061188144002, + "ewc_loss": 0.04306185245513916, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016572595632169396, + "grad_norm": 5.160589694976807, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8597587943077087, + "num_tokens": 173784527.0, + "step": 4552 + }, + { + "epoch": 0.5791883984225925, + "ewc_loss": 0.0429980531334877, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001650879275985062, + "grad_norm": 5.199487686157227, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8628880977630615, + "num_tokens": 173820648.0, + "step": 4553 + }, + { + "epoch": 0.579315608701183, + "ewc_loss": 0.04311412572860718, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016624867566861212, + "grad_norm": 5.224850654602051, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8511172533035278, + "num_tokens": 173860427.0, + "step": 4554 + }, + { + "epoch": 0.5794428189797736, + "ewc_loss": 0.04306597262620926, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001657671673456207, + "grad_norm": 5.178522109985352, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.859818160533905, + "num_tokens": 173899876.0, + "step": 4555 + }, + { + "epoch": 0.5795700292583641, + "ewc_loss": 0.04305015131831169, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016560894437134266, + "grad_norm": 5.2047882080078125, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8659892082214355, + "num_tokens": 173937354.0, + "step": 4556 + }, + { + "epoch": 0.5796972395369546, + "ewc_loss": 0.04308808594942093, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016598828369751573, + "grad_norm": 5.237301349639893, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8458251357078552, + "num_tokens": 173976039.0, + "step": 4557 + }, + { + "epoch": 0.5798244498155452, + "ewc_loss": 0.042969461530447006, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001660227426327765, + "grad_norm": 5.2979350090026855, + "learning_rate": 1e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8362659811973572, + "num_tokens": 174006359.0, + "step": 4558 + }, + { + "epoch": 0.5799516600941356, + "ewc_loss": 0.04310488700866699, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016615630011074245, + "grad_norm": 5.2094926834106445, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8483967781066895, + "num_tokens": 174044662.0, + "step": 4559 + }, + { + "epoch": 0.5800788703727261, + "ewc_loss": 0.04304029047489166, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016551034059375525, + "grad_norm": 5.301104545593262, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8534014821052551, + "num_tokens": 174077133.0, + "step": 4560 + }, + { + "epoch": 0.5802060806513166, + "ewc_loss": 0.042997460812330246, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001663027360336855, + "grad_norm": 5.153628349304199, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8525701761245728, + "num_tokens": 174121873.0, + "step": 4561 + }, + { + "epoch": 0.5803332909299072, + "ewc_loss": 0.04299679398536682, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016507538384757936, + "grad_norm": 5.226459980010986, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8588812351226807, + "num_tokens": 174158873.0, + "step": 4562 + }, + { + "epoch": 0.5804605012084977, + "ewc_loss": 0.04296410083770752, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001659691333770752, + "grad_norm": 5.2713518142700195, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8645014762878418, + "num_tokens": 174189222.0, + "step": 4563 + }, + { + "epoch": 0.5805877114870882, + "ewc_loss": 0.042946502566337585, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.0001657931279623881, + "grad_norm": 5.220019340515137, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8495025634765625, + "num_tokens": 174229018.0, + "step": 4564 + }, + { + "epoch": 0.5807149217656786, + "ewc_loss": 0.04306884482502937, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016579586372245103, + "grad_norm": 5.230910778045654, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8585542440414429, + "num_tokens": 174269215.0, + "step": 4565 + }, + { + "epoch": 0.5808421320442692, + "ewc_loss": 0.042977988719940186, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016610798775218427, + "grad_norm": 5.194528579711914, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8661611080169678, + "num_tokens": 174303926.0, + "step": 4566 + }, + { + "epoch": 0.5809693423228597, + "ewc_loss": 0.043190859258174896, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016579532530158758, + "grad_norm": 5.190793514251709, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8593044281005859, + "num_tokens": 174347491.0, + "step": 4567 + }, + { + "epoch": 0.5810965526014502, + "ewc_loss": 0.04321002587676048, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016598697402514517, + "grad_norm": 5.202456474304199, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8586462736129761, + "num_tokens": 174384246.0, + "step": 4568 + }, + { + "epoch": 0.5812237628800407, + "ewc_loss": 0.0432523712515831, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016641044931020588, + "grad_norm": 5.272964000701904, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8561388254165649, + "num_tokens": 174417081.0, + "step": 4569 + }, + { + "epoch": 0.5813509731586313, + "ewc_loss": 0.04323245584964752, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016621126269456, + "grad_norm": 5.1989898681640625, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8465024828910828, + "num_tokens": 174457364.0, + "step": 4570 + }, + { + "epoch": 0.5814781834372217, + "ewc_loss": 0.0431995615363121, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016588236030656844, + "grad_norm": 5.212555885314941, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8587005734443665, + "num_tokens": 174494304.0, + "step": 4571 + }, + { + "epoch": 0.5816053937158122, + "ewc_loss": 0.043239615857601166, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016628287266939878, + "grad_norm": 5.257143497467041, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.858394980430603, + "num_tokens": 174528047.0, + "step": 4572 + }, + { + "epoch": 0.5817326039944027, + "ewc_loss": 0.04325489699840546, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016643568233121186, + "grad_norm": 5.204986095428467, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8617638349533081, + "num_tokens": 174565990.0, + "step": 4573 + }, + { + "epoch": 0.5818598142729933, + "ewc_loss": 0.0432300791144371, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001661875139689073, + "grad_norm": 5.225834369659424, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8433104753494263, + "num_tokens": 174599039.0, + "step": 4574 + }, + { + "epoch": 0.5819870245515838, + "ewc_loss": 0.043245598673820496, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016634268104098737, + "grad_norm": 5.245153427124023, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8518753051757812, + "num_tokens": 174634786.0, + "step": 4575 + }, + { + "epoch": 0.5821142348301743, + "ewc_loss": 0.04322715103626251, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016615823551546782, + "grad_norm": 5.169254302978516, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8474420309066772, + "num_tokens": 174677875.0, + "step": 4576 + }, + { + "epoch": 0.5822414451087647, + "ewc_loss": 0.043230749666690826, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001661942369537428, + "grad_norm": 5.21029806137085, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8513859510421753, + "num_tokens": 174715421.0, + "step": 4577 + }, + { + "epoch": 0.5823686553873553, + "ewc_loss": 0.043275319039821625, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.000166639918461442, + "grad_norm": 5.307667255401611, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8538274765014648, + "num_tokens": 174746555.0, + "step": 4578 + }, + { + "epoch": 0.5824958656659458, + "ewc_loss": 0.04323960095643997, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001662827271502465, + "grad_norm": 5.197637557983398, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8454411029815674, + "num_tokens": 174786470.0, + "step": 4579 + }, + { + "epoch": 0.5826230759445363, + "ewc_loss": 0.043233226984739304, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016621898976154625, + "grad_norm": 5.217458248138428, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8548033237457275, + "num_tokens": 174827131.0, + "step": 4580 + }, + { + "epoch": 0.5827502862231269, + "ewc_loss": 0.043205805122852325, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016594474436715245, + "grad_norm": 5.233876705169678, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8503612279891968, + "num_tokens": 174861391.0, + "step": 4581 + }, + { + "epoch": 0.5828774965017174, + "ewc_loss": 0.04327424615621567, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001666291936999187, + "grad_norm": 5.225814342498779, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8624047040939331, + "num_tokens": 174897437.0, + "step": 4582 + }, + { + "epoch": 0.5830047067803078, + "ewc_loss": 0.043201133608818054, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001658980327192694, + "grad_norm": 5.252264499664307, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8470126986503601, + "num_tokens": 174932504.0, + "step": 4583 + }, + { + "epoch": 0.5831319170588983, + "ewc_loss": 0.043275635689496994, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016664307622704655, + "grad_norm": 5.201858997344971, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8515787124633789, + "num_tokens": 174974746.0, + "step": 4584 + }, + { + "epoch": 0.5832591273374889, + "ewc_loss": 0.04324360936880112, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001663227885728702, + "grad_norm": 5.245981216430664, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8513543605804443, + "num_tokens": 175016155.0, + "step": 4585 + }, + { + "epoch": 0.5833863376160794, + "ewc_loss": 0.04325709491968155, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016645764117129147, + "grad_norm": 5.185154438018799, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8426448106765747, + "num_tokens": 175060595.0, + "step": 4586 + }, + { + "epoch": 0.5835135478946699, + "ewc_loss": 0.043222662061452866, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001661133428569883, + "grad_norm": 5.205630302429199, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8563584089279175, + "num_tokens": 175101278.0, + "step": 4587 + }, + { + "epoch": 0.5836407581732604, + "ewc_loss": 0.043264418840408325, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016653092461638153, + "grad_norm": 5.241758346557617, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8506325483322144, + "num_tokens": 175139479.0, + "step": 4588 + }, + { + "epoch": 0.5837679684518509, + "ewc_loss": 0.04322239011526108, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016611060709692538, + "grad_norm": 5.21132755279541, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8396435976028442, + "num_tokens": 175182376.0, + "step": 4589 + }, + { + "epoch": 0.5838951787304414, + "ewc_loss": 0.043222226202487946, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001661089772824198, + "grad_norm": 5.20274019241333, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8495655059814453, + "num_tokens": 175225449.0, + "step": 4590 + }, + { + "epoch": 0.5840223890090319, + "ewc_loss": 0.04326967895030975, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016658351523801684, + "grad_norm": 5.341339588165283, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8544710874557495, + "num_tokens": 175255959.0, + "step": 4591 + }, + { + "epoch": 0.5841495992876224, + "ewc_loss": 0.043327152729034424, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001671582431299612, + "grad_norm": 5.228011608123779, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8435677289962769, + "num_tokens": 175292142.0, + "step": 4592 + }, + { + "epoch": 0.584276809566213, + "ewc_loss": 0.0432124063372612, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016601078095845878, + "grad_norm": 5.195559978485107, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8494042158126831, + "num_tokens": 175336630.0, + "step": 4593 + }, + { + "epoch": 0.5844040198448035, + "ewc_loss": 0.04326074570417404, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016649419558234513, + "grad_norm": 5.390165328979492, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8685799241065979, + "num_tokens": 175372697.0, + "step": 4594 + }, + { + "epoch": 0.5845312301233939, + "ewc_loss": 0.043242134153842926, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016630803293082863, + "grad_norm": 5.1651387214660645, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8448141813278198, + "num_tokens": 175412395.0, + "step": 4595 + }, + { + "epoch": 0.5846584404019844, + "ewc_loss": 0.04322110116481781, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016609771410003304, + "grad_norm": 5.281811714172363, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8565155267715454, + "num_tokens": 175447295.0, + "step": 4596 + }, + { + "epoch": 0.584785650680575, + "ewc_loss": 0.04332420974969864, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016712881915736943, + "grad_norm": 5.241218566894531, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8658522367477417, + "num_tokens": 175480229.0, + "step": 4597 + }, + { + "epoch": 0.5849128609591655, + "ewc_loss": 0.04326993227005005, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016658603271935135, + "grad_norm": 5.231546401977539, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8556419610977173, + "num_tokens": 175523081.0, + "step": 4598 + }, + { + "epoch": 0.585040071237756, + "ewc_loss": 0.04325757920742035, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016646250151097775, + "grad_norm": 5.23543119430542, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8597388863563538, + "num_tokens": 175565036.0, + "step": 4599 + }, + { + "epoch": 0.5851672815163466, + "ewc_loss": 0.0433269627392292, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001671563513809815, + "grad_norm": 5.263400077819824, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8567891120910645, + "num_tokens": 175606733.0, + "step": 4600 + }, + { + "epoch": 0.585294491794937, + "ewc_loss": 0.043248847126960754, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016637520457152277, + "grad_norm": 5.238629341125488, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8714872002601624, + "num_tokens": 175637663.0, + "step": 4601 + }, + { + "epoch": 0.5854217020735275, + "ewc_loss": 0.0433390811085701, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016727755428291857, + "grad_norm": 5.231125831604004, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8581308126449585, + "num_tokens": 175677381.0, + "step": 4602 + }, + { + "epoch": 0.585548912352118, + "ewc_loss": 0.04314836114645004, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016659105313010514, + "grad_norm": 5.234425067901611, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8667845726013184, + "num_tokens": 175713049.0, + "step": 4603 + }, + { + "epoch": 0.5856761226307086, + "ewc_loss": 0.04331178963184357, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001670046040089801, + "grad_norm": 5.222940921783447, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.846924901008606, + "num_tokens": 175753681.0, + "step": 4604 + }, + { + "epoch": 0.5858033329092991, + "ewc_loss": 0.0433354377746582, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016724111628718674, + "grad_norm": 5.267705917358398, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8437356948852539, + "num_tokens": 175789725.0, + "step": 4605 + }, + { + "epoch": 0.5859305431878896, + "ewc_loss": 0.04330294579267502, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016691618657205254, + "grad_norm": 5.301886558532715, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8559843301773071, + "num_tokens": 175823203.0, + "step": 4606 + }, + { + "epoch": 0.5860577534664801, + "ewc_loss": 0.04335148632526398, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001674016093602404, + "grad_norm": 5.243238925933838, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8508947491645813, + "num_tokens": 175860506.0, + "step": 4607 + }, + { + "epoch": 0.5861849637450706, + "ewc_loss": 0.04330822080373764, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001669689081609249, + "grad_norm": 5.243553161621094, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8728005290031433, + "num_tokens": 175895080.0, + "step": 4608 + }, + { + "epoch": 0.5863121740236611, + "ewc_loss": 0.043331436812877655, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016720106941647828, + "grad_norm": 5.204135894775391, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.85060054063797, + "num_tokens": 175937113.0, + "step": 4609 + }, + { + "epoch": 0.5864393843022516, + "ewc_loss": 0.04331835359334946, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016707024769857526, + "grad_norm": 5.314568042755127, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8692001104354858, + "num_tokens": 175966193.0, + "step": 4610 + }, + { + "epoch": 0.5865665945808421, + "ewc_loss": 0.04337490350008011, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016763576422818005, + "grad_norm": 5.259753704071045, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8499201536178589, + "num_tokens": 176003771.0, + "step": 4611 + }, + { + "epoch": 0.5866938048594327, + "ewc_loss": 0.04329238831996918, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016681061242707074, + "grad_norm": 5.185266017913818, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8619337677955627, + "num_tokens": 176050069.0, + "step": 4612 + }, + { + "epoch": 0.5868210151380232, + "ewc_loss": 0.04331858456134796, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016707259055692703, + "grad_norm": 5.244760513305664, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8703669309616089, + "num_tokens": 176088763.0, + "step": 4613 + }, + { + "epoch": 0.5869482254166136, + "ewc_loss": 0.04338352009654045, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016772191156633198, + "grad_norm": 5.26200532913208, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8578726649284363, + "num_tokens": 176127853.0, + "step": 4614 + }, + { + "epoch": 0.5870754356952042, + "ewc_loss": 0.043306104838848114, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001669477642280981, + "grad_norm": 5.2303595542907715, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8427527546882629, + "num_tokens": 176173775.0, + "step": 4615 + }, + { + "epoch": 0.5872026459737947, + "ewc_loss": 0.043352700769901276, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016741374565754086, + "grad_norm": 5.265305519104004, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8605165481567383, + "num_tokens": 176210878.0, + "step": 4616 + }, + { + "epoch": 0.5873298562523852, + "ewc_loss": 0.043296024203300476, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001668469631113112, + "grad_norm": 5.187263011932373, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8505322337150574, + "num_tokens": 176255094.0, + "step": 4617 + }, + { + "epoch": 0.5874570665309757, + "ewc_loss": 0.04323215037584305, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016742893785703927, + "grad_norm": 5.3187479972839355, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8536636233329773, + "num_tokens": 176293586.0, + "step": 4618 + }, + { + "epoch": 0.5875842768095663, + "ewc_loss": 0.043249230831861496, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016759973368607461, + "grad_norm": 5.266692161560059, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8589966297149658, + "num_tokens": 176329203.0, + "step": 4619 + }, + { + "epoch": 0.5877114870881567, + "ewc_loss": 0.04316374659538269, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001667448814259842, + "grad_norm": 5.301811695098877, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8738514184951782, + "num_tokens": 176367984.0, + "step": 4620 + }, + { + "epoch": 0.5878386973667472, + "ewc_loss": 0.04321858286857605, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016729322669561952, + "grad_norm": 5.23012113571167, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8701235055923462, + "num_tokens": 176408949.0, + "step": 4621 + }, + { + "epoch": 0.5879659076453377, + "ewc_loss": 0.04312323406338692, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016633975610602647, + "grad_norm": 5.196619033813477, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8714869618415833, + "num_tokens": 176451073.0, + "step": 4622 + }, + { + "epoch": 0.5880931179239283, + "ewc_loss": 0.04319249093532562, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016703232540749013, + "grad_norm": 5.253283977508545, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8637945055961609, + "num_tokens": 176490623.0, + "step": 4623 + }, + { + "epoch": 0.5882203282025188, + "ewc_loss": 0.04317484050989151, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016685585433151573, + "grad_norm": 5.233922481536865, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8444393873214722, + "num_tokens": 176525771.0, + "step": 4624 + }, + { + "epoch": 0.5883475384811093, + "ewc_loss": 0.043173547834157944, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016684290312696248, + "grad_norm": 5.1895575523376465, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8758862018585205, + "num_tokens": 176565417.0, + "step": 4625 + }, + { + "epoch": 0.5884747487596997, + "ewc_loss": 0.04306798055768013, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016700792184565216, + "grad_norm": 5.329066276550293, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8545063734054565, + "num_tokens": 176599005.0, + "step": 4626 + }, + { + "epoch": 0.5886019590382903, + "ewc_loss": 0.04307623952627182, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016709054762031883, + "grad_norm": 5.193841457366943, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8592383861541748, + "num_tokens": 176637947.0, + "step": 4627 + }, + { + "epoch": 0.5887291693168808, + "ewc_loss": 0.043082673102617264, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016715485253371298, + "grad_norm": 5.283920764923096, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8470361828804016, + "num_tokens": 176675241.0, + "step": 4628 + }, + { + "epoch": 0.5888563795954713, + "ewc_loss": 0.04323665052652359, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016747391782701015, + "grad_norm": 5.2206196784973145, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8500846028327942, + "num_tokens": 176715375.0, + "step": 4629 + }, + { + "epoch": 0.5889835898740619, + "ewc_loss": 0.04308975860476494, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016722572036087513, + "grad_norm": 5.258305549621582, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8559457063674927, + "num_tokens": 176750977.0, + "step": 4630 + }, + { + "epoch": 0.5891108001526524, + "ewc_loss": 0.04327477887272835, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016785520710982382, + "grad_norm": 5.2392897605896, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8601484298706055, + "num_tokens": 176788968.0, + "step": 4631 + }, + { + "epoch": 0.5892380104312428, + "ewc_loss": 0.0431051179766655, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016737933037802577, + "grad_norm": 5.251171588897705, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8525980114936829, + "num_tokens": 176824399.0, + "step": 4632 + }, + { + "epoch": 0.5893652207098333, + "ewc_loss": 0.04325880482792854, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016769547073636204, + "grad_norm": 5.246427536010742, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8524012565612793, + "num_tokens": 176862053.0, + "step": 4633 + }, + { + "epoch": 0.5894924309884239, + "ewc_loss": 0.04323296993970871, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016743711603339761, + "grad_norm": 5.206806182861328, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8711361885070801, + "num_tokens": 176896943.0, + "step": 4634 + }, + { + "epoch": 0.5896196412670144, + "ewc_loss": 0.04328552633523941, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001679626730037853, + "grad_norm": 5.211775302886963, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8577048778533936, + "num_tokens": 176936263.0, + "step": 4635 + }, + { + "epoch": 0.5897468515456049, + "ewc_loss": 0.04315792769193649, + "ewc_loss_diag": 2.6345252990722656e-05, + "ewc_loss_parallel": 0.00016790739027783275, + "grad_norm": 5.249876022338867, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8452045321464539, + "num_tokens": 176976632.0, + "step": 4636 + }, + { + "epoch": 0.5898740618241954, + "ewc_loss": 0.043316297233104706, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016827040235511959, + "grad_norm": 5.289979457855225, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8629680871963501, + "num_tokens": 177007036.0, + "step": 4637 + }, + { + "epoch": 0.5900012721027859, + "ewc_loss": 0.0434228740632534, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016811545356176794, + "grad_norm": 5.257593631744385, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8478624224662781, + "num_tokens": 177042189.0, + "step": 4638 + }, + { + "epoch": 0.5901284823813764, + "ewc_loss": 0.04335084557533264, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.000167395148309879, + "grad_norm": 5.222898960113525, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8523029088973999, + "num_tokens": 177082428.0, + "step": 4639 + }, + { + "epoch": 0.5902556926599669, + "ewc_loss": 0.043276023119688034, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016786764899734408, + "grad_norm": 5.198385238647461, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8592972159385681, + "num_tokens": 177124447.0, + "step": 4640 + }, + { + "epoch": 0.5903829029385574, + "ewc_loss": 0.043412961065769196, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016801635501906276, + "grad_norm": 5.245538711547852, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.872611939907074, + "num_tokens": 177160516.0, + "step": 4641 + }, + { + "epoch": 0.590510113217148, + "ewc_loss": 0.04327978193759918, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001679052656982094, + "grad_norm": 5.222959995269775, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8650586605072021, + "num_tokens": 177198100.0, + "step": 4642 + }, + { + "epoch": 0.5906373234957385, + "ewc_loss": 0.04329659789800644, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001680734276305884, + "grad_norm": 5.224338054656982, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8506438136100769, + "num_tokens": 177243170.0, + "step": 4643 + }, + { + "epoch": 0.5907645337743289, + "ewc_loss": 0.04327527433633804, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016786018386483192, + "grad_norm": 5.235977649688721, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8548455834388733, + "num_tokens": 177282976.0, + "step": 4644 + }, + { + "epoch": 0.5908917440529194, + "ewc_loss": 0.04334487020969391, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016855612921062857, + "grad_norm": 5.223222255706787, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8399618864059448, + "num_tokens": 177323725.0, + "step": 4645 + }, + { + "epoch": 0.59101895433151, + "ewc_loss": 0.04332286864519119, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001683361188042909, + "grad_norm": 5.277993679046631, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.856641948223114, + "num_tokens": 177352178.0, + "step": 4646 + }, + { + "epoch": 0.5911461646101005, + "ewc_loss": 0.04349415749311447, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016882827912922949, + "grad_norm": 5.275265216827393, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8539239168167114, + "num_tokens": 177392322.0, + "step": 4647 + }, + { + "epoch": 0.591273374888691, + "ewc_loss": 0.04333651810884476, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001684726303210482, + "grad_norm": 5.240823268890381, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.854729413986206, + "num_tokens": 177432013.0, + "step": 4648 + }, + { + "epoch": 0.5914005851672816, + "ewc_loss": 0.04344760626554489, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016836276336107403, + "grad_norm": 5.161223888397217, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8700189590454102, + "num_tokens": 177473542.0, + "step": 4649 + }, + { + "epoch": 0.591527795445872, + "ewc_loss": 0.04336349666118622, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016874237917363644, + "grad_norm": 5.331323623657227, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.853664755821228, + "num_tokens": 177516590.0, + "step": 4650 + }, + { + "epoch": 0.5916550057244625, + "ewc_loss": 0.04346572980284691, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016854400746524334, + "grad_norm": 5.176326751708984, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8700456619262695, + "num_tokens": 177553750.0, + "step": 4651 + }, + { + "epoch": 0.591782216003053, + "ewc_loss": 0.04329599067568779, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016806733037810773, + "grad_norm": 5.286102294921875, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8504159450531006, + "num_tokens": 177586093.0, + "step": 4652 + }, + { + "epoch": 0.5919094262816436, + "ewc_loss": 0.0433635488152504, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016874290304258466, + "grad_norm": 5.219806671142578, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8526655435562134, + "num_tokens": 177626800.0, + "step": 4653 + }, + { + "epoch": 0.5920366365602341, + "ewc_loss": 0.04332556203007698, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016836303984746337, + "grad_norm": 5.244655609130859, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8556492328643799, + "num_tokens": 177662336.0, + "step": 4654 + }, + { + "epoch": 0.5921638468388246, + "ewc_loss": 0.04347611218690872, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001686478644842282, + "grad_norm": 5.191018581390381, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8648401498794556, + "num_tokens": 177706922.0, + "step": 4655 + }, + { + "epoch": 0.592291057117415, + "ewc_loss": 0.04335790127515793, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001686864416114986, + "grad_norm": 5.25040340423584, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.854468047618866, + "num_tokens": 177743922.0, + "step": 4656 + }, + { + "epoch": 0.5924182673960056, + "ewc_loss": 0.04351235553622246, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016901027993299067, + "grad_norm": 5.235617637634277, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8651139736175537, + "num_tokens": 177777069.0, + "step": 4657 + }, + { + "epoch": 0.5925454776745961, + "ewc_loss": 0.04358683526515961, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00016853435954544693, + "grad_norm": 5.260653972625732, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8654292225837708, + "num_tokens": 177810959.0, + "step": 4658 + }, + { + "epoch": 0.5926726879531866, + "ewc_loss": 0.04353543370962143, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016924107330851257, + "grad_norm": 5.296313762664795, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8473629951477051, + "num_tokens": 177846366.0, + "step": 4659 + }, + { + "epoch": 0.5927998982317771, + "ewc_loss": 0.043464310467243195, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016852984845172614, + "grad_norm": 5.238465309143066, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8698536157608032, + "num_tokens": 177881450.0, + "step": 4660 + }, + { + "epoch": 0.5929271085103677, + "ewc_loss": 0.04350971430540085, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016898386820685118, + "grad_norm": 5.1842474937438965, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8579065799713135, + "num_tokens": 177922931.0, + "step": 4661 + }, + { + "epoch": 0.5930543187889582, + "ewc_loss": 0.043480969965457916, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001686963951215148, + "grad_norm": 5.253978729248047, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8395953178405762, + "num_tokens": 177962722.0, + "step": 4662 + }, + { + "epoch": 0.5931815290675486, + "ewc_loss": 0.043561507016420364, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.000169501785421744, + "grad_norm": 5.3028340339660645, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8552801609039307, + "num_tokens": 177999956.0, + "step": 4663 + }, + { + "epoch": 0.5933087393461391, + "ewc_loss": 0.043563175946474075, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016951847646851093, + "grad_norm": 5.284886837005615, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8470101356506348, + "num_tokens": 178032450.0, + "step": 4664 + }, + { + "epoch": 0.5934359496247297, + "ewc_loss": 0.0435110479593277, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001689972123131156, + "grad_norm": 5.237936496734619, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8570730686187744, + "num_tokens": 178075028.0, + "step": 4665 + }, + { + "epoch": 0.5935631599033202, + "ewc_loss": 0.04354354366660118, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016932215658016503, + "grad_norm": 5.279937267303467, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8528691530227661, + "num_tokens": 178108267.0, + "step": 4666 + }, + { + "epoch": 0.5936903701819107, + "ewc_loss": 0.04344769939780235, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016958441119641066, + "grad_norm": 5.266839981079102, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8581361770629883, + "num_tokens": 178148091.0, + "step": 4667 + }, + { + "epoch": 0.5938175804605013, + "ewc_loss": 0.043497972190380096, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016886641969904304, + "grad_norm": 5.295547962188721, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8703211545944214, + "num_tokens": 178185626.0, + "step": 4668 + }, + { + "epoch": 0.5939447907390917, + "ewc_loss": 0.04350973665714264, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016898410103749484, + "grad_norm": 5.24260139465332, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8654180765151978, + "num_tokens": 178224165.0, + "step": 4669 + }, + { + "epoch": 0.5940720010176822, + "ewc_loss": 0.043487779796123505, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016876451263669878, + "grad_norm": 5.298157691955566, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8563465476036072, + "num_tokens": 178264118.0, + "step": 4670 + }, + { + "epoch": 0.5941992112962727, + "ewc_loss": 0.04346573352813721, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001685440365690738, + "grad_norm": 5.318199157714844, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8679978847503662, + "num_tokens": 178295634.0, + "step": 4671 + }, + { + "epoch": 0.5943264215748633, + "ewc_loss": 0.04345554858446121, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001684422168182209, + "grad_norm": 5.309014797210693, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8629975914955139, + "num_tokens": 178327076.0, + "step": 4672 + }, + { + "epoch": 0.5944536318534538, + "ewc_loss": 0.043416157364845276, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001680482819210738, + "grad_norm": 5.289918422698975, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8489199280738831, + "num_tokens": 178357469.0, + "step": 4673 + }, + { + "epoch": 0.5945808421320443, + "ewc_loss": 0.043499648571014404, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001688832271611318, + "grad_norm": 5.2492547035217285, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8631570935249329, + "num_tokens": 178393830.0, + "step": 4674 + }, + { + "epoch": 0.5947080524106347, + "ewc_loss": 0.04342614859342575, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.000168148209922947, + "grad_norm": 5.315482139587402, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.856652021408081, + "num_tokens": 178427560.0, + "step": 4675 + }, + { + "epoch": 0.5948352626892253, + "ewc_loss": 0.043467871844768524, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016856545698828995, + "grad_norm": 5.236454963684082, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8584766387939453, + "num_tokens": 178464666.0, + "step": 4676 + }, + { + "epoch": 0.5949624729678158, + "ewc_loss": 0.043440256267786026, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.000168289290741086, + "grad_norm": 5.260706901550293, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8473387956619263, + "num_tokens": 178508771.0, + "step": 4677 + }, + { + "epoch": 0.5950896832464063, + "ewc_loss": 0.04344663769006729, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016835311544127762, + "grad_norm": 5.2119646072387695, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8610796928405762, + "num_tokens": 178548516.0, + "step": 4678 + }, + { + "epoch": 0.5952168935249968, + "ewc_loss": 0.04348143935203552, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016870109539013356, + "grad_norm": 5.288938522338867, + "learning_rate": 1e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8220245242118835, + "num_tokens": 178591276.0, + "step": 4679 + }, + { + "epoch": 0.5953441038035874, + "ewc_loss": 0.043470509350299835, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016859182505868375, + "grad_norm": 5.24655818939209, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.855640172958374, + "num_tokens": 178632580.0, + "step": 4680 + }, + { + "epoch": 0.5954713140821778, + "ewc_loss": 0.04359445720911026, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001686105679254979, + "grad_norm": 5.262031078338623, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8529212474822998, + "num_tokens": 178672858.0, + "step": 4681 + }, + { + "epoch": 0.5955985243607683, + "ewc_loss": 0.043495386838912964, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016884057549759746, + "grad_norm": 5.3016462326049805, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8586357235908508, + "num_tokens": 178706435.0, + "step": 4682 + }, + { + "epoch": 0.5957257346393589, + "ewc_loss": 0.04350077733397484, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016889449034351856, + "grad_norm": 5.305061340332031, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8546496033668518, + "num_tokens": 178746309.0, + "step": 4683 + }, + { + "epoch": 0.5958529449179494, + "ewc_loss": 0.0434548519551754, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016843524645082653, + "grad_norm": 5.246931552886963, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8386896848678589, + "num_tokens": 178787639.0, + "step": 4684 + }, + { + "epoch": 0.5959801551965399, + "ewc_loss": 0.043465569615364075, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016854240675456822, + "grad_norm": 5.254805088043213, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8617565631866455, + "num_tokens": 178828271.0, + "step": 4685 + }, + { + "epoch": 0.5961073654751304, + "ewc_loss": 0.04348141700029373, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001687009062152356, + "grad_norm": 5.304565906524658, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8782281875610352, + "num_tokens": 178862862.0, + "step": 4686 + }, + { + "epoch": 0.5962345757537209, + "ewc_loss": 0.04349677264690399, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016885447257664055, + "grad_norm": 5.293738842010498, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8685066103935242, + "num_tokens": 178897969.0, + "step": 4687 + }, + { + "epoch": 0.5963617860323114, + "ewc_loss": 0.04347289353609085, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016861563199199736, + "grad_norm": 5.243167877197266, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8612751960754395, + "num_tokens": 178942002.0, + "step": 4688 + }, + { + "epoch": 0.5964889963109019, + "ewc_loss": 0.04344634711742401, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016835019050631672, + "grad_norm": 5.240466117858887, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8662683367729187, + "num_tokens": 178983473.0, + "step": 4689 + }, + { + "epoch": 0.5966162065894924, + "ewc_loss": 0.04351560026407242, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016904274525586516, + "grad_norm": 5.423000812530518, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8458104729652405, + "num_tokens": 179022370.0, + "step": 4690 + }, + { + "epoch": 0.596743416868083, + "ewc_loss": 0.04345186799764633, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001684053713688627, + "grad_norm": 5.156436443328857, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8721703290939331, + "num_tokens": 179064841.0, + "step": 4691 + }, + { + "epoch": 0.5968706271466735, + "ewc_loss": 0.04346836730837822, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016857039008755237, + "grad_norm": 5.5365471839904785, + "learning_rate": 1e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8298563361167908, + "num_tokens": 179099399.0, + "step": 4692 + }, + { + "epoch": 0.5969978374252639, + "ewc_loss": 0.04355476424098015, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016943436639849097, + "grad_norm": 5.26360559463501, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8486300706863403, + "num_tokens": 179133969.0, + "step": 4693 + }, + { + "epoch": 0.5971250477038544, + "ewc_loss": 0.043401412665843964, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016790084191597998, + "grad_norm": 5.523151397705078, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8542330265045166, + "num_tokens": 179165456.0, + "step": 4694 + }, + { + "epoch": 0.597252257982445, + "ewc_loss": 0.04351595789194107, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016904629592318088, + "grad_norm": 5.264726638793945, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8613264560699463, + "num_tokens": 179198209.0, + "step": 4695 + }, + { + "epoch": 0.5973794682610355, + "ewc_loss": 0.043361738324165344, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016750412760302424, + "grad_norm": 12.554190635681152, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8418108224868774, + "num_tokens": 179233872.0, + "step": 4696 + }, + { + "epoch": 0.597506678539626, + "ewc_loss": 0.05229855328798294, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0002544308372307569, + "grad_norm": 6.840274333953857, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8781907558441162, + "num_tokens": 179272721.0, + "step": 4697 + }, + { + "epoch": 0.5976338888182166, + "ewc_loss": 0.0425938181579113, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00015738348884042352, + "grad_norm": 4.798128128051758, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8561493754386902, + "num_tokens": 179305781.0, + "step": 4698 + }, + { + "epoch": 0.597761099096807, + "ewc_loss": 0.04586610198020935, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00019010633695870638, + "grad_norm": 6.047993183135986, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.86067795753479, + "num_tokens": 179350049.0, + "step": 4699 + }, + { + "epoch": 0.5978883093753975, + "ewc_loss": 0.04598875343799591, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00019133284513372928, + "grad_norm": 5.574833869934082, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8569015264511108, + "num_tokens": 179388063.0, + "step": 4700 + }, + { + "epoch": 0.598015519653988, + "ewc_loss": 0.044195614755153656, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017462218238506466, + "grad_norm": 5.476232528686523, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.864787220954895, + "num_tokens": 179427607.0, + "step": 4701 + }, + { + "epoch": 0.5981427299325786, + "ewc_loss": 0.04464995861053467, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001803863124223426, + "grad_norm": 5.479685306549072, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.840887188911438, + "num_tokens": 179471456.0, + "step": 4702 + }, + { + "epoch": 0.5982699402111691, + "ewc_loss": 0.043977290391922, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001748803333612159, + "grad_norm": 5.349526882171631, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8689486980438232, + "num_tokens": 179514318.0, + "step": 4703 + }, + { + "epoch": 0.5983971504897596, + "ewc_loss": 0.04414704814553261, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017535719962324947, + "grad_norm": 5.498198986053467, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8704231977462769, + "num_tokens": 179553307.0, + "step": 4704 + }, + { + "epoch": 0.59852436076835, + "ewc_loss": 0.044009678065776825, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017398351337760687, + "grad_norm": 5.298153400421143, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8495178818702698, + "num_tokens": 179597288.0, + "step": 4705 + }, + { + "epoch": 0.5986515710469406, + "ewc_loss": 0.04377289116382599, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017161564028356224, + "grad_norm": 5.440425872802734, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.857395350933075, + "num_tokens": 179629583.0, + "step": 4706 + }, + { + "epoch": 0.5987787813255311, + "ewc_loss": 0.04384760558605194, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017236277926713228, + "grad_norm": 5.320329189300537, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8514480590820312, + "num_tokens": 179672104.0, + "step": 4707 + }, + { + "epoch": 0.5989059916041216, + "ewc_loss": 0.0436645932495594, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001705326430965215, + "grad_norm": 5.351177215576172, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8486112356185913, + "num_tokens": 179712951.0, + "step": 4708 + }, + { + "epoch": 0.5990332018827121, + "ewc_loss": 0.043683119118213654, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001707179326331243, + "grad_norm": 5.306038856506348, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8558123111724854, + "num_tokens": 179754626.0, + "step": 4709 + }, + { + "epoch": 0.5991604121613027, + "ewc_loss": 0.04363405704498291, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017022730025928468, + "grad_norm": 5.325256824493408, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8629872798919678, + "num_tokens": 179790068.0, + "step": 4710 + }, + { + "epoch": 0.5992876224398932, + "ewc_loss": 0.043476708233356476, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016987451817840338, + "grad_norm": 5.392825126647949, + "learning_rate": 1e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8380367159843445, + "num_tokens": 179823936.0, + "step": 4711 + }, + { + "epoch": 0.5994148327184836, + "ewc_loss": 0.0434647835791111, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001697552652331069, + "grad_norm": 5.315639972686768, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8582934737205505, + "num_tokens": 179860920.0, + "step": 4712 + }, + { + "epoch": 0.5995420429970741, + "ewc_loss": 0.04337474703788757, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016885488003026694, + "grad_norm": 5.264211654663086, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.847773551940918, + "num_tokens": 179899848.0, + "step": 4713 + }, + { + "epoch": 0.5996692532756647, + "ewc_loss": 0.043587230145931244, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001697590050753206, + "grad_norm": 5.283721446990967, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8743194341659546, + "num_tokens": 179934462.0, + "step": 4714 + }, + { + "epoch": 0.5997964635542552, + "ewc_loss": 0.04342435300350189, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001693509693723172, + "grad_norm": 5.299482822418213, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8571692705154419, + "num_tokens": 179973955.0, + "step": 4715 + }, + { + "epoch": 0.5999236738328457, + "ewc_loss": 0.04359325394034386, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00016981925000436604, + "grad_norm": 5.249748706817627, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8586655855178833, + "num_tokens": 180015256.0, + "step": 4716 + }, + { + "epoch": 0.6000508841114363, + "ewc_loss": 0.04343963414430618, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016950374993029982, + "grad_norm": 5.266753196716309, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8646790981292725, + "num_tokens": 180054145.0, + "step": 4717 + }, + { + "epoch": 0.6001780943900267, + "ewc_loss": 0.04348155856132507, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00016992299060802907, + "grad_norm": 5.234438419342041, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8615780472755432, + "num_tokens": 180093411.0, + "step": 4718 + }, + { + "epoch": 0.6003053046686172, + "ewc_loss": 0.04351420700550079, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017024949193000793, + "grad_norm": 5.267571449279785, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.867082953453064, + "num_tokens": 180135799.0, + "step": 4719 + }, + { + "epoch": 0.6004325149472077, + "ewc_loss": 0.04365701228380203, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001704568276181817, + "grad_norm": 5.311200141906738, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.851296603679657, + "num_tokens": 180172623.0, + "step": 4720 + }, + { + "epoch": 0.6005597252257983, + "ewc_loss": 0.0436495840549469, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001703825400909409, + "grad_norm": 5.23628044128418, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8546891212463379, + "num_tokens": 180213833.0, + "step": 4721 + }, + { + "epoch": 0.6006869355043888, + "ewc_loss": 0.04351133853197098, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001702207955531776, + "grad_norm": 5.302526473999023, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8646612763404846, + "num_tokens": 180254982.0, + "step": 4722 + }, + { + "epoch": 0.6008141457829793, + "ewc_loss": 0.04362334683537483, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017012018361128867, + "grad_norm": 12.548674583435059, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8658467531204224, + "num_tokens": 180288544.0, + "step": 4723 + }, + { + "epoch": 0.6009413560615697, + "ewc_loss": 0.05185171589255333, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0002511831698939204, + "grad_norm": 6.615948677062988, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8545553088188171, + "num_tokens": 180330233.0, + "step": 4724 + }, + { + "epoch": 0.6010685663401603, + "ewc_loss": 0.042287860065698624, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001567653234815225, + "grad_norm": 4.673843860626221, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8597902655601501, + "num_tokens": 180361516.0, + "step": 4725 + }, + { + "epoch": 0.6011957766187508, + "ewc_loss": 0.045640863478183746, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00019029535178560764, + "grad_norm": 5.963747978210449, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8554885387420654, + "num_tokens": 180404380.0, + "step": 4726 + }, + { + "epoch": 0.6013229868973413, + "ewc_loss": 0.045604605227708817, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00018993276171386242, + "grad_norm": 5.413931369781494, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.850627064704895, + "num_tokens": 180436916.0, + "step": 4727 + }, + { + "epoch": 0.6014501971759318, + "ewc_loss": 0.04424108564853668, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017629755893722177, + "grad_norm": 5.4745025634765625, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8402308225631714, + "num_tokens": 180476895.0, + "step": 4728 + }, + { + "epoch": 0.6015774074545224, + "ewc_loss": 0.044784121215343475, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00018172792624682188, + "grad_norm": 5.432616233825684, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8576755523681641, + "num_tokens": 180515248.0, + "step": 4729 + }, + { + "epoch": 0.6017046177331128, + "ewc_loss": 0.044193897396326065, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017582568398211151, + "grad_norm": 5.316213130950928, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8651727437973022, + "num_tokens": 180554575.0, + "step": 4730 + }, + { + "epoch": 0.6018318280117033, + "ewc_loss": 0.04414764791727066, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001765839260770008, + "grad_norm": 5.403878688812256, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8478094339370728, + "num_tokens": 180594485.0, + "step": 4731 + }, + { + "epoch": 0.6019590382902938, + "ewc_loss": 0.0441679023206234, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001755657431203872, + "grad_norm": 5.341293811798096, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8430062532424927, + "num_tokens": 180634746.0, + "step": 4732 + }, + { + "epoch": 0.6020862485688844, + "ewc_loss": 0.04389717057347298, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017407911946065724, + "grad_norm": 5.382063865661621, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8483892679214478, + "num_tokens": 180674946.0, + "step": 4733 + }, + { + "epoch": 0.6022134588474749, + "ewc_loss": 0.04387778043746948, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017388521519023925, + "grad_norm": 5.290894985198975, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8576681613922119, + "num_tokens": 180715538.0, + "step": 4734 + }, + { + "epoch": 0.6023406691260654, + "ewc_loss": 0.04391762614250183, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017306295922026038, + "grad_norm": 5.370605945587158, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8568264842033386, + "num_tokens": 180755298.0, + "step": 4735 + }, + { + "epoch": 0.6024678794046558, + "ewc_loss": 0.04370616376399994, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017216906417161226, + "grad_norm": 5.266945838928223, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8506060838699341, + "num_tokens": 180797104.0, + "step": 4736 + }, + { + "epoch": 0.6025950896832464, + "ewc_loss": 0.043802566826343536, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017191241204272956, + "grad_norm": 5.3308491706848145, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8683977127075195, + "num_tokens": 180839845.0, + "step": 4737 + }, + { + "epoch": 0.6027222999618369, + "ewc_loss": 0.04365191608667374, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017162658332381397, + "grad_norm": 5.2926506996154785, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8720147609710693, + "num_tokens": 180878849.0, + "step": 4738 + }, + { + "epoch": 0.6028495102404274, + "ewc_loss": 0.04361521452665329, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001712595549179241, + "grad_norm": 5.34726095199585, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8564552068710327, + "num_tokens": 180918427.0, + "step": 4739 + }, + { + "epoch": 0.602976720519018, + "ewc_loss": 0.0436030775308609, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001711381773930043, + "grad_norm": 5.308426856994629, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8502722978591919, + "num_tokens": 180956206.0, + "step": 4740 + }, + { + "epoch": 0.6031039307976085, + "ewc_loss": 0.04352501779794693, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017035759810823947, + "grad_norm": 5.2890706062316895, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8634004592895508, + "num_tokens": 180996945.0, + "step": 4741 + }, + { + "epoch": 0.6032311410761989, + "ewc_loss": 0.043576523661613464, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001708726485958323, + "grad_norm": 5.358608722686768, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.862633228302002, + "num_tokens": 181031763.0, + "step": 4742 + }, + { + "epoch": 0.6033583513547894, + "ewc_loss": 0.043512679636478424, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017023419786710292, + "grad_norm": 5.2668986320495605, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8651260137557983, + "num_tokens": 181070661.0, + "step": 4743 + }, + { + "epoch": 0.60348556163338, + "ewc_loss": 0.043669573962688446, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001705824543023482, + "grad_norm": 5.318876266479492, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8444194793701172, + "num_tokens": 181112769.0, + "step": 4744 + }, + { + "epoch": 0.6036127719119705, + "ewc_loss": 0.04354361817240715, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017054360068868846, + "grad_norm": 5.311954021453857, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8584814667701721, + "num_tokens": 181151155.0, + "step": 4745 + }, + { + "epoch": 0.603739982190561, + "ewc_loss": 0.04348954185843468, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017000283696688712, + "grad_norm": 5.277805328369141, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8581005334854126, + "num_tokens": 181188689.0, + "step": 4746 + }, + { + "epoch": 0.6038671924691515, + "ewc_loss": 0.043498750776052475, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017009493603836745, + "grad_norm": 5.255304336547852, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8627676963806152, + "num_tokens": 181230031.0, + "step": 4747 + }, + { + "epoch": 0.603994402747742, + "ewc_loss": 0.043534055352211, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001704479946056381, + "grad_norm": 5.330961227416992, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8572254180908203, + "num_tokens": 181271464.0, + "step": 4748 + }, + { + "epoch": 0.6041216130263325, + "ewc_loss": 0.0435556061565876, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017066347936633974, + "grad_norm": 5.267515182495117, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8510096669197083, + "num_tokens": 181312607.0, + "step": 4749 + }, + { + "epoch": 0.604248823304923, + "ewc_loss": 0.043556731194257736, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001706747425487265, + "grad_norm": 5.353632926940918, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.866534948348999, + "num_tokens": 181348628.0, + "step": 4750 + }, + { + "epoch": 0.6043760335835135, + "ewc_loss": 0.04352555423974991, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.00017036293866112828, + "grad_norm": 5.298114776611328, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8497838973999023, + "num_tokens": 181387975.0, + "step": 4751 + }, + { + "epoch": 0.6045032438621041, + "ewc_loss": 0.04356355220079422, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001707429182715714, + "grad_norm": 5.307434558868408, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8708717823028564, + "num_tokens": 181424163.0, + "step": 4752 + }, + { + "epoch": 0.6046304541406946, + "ewc_loss": 0.04369039833545685, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017079069220926613, + "grad_norm": 5.367105960845947, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8484477996826172, + "num_tokens": 181457002.0, + "step": 4753 + }, + { + "epoch": 0.604757664419285, + "ewc_loss": 0.04372141510248184, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.000171100880834274, + "grad_norm": 5.341941833496094, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8527026176452637, + "num_tokens": 181494410.0, + "step": 4754 + }, + { + "epoch": 0.6048848746978756, + "ewc_loss": 0.04361549764871597, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017004170513246208, + "grad_norm": 5.263489246368408, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8540148735046387, + "num_tokens": 181534595.0, + "step": 4755 + }, + { + "epoch": 0.6050120849764661, + "ewc_loss": 0.04366237670183182, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017051050963345915, + "grad_norm": 5.325433731079102, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8642668724060059, + "num_tokens": 181570654.0, + "step": 4756 + }, + { + "epoch": 0.6051392952550566, + "ewc_loss": 0.0435921773314476, + "ewc_loss_diag": 2.6464462280273438e-05, + "ewc_loss_parallel": 0.0001710291689960286, + "grad_norm": 5.3429718017578125, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8518015146255493, + "num_tokens": 181608366.0, + "step": 4757 + }, + { + "epoch": 0.6052665055336471, + "ewc_loss": 0.043646760284900665, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017035433847922832, + "grad_norm": 5.295722484588623, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8460876941680908, + "num_tokens": 181647168.0, + "step": 4758 + }, + { + "epoch": 0.6053937158122377, + "ewc_loss": 0.04364817589521408, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017036849749274552, + "grad_norm": 5.259149551391602, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8612291812896729, + "num_tokens": 181689177.0, + "step": 4759 + }, + { + "epoch": 0.6055209260908282, + "ewc_loss": 0.04368489980697632, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017073570052161813, + "grad_norm": 5.363495349884033, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8621519804000854, + "num_tokens": 181723742.0, + "step": 4760 + }, + { + "epoch": 0.6056481363694186, + "ewc_loss": 0.04368191957473755, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017070591275114566, + "grad_norm": 5.249317646026611, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.866638720035553, + "num_tokens": 181761539.0, + "step": 4761 + }, + { + "epoch": 0.6057753466480091, + "ewc_loss": 0.0437035895884037, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017092260532081127, + "grad_norm": 5.384239673614502, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8543895483016968, + "num_tokens": 181799674.0, + "step": 4762 + }, + { + "epoch": 0.6059025569265997, + "ewc_loss": 0.04369290918111801, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017081579426303506, + "grad_norm": 5.256964206695557, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8451930284500122, + "num_tokens": 181838784.0, + "step": 4763 + }, + { + "epoch": 0.6060297672051902, + "ewc_loss": 0.04369157552719116, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017080249381251633, + "grad_norm": 5.289062976837158, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8534791469573975, + "num_tokens": 181875669.0, + "step": 4764 + }, + { + "epoch": 0.6061569774837807, + "ewc_loss": 0.04372170940041542, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017110382032115012, + "grad_norm": 5.312238693237305, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8411281704902649, + "num_tokens": 181912450.0, + "step": 4765 + }, + { + "epoch": 0.6062841877623713, + "ewc_loss": 0.04373571276664734, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017124382429756224, + "grad_norm": 5.310214519500732, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.856400728225708, + "num_tokens": 181947691.0, + "step": 4766 + }, + { + "epoch": 0.6064113980409617, + "ewc_loss": 0.04374397546052933, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017132646462414414, + "grad_norm": 5.307028770446777, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8616122007369995, + "num_tokens": 181982364.0, + "step": 4767 + }, + { + "epoch": 0.6065386083195522, + "ewc_loss": 0.04377156123518944, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001716023252811283, + "grad_norm": 5.331514835357666, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8575666546821594, + "num_tokens": 182017071.0, + "step": 4768 + }, + { + "epoch": 0.6066658185981427, + "ewc_loss": 0.0437975749373436, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001718624698696658, + "grad_norm": 5.295282363891602, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8571629524230957, + "num_tokens": 182057234.0, + "step": 4769 + }, + { + "epoch": 0.6067930288767333, + "ewc_loss": 0.04375200346112251, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017140676209237427, + "grad_norm": 5.294946193695068, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.869128942489624, + "num_tokens": 182092450.0, + "step": 4770 + }, + { + "epoch": 0.6069202391553238, + "ewc_loss": 0.04389820992946625, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001716480910545215, + "grad_norm": 5.339709281921387, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8490958213806152, + "num_tokens": 182126398.0, + "step": 4771 + }, + { + "epoch": 0.6070474494339143, + "ewc_loss": 0.04378712922334671, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017175801622215658, + "grad_norm": 5.33690881729126, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8719329833984375, + "num_tokens": 182160848.0, + "step": 4772 + }, + { + "epoch": 0.6071746597125047, + "ewc_loss": 0.04376637190580368, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017155044770333916, + "grad_norm": 5.253757476806641, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8662744760513306, + "num_tokens": 182202596.0, + "step": 4773 + }, + { + "epoch": 0.6073018699910953, + "ewc_loss": 0.043803200125694275, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017191872757393867, + "grad_norm": 5.326929569244385, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.855197548866272, + "num_tokens": 182239235.0, + "step": 4774 + }, + { + "epoch": 0.6074290802696858, + "ewc_loss": 0.04378166049718857, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017170333012472838, + "grad_norm": 5.340180397033691, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8697958588600159, + "num_tokens": 182273662.0, + "step": 4775 + }, + { + "epoch": 0.6075562905482763, + "ewc_loss": 0.04381134361028671, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017200013098772615, + "grad_norm": 5.328457832336426, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8426936268806458, + "num_tokens": 182311991.0, + "step": 4776 + }, + { + "epoch": 0.6076835008268668, + "ewc_loss": 0.04378562420606613, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017174296954181045, + "grad_norm": 5.274016380310059, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8588959574699402, + "num_tokens": 182350722.0, + "step": 4777 + }, + { + "epoch": 0.6078107111054574, + "ewc_loss": 0.04394284263253212, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001720944419503212, + "grad_norm": 5.347895622253418, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8596736192703247, + "num_tokens": 182386755.0, + "step": 4778 + }, + { + "epoch": 0.6079379213840478, + "ewc_loss": 0.043943196535110474, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017209800716955215, + "grad_norm": 5.318343639373779, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8737116456031799, + "num_tokens": 182423128.0, + "step": 4779 + }, + { + "epoch": 0.6080651316626383, + "ewc_loss": 0.04375328868627548, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017141959688160568, + "grad_norm": 5.327807903289795, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8439927101135254, + "num_tokens": 182461518.0, + "step": 4780 + }, + { + "epoch": 0.6081923419412288, + "ewc_loss": 0.04390823841094971, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017174839740619063, + "grad_norm": 5.305687427520752, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8570802211761475, + "num_tokens": 182495707.0, + "step": 4781 + }, + { + "epoch": 0.6083195522198194, + "ewc_loss": 0.04376699775457382, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.0001715566759230569, + "grad_norm": 5.261355400085449, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8477433323860168, + "num_tokens": 182539058.0, + "step": 4782 + }, + { + "epoch": 0.6084467624984099, + "ewc_loss": 0.04374340549111366, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017132077482528985, + "grad_norm": 5.397966384887695, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8433082699775696, + "num_tokens": 182572482.0, + "step": 4783 + }, + { + "epoch": 0.6085739727770004, + "ewc_loss": 0.043838076293468475, + "ewc_loss_diag": 2.658367156982422e-05, + "ewc_loss_parallel": 0.00017226749332621694, + "grad_norm": 5.3281636238098145, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8626453876495361, + "num_tokens": 182607665.0, + "step": 4784 + }, + { + "epoch": 0.6087011830555908, + "ewc_loss": 0.04383459687232971, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001710119831841439, + "grad_norm": 5.384527683258057, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8588944673538208, + "num_tokens": 182650503.0, + "step": 4785 + }, + { + "epoch": 0.6088283933341814, + "ewc_loss": 0.04389328509569168, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017159886192530394, + "grad_norm": 5.289285182952881, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8686800599098206, + "num_tokens": 182681662.0, + "step": 4786 + }, + { + "epoch": 0.6089556036127719, + "ewc_loss": 0.04397086799144745, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017115400987677276, + "grad_norm": 5.35221004486084, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8457781076431274, + "num_tokens": 182714912.0, + "step": 4787 + }, + { + "epoch": 0.6090828138913624, + "ewc_loss": 0.0440535843372345, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017198115529026836, + "grad_norm": 5.3036723136901855, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8551957607269287, + "num_tokens": 182748813.0, + "step": 4788 + }, + { + "epoch": 0.609210024169953, + "ewc_loss": 0.043982185423374176, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017126718012150377, + "grad_norm": 5.294494152069092, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8416016101837158, + "num_tokens": 182786123.0, + "step": 4789 + }, + { + "epoch": 0.6093372344485435, + "ewc_loss": 0.04407268762588501, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017217219283338636, + "grad_norm": 5.288200855255127, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8704372048377991, + "num_tokens": 182820550.0, + "step": 4790 + }, + { + "epoch": 0.6094644447271339, + "ewc_loss": 0.04401688650250435, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001716141850920394, + "grad_norm": 5.446791172027588, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8575654625892639, + "num_tokens": 182863135.0, + "step": 4791 + }, + { + "epoch": 0.6095916550057244, + "ewc_loss": 0.04407048970460892, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017215019033756107, + "grad_norm": 5.277899742126465, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8599799871444702, + "num_tokens": 182905193.0, + "step": 4792 + }, + { + "epoch": 0.609718865284315, + "ewc_loss": 0.043956756591796875, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017101287085097283, + "grad_norm": 5.2841877937316895, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8570235967636108, + "num_tokens": 182944934.0, + "step": 4793 + }, + { + "epoch": 0.6098460755629055, + "ewc_loss": 0.04401146620512009, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017155999375972897, + "grad_norm": 5.273901462554932, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8444085717201233, + "num_tokens": 182984604.0, + "step": 4794 + }, + { + "epoch": 0.609973285841496, + "ewc_loss": 0.044007886201143265, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017152416694443673, + "grad_norm": 5.333253383636475, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8558064699172974, + "num_tokens": 183020421.0, + "step": 4795 + }, + { + "epoch": 0.6101004961200865, + "ewc_loss": 0.04387078434228897, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017137386021204293, + "grad_norm": 5.330576419830322, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.849417507648468, + "num_tokens": 183057635.0, + "step": 4796 + }, + { + "epoch": 0.610227706398677, + "ewc_loss": 0.043877191841602325, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017143793229479343, + "grad_norm": 5.304577350616455, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8451257944107056, + "num_tokens": 183094689.0, + "step": 4797 + }, + { + "epoch": 0.6103549166772675, + "ewc_loss": 0.04386617988348007, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017132781795226038, + "grad_norm": 5.300115585327148, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8627392053604126, + "num_tokens": 183129118.0, + "step": 4798 + }, + { + "epoch": 0.610482126955858, + "ewc_loss": 0.04394526034593582, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017211861268151551, + "grad_norm": 5.336299419403076, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8572051525115967, + "num_tokens": 183163365.0, + "step": 4799 + }, + { + "epoch": 0.6106093372344485, + "ewc_loss": 0.043957144021987915, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001722374581731856, + "grad_norm": 5.338709354400635, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.860847532749176, + "num_tokens": 183199271.0, + "step": 4800 + }, + { + "epoch": 0.6107365475130391, + "ewc_loss": 0.043887440115213394, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017154042143374681, + "grad_norm": 5.425117015838623, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.876673698425293, + "num_tokens": 183234523.0, + "step": 4801 + }, + { + "epoch": 0.6108637577916296, + "ewc_loss": 0.04393705725669861, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001720365835353732, + "grad_norm": 5.320461273193359, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8580067157745361, + "num_tokens": 183267098.0, + "step": 4802 + }, + { + "epoch": 0.61099096807022, + "ewc_loss": 0.043946925550699234, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001709145726636052, + "grad_norm": 5.2582011222839355, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.858320415019989, + "num_tokens": 183306825.0, + "step": 4803 + }, + { + "epoch": 0.6111181783488105, + "ewc_loss": 0.0440550222992897, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017199556168634444, + "grad_norm": 5.342283248901367, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8576010465621948, + "num_tokens": 183348712.0, + "step": 4804 + }, + { + "epoch": 0.6112453886274011, + "ewc_loss": 0.04388204216957092, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017148641927633435, + "grad_norm": 5.275132179260254, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.869576096534729, + "num_tokens": 183383797.0, + "step": 4805 + }, + { + "epoch": 0.6113725989059916, + "ewc_loss": 0.04404905065894127, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.000171935826074332, + "grad_norm": 5.344950199127197, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8691900372505188, + "num_tokens": 183417484.0, + "step": 4806 + }, + { + "epoch": 0.6114998091845821, + "ewc_loss": 0.04401836544275284, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017162894073408097, + "grad_norm": 5.298900127410889, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8638729453086853, + "num_tokens": 183453808.0, + "step": 4807 + }, + { + "epoch": 0.6116270194631727, + "ewc_loss": 0.04392851144075394, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017195110558532178, + "grad_norm": 5.279140472412109, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8641780614852905, + "num_tokens": 183496693.0, + "step": 4808 + }, + { + "epoch": 0.6117542297417632, + "ewc_loss": 0.04388626664876938, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017152869259007275, + "grad_norm": 5.277867794036865, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8607467412948608, + "num_tokens": 183536727.0, + "step": 4809 + }, + { + "epoch": 0.6118814400203536, + "ewc_loss": 0.04388341307640076, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001715001417323947, + "grad_norm": 5.327123641967773, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8586267232894897, + "num_tokens": 183571809.0, + "step": 4810 + }, + { + "epoch": 0.6120086502989441, + "ewc_loss": 0.04393163323402405, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017198236309923232, + "grad_norm": 5.262246131896973, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8575373888015747, + "num_tokens": 183610761.0, + "step": 4811 + }, + { + "epoch": 0.6121358605775347, + "ewc_loss": 0.043857939541339874, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017124542500823736, + "grad_norm": 5.364740371704102, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8551084995269775, + "num_tokens": 183648304.0, + "step": 4812 + }, + { + "epoch": 0.6122630708561252, + "ewc_loss": 0.04396234452724457, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017228945216629654, + "grad_norm": 5.289906978607178, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8534433245658875, + "num_tokens": 183684201.0, + "step": 4813 + }, + { + "epoch": 0.6123902811347157, + "ewc_loss": 0.04387325793504715, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017139861301984638, + "grad_norm": 5.280320644378662, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.851464033126831, + "num_tokens": 183726595.0, + "step": 4814 + }, + { + "epoch": 0.6125174914133062, + "ewc_loss": 0.04390568658709526, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017172287334688008, + "grad_norm": 5.322522163391113, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8670568466186523, + "num_tokens": 183762222.0, + "step": 4815 + }, + { + "epoch": 0.6126447016918967, + "ewc_loss": 0.04387760907411575, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017144209414254874, + "grad_norm": 5.356087684631348, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8707008957862854, + "num_tokens": 183797167.0, + "step": 4816 + }, + { + "epoch": 0.6127719119704872, + "ewc_loss": 0.04388369992375374, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017150300845969468, + "grad_norm": 5.296438217163086, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8565904498100281, + "num_tokens": 183839591.0, + "step": 4817 + }, + { + "epoch": 0.6128991222490777, + "ewc_loss": 0.043887797743082047, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.000171544001204893, + "grad_norm": 5.323773384094238, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8583436012268066, + "num_tokens": 183877862.0, + "step": 4818 + }, + { + "epoch": 0.6130263325276682, + "ewc_loss": 0.04389066994190216, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017157269758172333, + "grad_norm": 5.285745143890381, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8625123500823975, + "num_tokens": 183915964.0, + "step": 4819 + }, + { + "epoch": 0.6131535428062588, + "ewc_loss": 0.04387158155441284, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001713818492135033, + "grad_norm": 5.357665061950684, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8549716472625732, + "num_tokens": 183952823.0, + "step": 4820 + }, + { + "epoch": 0.6132807530848493, + "ewc_loss": 0.04388204962015152, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001714864920359105, + "grad_norm": 5.315321445465088, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8490320444107056, + "num_tokens": 183991650.0, + "step": 4821 + }, + { + "epoch": 0.6134079633634397, + "ewc_loss": 0.04388275370001793, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017149354971479625, + "grad_norm": 5.303013324737549, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8591938018798828, + "num_tokens": 184027768.0, + "step": 4822 + }, + { + "epoch": 0.6135351736420303, + "ewc_loss": 0.043943341821432114, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017209943325724453, + "grad_norm": 5.341826915740967, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8701573014259338, + "num_tokens": 184063754.0, + "step": 4823 + }, + { + "epoch": 0.6136623839206208, + "ewc_loss": 0.04383388161659241, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017100483819376677, + "grad_norm": 5.261717796325684, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8525875210762024, + "num_tokens": 184105286.0, + "step": 4824 + }, + { + "epoch": 0.6137895941992113, + "ewc_loss": 0.04390018433332443, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017166788165923208, + "grad_norm": 5.328347682952881, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8621189594268799, + "num_tokens": 184139312.0, + "step": 4825 + }, + { + "epoch": 0.6139168044778018, + "ewc_loss": 0.04390880838036537, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017175410175696015, + "grad_norm": 5.244898319244385, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8571365475654602, + "num_tokens": 184181541.0, + "step": 4826 + }, + { + "epoch": 0.6140440147563924, + "ewc_loss": 0.04390396177768707, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001717056438792497, + "grad_norm": 5.34044885635376, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.85860276222229, + "num_tokens": 184216857.0, + "step": 4827 + }, + { + "epoch": 0.6141712250349828, + "ewc_loss": 0.043951429426670074, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017218032735399902, + "grad_norm": 5.274762153625488, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8619471788406372, + "num_tokens": 184253318.0, + "step": 4828 + }, + { + "epoch": 0.6142984353135733, + "ewc_loss": 0.04392625391483307, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017192855011671782, + "grad_norm": 5.275361061096191, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8694028258323669, + "num_tokens": 184298625.0, + "step": 4829 + }, + { + "epoch": 0.6144256455921638, + "ewc_loss": 0.04394802823662758, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017214629042427987, + "grad_norm": 5.370198726654053, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8560004234313965, + "num_tokens": 184334569.0, + "step": 4830 + }, + { + "epoch": 0.6145528558707544, + "ewc_loss": 0.04396900534629822, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017235605628229678, + "grad_norm": 5.317654132843018, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8403931856155396, + "num_tokens": 184377477.0, + "step": 4831 + }, + { + "epoch": 0.6146800661493449, + "ewc_loss": 0.0439048632979393, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017171465151477605, + "grad_norm": 5.318323135375977, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8457315564155579, + "num_tokens": 184416671.0, + "step": 4832 + }, + { + "epoch": 0.6148072764279354, + "ewc_loss": 0.043947674334049225, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017214273975696415, + "grad_norm": 5.336429119110107, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8532426357269287, + "num_tokens": 184453174.0, + "step": 4833 + }, + { + "epoch": 0.6149344867065258, + "ewc_loss": 0.043948255479335785, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017214857507497072, + "grad_norm": 5.457510471343994, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8646640777587891, + "num_tokens": 184488313.0, + "step": 4834 + }, + { + "epoch": 0.6150616969851164, + "ewc_loss": 0.04395123943686485, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017217840650118887, + "grad_norm": 5.282220840454102, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8385520577430725, + "num_tokens": 184523733.0, + "step": 4835 + }, + { + "epoch": 0.6151889072637069, + "ewc_loss": 0.04394257813692093, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017209179350174963, + "grad_norm": 5.344829559326172, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.855599045753479, + "num_tokens": 184568399.0, + "step": 4836 + }, + { + "epoch": 0.6153161175422974, + "ewc_loss": 0.04396034777164459, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017226950149051845, + "grad_norm": 5.267801761627197, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8569484949111938, + "num_tokens": 184603302.0, + "step": 4837 + }, + { + "epoch": 0.615443327820888, + "ewc_loss": 0.0439542680978775, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001722086890367791, + "grad_norm": 5.347695350646973, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8778895139694214, + "num_tokens": 184637346.0, + "step": 4838 + }, + { + "epoch": 0.6155705380994785, + "ewc_loss": 0.04404665529727936, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001731325755827129, + "grad_norm": 5.386723041534424, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8646384477615356, + "num_tokens": 184671687.0, + "step": 4839 + }, + { + "epoch": 0.6156977483780689, + "ewc_loss": 0.04398268461227417, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017249287338927388, + "grad_norm": 5.329483509063721, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8679654598236084, + "num_tokens": 184706867.0, + "step": 4840 + }, + { + "epoch": 0.6158249586566594, + "ewc_loss": 0.04394134134054184, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017207942437380552, + "grad_norm": 5.306967258453369, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8535382151603699, + "num_tokens": 184743916.0, + "step": 4841 + }, + { + "epoch": 0.61595216893525, + "ewc_loss": 0.044023264199495316, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001728986535454169, + "grad_norm": 5.282044887542725, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8677463531494141, + "num_tokens": 184783247.0, + "step": 4842 + }, + { + "epoch": 0.6160793792138405, + "ewc_loss": 0.043950848281383514, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017217449203599244, + "grad_norm": 5.252496719360352, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8588811755180359, + "num_tokens": 184826501.0, + "step": 4843 + }, + { + "epoch": 0.616206589492431, + "ewc_loss": 0.044061414897441864, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017328017565887421, + "grad_norm": 5.332472324371338, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8560981750488281, + "num_tokens": 184868743.0, + "step": 4844 + }, + { + "epoch": 0.6163337997710215, + "ewc_loss": 0.044011808931827545, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001727841008687392, + "grad_norm": 5.310827732086182, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.853628396987915, + "num_tokens": 184908992.0, + "step": 4845 + }, + { + "epoch": 0.616461010049612, + "ewc_loss": 0.04401203989982605, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001727864146232605, + "grad_norm": 5.333746910095215, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8473688364028931, + "num_tokens": 184942456.0, + "step": 4846 + }, + { + "epoch": 0.6165882203282025, + "ewc_loss": 0.04406615346670151, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017332752759102732, + "grad_norm": 5.384152412414551, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8472781181335449, + "num_tokens": 184975834.0, + "step": 4847 + }, + { + "epoch": 0.616715430606793, + "ewc_loss": 0.04398896545171738, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017255565035156906, + "grad_norm": 5.292256832122803, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8806463479995728, + "num_tokens": 185012295.0, + "step": 4848 + }, + { + "epoch": 0.6168426408853835, + "ewc_loss": 0.04401297867298126, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017279582971241325, + "grad_norm": 5.328033447265625, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8619648814201355, + "num_tokens": 185049272.0, + "step": 4849 + }, + { + "epoch": 0.6169698511639741, + "ewc_loss": 0.044110603630542755, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017255134298466146, + "grad_norm": 5.327052116394043, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8689001798629761, + "num_tokens": 185081054.0, + "step": 4850 + }, + { + "epoch": 0.6170970614425646, + "ewc_loss": 0.04400426149368286, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017270863463636488, + "grad_norm": 5.32190465927124, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8543164134025574, + "num_tokens": 185119809.0, + "step": 4851 + }, + { + "epoch": 0.617224271721155, + "ewc_loss": 0.043984975665807724, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017251577810384333, + "grad_norm": 5.267978668212891, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8682452440261841, + "num_tokens": 185161462.0, + "step": 4852 + }, + { + "epoch": 0.6173514819997455, + "ewc_loss": 0.044021300971508026, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017287902301177382, + "grad_norm": 5.41184663772583, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8496488928794861, + "num_tokens": 185196238.0, + "step": 4853 + }, + { + "epoch": 0.6174786922783361, + "ewc_loss": 0.044150322675704956, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001729485229589045, + "grad_norm": 5.334184646606445, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8578773736953735, + "num_tokens": 185239317.0, + "step": 4854 + }, + { + "epoch": 0.6176059025569266, + "ewc_loss": 0.04394269734621048, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017209298675879836, + "grad_norm": 5.329258441925049, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8675764799118042, + "num_tokens": 185273555.0, + "step": 4855 + }, + { + "epoch": 0.6177331128355171, + "ewc_loss": 0.04398311302065849, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001724971371004358, + "grad_norm": 5.30672550201416, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8655033707618713, + "num_tokens": 185309721.0, + "step": 4856 + }, + { + "epoch": 0.6178603231141077, + "ewc_loss": 0.04398874193429947, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001725534093566239, + "grad_norm": 5.305652618408203, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8511854410171509, + "num_tokens": 185350846.0, + "step": 4857 + }, + { + "epoch": 0.6179875333926982, + "ewc_loss": 0.043993767350912094, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001726036862237379, + "grad_norm": 5.277241230010986, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8508277535438538, + "num_tokens": 185392472.0, + "step": 4858 + }, + { + "epoch": 0.6181147436712886, + "ewc_loss": 0.04398253560066223, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017249137454200536, + "grad_norm": 5.302396297454834, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8548427224159241, + "num_tokens": 185438250.0, + "step": 4859 + }, + { + "epoch": 0.6182419539498791, + "ewc_loss": 0.04413759335875511, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001728212519083172, + "grad_norm": 5.330343246459961, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8552103042602539, + "num_tokens": 185479455.0, + "step": 4860 + }, + { + "epoch": 0.6183691642284697, + "ewc_loss": 0.04399088770151138, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017257488798350096, + "grad_norm": 5.335402965545654, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8542750477790833, + "num_tokens": 185518719.0, + "step": 4861 + }, + { + "epoch": 0.6184963745070602, + "ewc_loss": 0.04401867464184761, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017285275680478662, + "grad_norm": 5.307066917419434, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8435050845146179, + "num_tokens": 185556671.0, + "step": 4862 + }, + { + "epoch": 0.6186235847856507, + "ewc_loss": 0.04399493336677551, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017261535685975105, + "grad_norm": 5.298943519592285, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8554307818412781, + "num_tokens": 185595063.0, + "step": 4863 + }, + { + "epoch": 0.6187507950642412, + "ewc_loss": 0.04405142366886139, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001731802331050858, + "grad_norm": 5.294939041137695, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8489366769790649, + "num_tokens": 185635626.0, + "step": 4864 + }, + { + "epoch": 0.6188780053428317, + "ewc_loss": 0.044076815247535706, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017343414947390556, + "grad_norm": 5.341194152832031, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8477235436439514, + "num_tokens": 185671687.0, + "step": 4865 + }, + { + "epoch": 0.6190052156214222, + "ewc_loss": 0.04404976963996887, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017316368757747114, + "grad_norm": 5.396130561828613, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8612211346626282, + "num_tokens": 185706459.0, + "step": 4866 + }, + { + "epoch": 0.6191324259000127, + "ewc_loss": 0.044044382870197296, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017310983093921095, + "grad_norm": 5.298316955566406, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8767467737197876, + "num_tokens": 185743162.0, + "step": 4867 + }, + { + "epoch": 0.6192596361786032, + "ewc_loss": 0.04404694214463234, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017313542775809765, + "grad_norm": 5.311970233917236, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8431565761566162, + "num_tokens": 185784036.0, + "step": 4868 + }, + { + "epoch": 0.6193868464571938, + "ewc_loss": 0.044032543897628784, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017299143655691296, + "grad_norm": 5.331350803375244, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8544352054595947, + "num_tokens": 185821407.0, + "step": 4869 + }, + { + "epoch": 0.6195140567357843, + "ewc_loss": 0.04403024911880493, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017296850273851305, + "grad_norm": 5.3270440101623535, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8633136749267578, + "num_tokens": 185857789.0, + "step": 4870 + }, + { + "epoch": 0.6196412670143747, + "ewc_loss": 0.044065214693546295, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017331814160570502, + "grad_norm": 5.305666446685791, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.869574248790741, + "num_tokens": 185896989.0, + "step": 4871 + }, + { + "epoch": 0.6197684772929652, + "ewc_loss": 0.04403742402791977, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017304022912867367, + "grad_norm": 5.334322452545166, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8721619844436646, + "num_tokens": 185938123.0, + "step": 4872 + }, + { + "epoch": 0.6198956875715558, + "ewc_loss": 0.04400825873017311, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001727486087474972, + "grad_norm": 5.321954250335693, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8581077456474304, + "num_tokens": 185973440.0, + "step": 4873 + }, + { + "epoch": 0.6200228978501463, + "ewc_loss": 0.04401776194572449, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001728436618577689, + "grad_norm": 5.334835529327393, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8590244054794312, + "num_tokens": 186010069.0, + "step": 4874 + }, + { + "epoch": 0.6201501081287368, + "ewc_loss": 0.0439969003200531, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017263500194530934, + "grad_norm": 5.321016788482666, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8520858287811279, + "num_tokens": 186048365.0, + "step": 4875 + }, + { + "epoch": 0.6202773184073274, + "ewc_loss": 0.044045090675354004, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017311690317001194, + "grad_norm": 5.367520809173584, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8638098239898682, + "num_tokens": 186085358.0, + "step": 4876 + }, + { + "epoch": 0.6204045286859178, + "ewc_loss": 0.04401988536119461, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001728648494463414, + "grad_norm": 5.371680736541748, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8625882863998413, + "num_tokens": 186118476.0, + "step": 4877 + }, + { + "epoch": 0.6205317389645083, + "ewc_loss": 0.0440346822142601, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017301285697612911, + "grad_norm": 5.293756008148193, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8537641763687134, + "num_tokens": 186155487.0, + "step": 4878 + }, + { + "epoch": 0.6206589492430988, + "ewc_loss": 0.04401431605219841, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017280917381867766, + "grad_norm": 5.337697505950928, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8640839457511902, + "num_tokens": 186192941.0, + "step": 4879 + }, + { + "epoch": 0.6207861595216894, + "ewc_loss": 0.04404980689287186, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017316409503109753, + "grad_norm": 5.311208248138428, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8617583513259888, + "num_tokens": 186233860.0, + "step": 4880 + }, + { + "epoch": 0.6209133698002799, + "ewc_loss": 0.0440269336104393, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017293535347562283, + "grad_norm": 5.410863876342773, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8469412326812744, + "num_tokens": 186273081.0, + "step": 4881 + }, + { + "epoch": 0.6210405800788704, + "ewc_loss": 0.04404173791408539, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017308341921307147, + "grad_norm": 5.312872886657715, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8635551333427429, + "num_tokens": 186310596.0, + "step": 4882 + }, + { + "epoch": 0.6211677903574608, + "ewc_loss": 0.044055551290512085, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001732215314405039, + "grad_norm": 5.379443168640137, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8533514738082886, + "num_tokens": 186346262.0, + "step": 4883 + }, + { + "epoch": 0.6212950006360514, + "ewc_loss": 0.0440281480550766, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017294751887675375, + "grad_norm": 5.345144748687744, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8593959808349609, + "num_tokens": 186386508.0, + "step": 4884 + }, + { + "epoch": 0.6214222109146419, + "ewc_loss": 0.04402012377977371, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.0001728672650642693, + "grad_norm": 5.395411014556885, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8533217906951904, + "num_tokens": 186425184.0, + "step": 4885 + }, + { + "epoch": 0.6215494211932324, + "ewc_loss": 0.044149719178676605, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001729425130179152, + "grad_norm": 5.303691864013672, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8567723631858826, + "num_tokens": 186463072.0, + "step": 4886 + }, + { + "epoch": 0.621676631471823, + "ewc_loss": 0.04410210996866226, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017246638890355825, + "grad_norm": 5.326742649078369, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8683640956878662, + "num_tokens": 186504062.0, + "step": 4887 + }, + { + "epoch": 0.6218038417504135, + "ewc_loss": 0.044077832251787186, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017344433581456542, + "grad_norm": 5.374915599822998, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.848078727722168, + "num_tokens": 186544564.0, + "step": 4888 + }, + { + "epoch": 0.6219310520290039, + "ewc_loss": 0.04400699585676193, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017273596313316375, + "grad_norm": 5.332857131958008, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8438934087753296, + "num_tokens": 186587196.0, + "step": 4889 + }, + { + "epoch": 0.6220582623075944, + "ewc_loss": 0.0441334992647171, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017278030281886458, + "grad_norm": 5.444239139556885, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8588647246360779, + "num_tokens": 186620673.0, + "step": 4890 + }, + { + "epoch": 0.622185472586185, + "ewc_loss": 0.04399552196264267, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017262122128158808, + "grad_norm": 5.308452606201172, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8613578081130981, + "num_tokens": 186654028.0, + "step": 4891 + }, + { + "epoch": 0.6223126828647755, + "ewc_loss": 0.04401056095957756, + "ewc_loss_diag": 2.6702880859375e-05, + "ewc_loss_parallel": 0.00017277162987738848, + "grad_norm": 5.347678184509277, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8554921746253967, + "num_tokens": 186693395.0, + "step": 4892 + }, + { + "epoch": 0.622439893143366, + "ewc_loss": 0.04412221163511276, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017266742361243814, + "grad_norm": 5.331111431121826, + "learning_rate": 1e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8405537605285645, + "num_tokens": 186734440.0, + "step": 4893 + }, + { + "epoch": 0.6225671034219565, + "ewc_loss": 0.04415936395525932, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017303894856013358, + "grad_norm": 5.369478225708008, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8611771464347839, + "num_tokens": 186769637.0, + "step": 4894 + }, + { + "epoch": 0.622694313700547, + "ewc_loss": 0.04415608197450638, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001730061339912936, + "grad_norm": 5.32187557220459, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8542015552520752, + "num_tokens": 186810188.0, + "step": 4895 + }, + { + "epoch": 0.6228215239791375, + "ewc_loss": 0.04413134977221489, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017275880964007229, + "grad_norm": 5.35521936416626, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8485425710678101, + "num_tokens": 186847512.0, + "step": 4896 + }, + { + "epoch": 0.622948734257728, + "ewc_loss": 0.04414387792348862, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017288407252635807, + "grad_norm": 5.377674102783203, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8661531209945679, + "num_tokens": 186878721.0, + "step": 4897 + }, + { + "epoch": 0.6230759445363185, + "ewc_loss": 0.04412924870848656, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017273779667448252, + "grad_norm": 5.328861713409424, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8623887300491333, + "num_tokens": 186920239.0, + "step": 4898 + }, + { + "epoch": 0.6232031548149091, + "ewc_loss": 0.044165294617414474, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001730982621666044, + "grad_norm": 5.330768585205078, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8514750599861145, + "num_tokens": 186964158.0, + "step": 4899 + }, + { + "epoch": 0.6233303650934996, + "ewc_loss": 0.04415535926818848, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017299893079325557, + "grad_norm": 5.31427526473999, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8735197186470032, + "num_tokens": 187003176.0, + "step": 4900 + }, + { + "epoch": 0.62345757537209, + "ewc_loss": 0.04414403438568115, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001728856732370332, + "grad_norm": 5.31767463684082, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8572105169296265, + "num_tokens": 187046152.0, + "step": 4901 + }, + { + "epoch": 0.6235847856506805, + "ewc_loss": 0.04419545456767082, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017339986516162753, + "grad_norm": 5.336342811584473, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8693649768829346, + "num_tokens": 187086629.0, + "step": 4902 + }, + { + "epoch": 0.6237119959292711, + "ewc_loss": 0.044183678925037384, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001732821110635996, + "grad_norm": 5.411170482635498, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8518827557563782, + "num_tokens": 187122856.0, + "step": 4903 + }, + { + "epoch": 0.6238392062078616, + "ewc_loss": 0.044193465262651443, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017337997269351035, + "grad_norm": 5.291277885437012, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8710201978683472, + "num_tokens": 187161568.0, + "step": 4904 + }, + { + "epoch": 0.6239664164864521, + "ewc_loss": 0.04412338137626648, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001726791524561122, + "grad_norm": 5.3235554695129395, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8586203455924988, + "num_tokens": 187202831.0, + "step": 4905 + }, + { + "epoch": 0.6240936267650427, + "ewc_loss": 0.04420678690075874, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017351318092551082, + "grad_norm": 5.301631450653076, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.864342451095581, + "num_tokens": 187239059.0, + "step": 4906 + }, + { + "epoch": 0.6242208370436332, + "ewc_loss": 0.044246166944503784, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017390699940733612, + "grad_norm": 5.416100025177002, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.852150559425354, + "num_tokens": 187272940.0, + "step": 4907 + }, + { + "epoch": 0.6243480473222236, + "ewc_loss": 0.04420706257224083, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017351593123748899, + "grad_norm": 5.358044624328613, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8713240623474121, + "num_tokens": 187310963.0, + "step": 4908 + }, + { + "epoch": 0.6244752576008141, + "ewc_loss": 0.04415109381079674, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017295625002589077, + "grad_norm": 5.355734825134277, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8772182464599609, + "num_tokens": 187345698.0, + "step": 4909 + }, + { + "epoch": 0.6246024678794047, + "ewc_loss": 0.0442250519990921, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017369582201354206, + "grad_norm": 5.384902000427246, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8552927374839783, + "num_tokens": 187384762.0, + "step": 4910 + }, + { + "epoch": 0.6247296781579952, + "ewc_loss": 0.04409605637192726, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001724058820400387, + "grad_norm": 5.308908462524414, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8471782207489014, + "num_tokens": 187424288.0, + "step": 4911 + }, + { + "epoch": 0.6248568884365857, + "ewc_loss": 0.044177308678627014, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017321841733064502, + "grad_norm": 5.3393778800964355, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8664523363113403, + "num_tokens": 187461481.0, + "step": 4912 + }, + { + "epoch": 0.6249840987151762, + "ewc_loss": 0.04417271912097931, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017317250603809953, + "grad_norm": 5.329663276672363, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8730134963989258, + "num_tokens": 187497690.0, + "step": 4913 + }, + { + "epoch": 0.6251113089937667, + "ewc_loss": 0.044279083609580994, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017423616372980177, + "grad_norm": 5.362115383148193, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8526730537414551, + "num_tokens": 187537825.0, + "step": 4914 + }, + { + "epoch": 0.6252385192723572, + "ewc_loss": 0.044287439435720444, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017431970627512783, + "grad_norm": 5.28282356262207, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8680983781814575, + "num_tokens": 187583998.0, + "step": 4915 + }, + { + "epoch": 0.6253657295509477, + "ewc_loss": 0.044248584657907486, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001739311555866152, + "grad_norm": 5.382269859313965, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8567301630973816, + "num_tokens": 187621962.0, + "step": 4916 + }, + { + "epoch": 0.6254929398295382, + "ewc_loss": 0.04433943331241608, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001748396607581526, + "grad_norm": 5.379726409912109, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8652196526527405, + "num_tokens": 187655913.0, + "step": 4917 + }, + { + "epoch": 0.6256201501081288, + "ewc_loss": 0.04422386735677719, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017368396220263094, + "grad_norm": 5.378334999084473, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8491668701171875, + "num_tokens": 187692433.0, + "step": 4918 + }, + { + "epoch": 0.6257473603867193, + "ewc_loss": 0.04430525749921799, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017449787992518395, + "grad_norm": 5.471035003662109, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8693810701370239, + "num_tokens": 187724311.0, + "step": 4919 + }, + { + "epoch": 0.6258745706653097, + "ewc_loss": 0.044190630316734314, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017335161101073027, + "grad_norm": 5.313649654388428, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8658039569854736, + "num_tokens": 187766614.0, + "step": 4920 + }, + { + "epoch": 0.6260017809439002, + "ewc_loss": 0.04418347030878067, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017328003013972193, + "grad_norm": 5.38686990737915, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8697819709777832, + "num_tokens": 187804847.0, + "step": 4921 + }, + { + "epoch": 0.6261289912224908, + "ewc_loss": 0.04426771402359009, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017412242596037686, + "grad_norm": 5.421616077423096, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8593205213546753, + "num_tokens": 187842791.0, + "step": 4922 + }, + { + "epoch": 0.6262562015010813, + "ewc_loss": 0.04414128512144089, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017285815556533635, + "grad_norm": 5.40411376953125, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8488918542861938, + "num_tokens": 187878304.0, + "step": 4923 + }, + { + "epoch": 0.6263834117796718, + "ewc_loss": 0.044225260615348816, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017369791748933494, + "grad_norm": 5.461000919342041, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.860231876373291, + "num_tokens": 187915022.0, + "step": 4924 + }, + { + "epoch": 0.6265106220582624, + "ewc_loss": 0.04417960345745087, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017324132204521447, + "grad_norm": 5.3623247146606445, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.858218252658844, + "num_tokens": 187950533.0, + "step": 4925 + }, + { + "epoch": 0.6266378323368528, + "ewc_loss": 0.04415926709771156, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001730379881337285, + "grad_norm": 5.35710334777832, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8548064231872559, + "num_tokens": 187989893.0, + "step": 4926 + }, + { + "epoch": 0.6267650426154433, + "ewc_loss": 0.04419513791799545, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017339667829219252, + "grad_norm": 5.40334939956665, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8521870374679565, + "num_tokens": 188025031.0, + "step": 4927 + }, + { + "epoch": 0.6268922528940338, + "ewc_loss": 0.0442362017929554, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017380733333993703, + "grad_norm": 5.331437587738037, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.851355254650116, + "num_tokens": 188060797.0, + "step": 4928 + }, + { + "epoch": 0.6270194631726244, + "ewc_loss": 0.04419828578829765, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001734281686367467, + "grad_norm": 5.4260969161987305, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8509905934333801, + "num_tokens": 188096271.0, + "step": 4929 + }, + { + "epoch": 0.6271466734512149, + "ewc_loss": 0.04426867514848709, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017413207388017327, + "grad_norm": 5.330402374267578, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8589363694190979, + "num_tokens": 188136697.0, + "step": 4930 + }, + { + "epoch": 0.6272738837298054, + "ewc_loss": 0.04426053166389465, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017405061225872487, + "grad_norm": 5.3627238273620605, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8563998937606812, + "num_tokens": 188174715.0, + "step": 4931 + }, + { + "epoch": 0.6274010940083958, + "ewc_loss": 0.04436440020799637, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017386861145496368, + "grad_norm": 5.359685897827148, + "learning_rate": 1e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8364460468292236, + "num_tokens": 188216379.0, + "step": 4932 + }, + { + "epoch": 0.6275283042869864, + "ewc_loss": 0.04427375644445419, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017418286006432027, + "grad_norm": 5.317981243133545, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8627930879592896, + "num_tokens": 188252183.0, + "step": 4933 + }, + { + "epoch": 0.6276555145655769, + "ewc_loss": 0.044519029557704926, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001741942105581984, + "grad_norm": 5.322061538696289, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8603711128234863, + "num_tokens": 188291128.0, + "step": 4934 + }, + { + "epoch": 0.6277827248441674, + "ewc_loss": 0.044428154826164246, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017450613086111844, + "grad_norm": 5.3666768074035645, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8409609198570251, + "num_tokens": 188334256.0, + "step": 4935 + }, + { + "epoch": 0.627909935122758, + "ewc_loss": 0.044542908668518066, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017443297838326544, + "grad_norm": 5.424846649169922, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8507391810417175, + "num_tokens": 188373207.0, + "step": 4936 + }, + { + "epoch": 0.6280371454013485, + "ewc_loss": 0.044542260468006134, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017442653188481927, + "grad_norm": 5.318463325500488, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8552544116973877, + "num_tokens": 188414937.0, + "step": 4937 + }, + { + "epoch": 0.6281643556799389, + "ewc_loss": 0.04455186799168587, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017452258907724172, + "grad_norm": 5.3424506187438965, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8545340299606323, + "num_tokens": 188455398.0, + "step": 4938 + }, + { + "epoch": 0.6282915659585294, + "ewc_loss": 0.044429779052734375, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017452241445425898, + "grad_norm": 5.405502796173096, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8573011159896851, + "num_tokens": 188496561.0, + "step": 4939 + }, + { + "epoch": 0.62841877623712, + "ewc_loss": 0.044549908488988876, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001745029876474291, + "grad_norm": 5.359766006469727, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8628391623497009, + "num_tokens": 188531318.0, + "step": 4940 + }, + { + "epoch": 0.6285459865157105, + "ewc_loss": 0.04453626275062561, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001743665343383327, + "grad_norm": 5.397068977355957, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8489766120910645, + "num_tokens": 188566807.0, + "step": 4941 + }, + { + "epoch": 0.628673196794301, + "ewc_loss": 0.0445333868265152, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.000174337750650011, + "grad_norm": 5.36013650894165, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8532219529151917, + "num_tokens": 188608062.0, + "step": 4942 + }, + { + "epoch": 0.6288004070728915, + "ewc_loss": 0.04450542479753494, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017405815015081316, + "grad_norm": 5.489561557769775, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8695865869522095, + "num_tokens": 188644953.0, + "step": 4943 + }, + { + "epoch": 0.628927617351482, + "ewc_loss": 0.04452434182167053, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001742473104968667, + "grad_norm": 5.284601211547852, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8732892870903015, + "num_tokens": 188687720.0, + "step": 4944 + }, + { + "epoch": 0.6290548276300725, + "ewc_loss": 0.04446089267730713, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017361283244099468, + "grad_norm": 5.386173248291016, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8512173891067505, + "num_tokens": 188730077.0, + "step": 4945 + }, + { + "epoch": 0.629182037908663, + "ewc_loss": 0.0445576012134552, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017457989451941103, + "grad_norm": 5.310962200164795, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8632375001907349, + "num_tokens": 188768306.0, + "step": 4946 + }, + { + "epoch": 0.6293092481872535, + "ewc_loss": 0.044480107724666595, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017380497592967004, + "grad_norm": 5.3763041496276855, + "learning_rate": 1e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8378033638000488, + "num_tokens": 188807480.0, + "step": 4947 + }, + { + "epoch": 0.6294364584658441, + "ewc_loss": 0.04455899819731712, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001745938789099455, + "grad_norm": 5.487419128417969, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8471993207931519, + "num_tokens": 188836383.0, + "step": 4948 + }, + { + "epoch": 0.6295636687444346, + "ewc_loss": 0.044506847858428955, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017407239647582173, + "grad_norm": 5.266868591308594, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8609286546707153, + "num_tokens": 188876079.0, + "step": 4949 + }, + { + "epoch": 0.629690879023025, + "ewc_loss": 0.04450243338942528, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017402824596501887, + "grad_norm": 5.369072914123535, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8553237318992615, + "num_tokens": 188912936.0, + "step": 4950 + }, + { + "epoch": 0.6298180893016155, + "ewc_loss": 0.044592708349227905, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017493100313004106, + "grad_norm": 5.349524021148682, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8566475510597229, + "num_tokens": 188951475.0, + "step": 4951 + }, + { + "epoch": 0.6299452995802061, + "ewc_loss": 0.0446101576089859, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017510546604171395, + "grad_norm": 5.306818962097168, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8437492847442627, + "num_tokens": 188996972.0, + "step": 4952 + }, + { + "epoch": 0.6300725098587966, + "ewc_loss": 0.04459073394536972, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001749112270772457, + "grad_norm": 5.32994270324707, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8768426179885864, + "num_tokens": 189030706.0, + "step": 4953 + }, + { + "epoch": 0.6301997201373871, + "ewc_loss": 0.04460706561803818, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017507454322185367, + "grad_norm": 5.345098972320557, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8802906274795532, + "num_tokens": 189065638.0, + "step": 4954 + }, + { + "epoch": 0.6303269304159776, + "ewc_loss": 0.044673480093479156, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017573869263287634, + "grad_norm": 5.371865272521973, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8715172410011292, + "num_tokens": 189101747.0, + "step": 4955 + }, + { + "epoch": 0.6304541406945681, + "ewc_loss": 0.04460809752345085, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017508487508166581, + "grad_norm": 5.326545238494873, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8542656898498535, + "num_tokens": 189142965.0, + "step": 4956 + }, + { + "epoch": 0.6305813509731586, + "ewc_loss": 0.04459493234753609, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017495323845651, + "grad_norm": 5.374081611633301, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8627891540527344, + "num_tokens": 189177822.0, + "step": 4957 + }, + { + "epoch": 0.6307085612517491, + "ewc_loss": 0.0446198508143425, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017520242545288056, + "grad_norm": 5.3534440994262695, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8525571227073669, + "num_tokens": 189215710.0, + "step": 4958 + }, + { + "epoch": 0.6308357715303397, + "ewc_loss": 0.044573165476322174, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017473558546043932, + "grad_norm": 5.4185919761657715, + "learning_rate": 1e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8309683799743652, + "num_tokens": 189250788.0, + "step": 4959 + }, + { + "epoch": 0.6309629818089302, + "ewc_loss": 0.04463697224855423, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017537364328745753, + "grad_norm": 5.453921794891357, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8552783131599426, + "num_tokens": 189278451.0, + "step": 4960 + }, + { + "epoch": 0.6310901920875207, + "ewc_loss": 0.04457876831293106, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017479161033406854, + "grad_norm": 5.292236804962158, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8618783950805664, + "num_tokens": 189316857.0, + "step": 4961 + }, + { + "epoch": 0.6312174023661112, + "ewc_loss": 0.04458269476890564, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017483082774560899, + "grad_norm": 5.385410785675049, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8691689968109131, + "num_tokens": 189356600.0, + "step": 4962 + }, + { + "epoch": 0.6313446126447017, + "ewc_loss": 0.04436568170785904, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017510211910121143, + "grad_norm": 5.359140396118164, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8525736331939697, + "num_tokens": 189393134.0, + "step": 4963 + }, + { + "epoch": 0.6314718229232922, + "ewc_loss": 0.04461430385708809, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017514695355203003, + "grad_norm": 5.4357194900512695, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8427948951721191, + "num_tokens": 189433431.0, + "step": 4964 + }, + { + "epoch": 0.6315990332018827, + "ewc_loss": 0.04434039443731308, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017484926502220333, + "grad_norm": 5.316490650177002, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8430920839309692, + "num_tokens": 189473738.0, + "step": 4965 + }, + { + "epoch": 0.6317262434804732, + "ewc_loss": 0.04457390680909157, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017474297783337533, + "grad_norm": 5.377929210662842, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8719150424003601, + "num_tokens": 189509930.0, + "step": 4966 + }, + { + "epoch": 0.6318534537590638, + "ewc_loss": 0.04459380358457565, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017494196072220802, + "grad_norm": 5.323331832885742, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8634409308433533, + "num_tokens": 189547801.0, + "step": 4967 + }, + { + "epoch": 0.6319806640376543, + "ewc_loss": 0.044603072106838226, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001750346418702975, + "grad_norm": 5.393950939178467, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8711232542991638, + "num_tokens": 189582226.0, + "step": 4968 + }, + { + "epoch": 0.6321078743162447, + "ewc_loss": 0.04462805390357971, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001752844691509381, + "grad_norm": 5.341903209686279, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8573113679885864, + "num_tokens": 189622915.0, + "step": 4969 + }, + { + "epoch": 0.6322350845948352, + "ewc_loss": 0.04459980130195618, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001750019146129489, + "grad_norm": 5.366822242736816, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8658750057220459, + "num_tokens": 189658182.0, + "step": 4970 + }, + { + "epoch": 0.6323622948734258, + "ewc_loss": 0.04462272673845291, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017523116548545659, + "grad_norm": 5.374904155731201, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8601152896881104, + "num_tokens": 189696627.0, + "step": 4971 + }, + { + "epoch": 0.6324895051520163, + "ewc_loss": 0.04466751217842102, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017567902978044003, + "grad_norm": 5.435873985290527, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8560982346534729, + "num_tokens": 189729826.0, + "step": 4972 + }, + { + "epoch": 0.6326167154306068, + "ewc_loss": 0.044598840177059174, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001749923249008134, + "grad_norm": 5.357705116271973, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8630483150482178, + "num_tokens": 189771504.0, + "step": 4973 + }, + { + "epoch": 0.6327439257091974, + "ewc_loss": 0.044620148837566376, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017520540859550238, + "grad_norm": 5.347099781036377, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8760112524032593, + "num_tokens": 189809220.0, + "step": 4974 + }, + { + "epoch": 0.6328711359877878, + "ewc_loss": 0.04458325356245041, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017483641568105668, + "grad_norm": 5.389063358306885, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8659906387329102, + "num_tokens": 189849586.0, + "step": 4975 + }, + { + "epoch": 0.6329983462663783, + "ewc_loss": 0.04460491985082626, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017505312280263752, + "grad_norm": 5.417396068572998, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8593882322311401, + "num_tokens": 189885054.0, + "step": 4976 + }, + { + "epoch": 0.6331255565449688, + "ewc_loss": 0.04462166130542755, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001752204989315942, + "grad_norm": 5.427530288696289, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8515337705612183, + "num_tokens": 189918286.0, + "step": 4977 + }, + { + "epoch": 0.6332527668235594, + "ewc_loss": 0.04458129405975342, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017481687245890498, + "grad_norm": 5.328601837158203, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.839584231376648, + "num_tokens": 189956845.0, + "step": 4978 + }, + { + "epoch": 0.6333799771021499, + "ewc_loss": 0.044632166624069214, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017532557831145823, + "grad_norm": 5.386809349060059, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8749097585678101, + "num_tokens": 189994827.0, + "step": 4979 + }, + { + "epoch": 0.6335071873807404, + "ewc_loss": 0.04462384060025215, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017524231225252151, + "grad_norm": 5.4180073738098145, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8533217906951904, + "num_tokens": 190036066.0, + "step": 4980 + }, + { + "epoch": 0.6336343976593308, + "ewc_loss": 0.04463372379541397, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017534111975692213, + "grad_norm": 5.379726886749268, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8697383403778076, + "num_tokens": 190073326.0, + "step": 4981 + }, + { + "epoch": 0.6337616079379214, + "ewc_loss": 0.0445544570684433, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017454846238251776, + "grad_norm": 5.34209680557251, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8565723299980164, + "num_tokens": 190109125.0, + "step": 4982 + }, + { + "epoch": 0.6338888182165119, + "ewc_loss": 0.04458307474851608, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017483464034739882, + "grad_norm": 5.3661580085754395, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8638632893562317, + "num_tokens": 190148520.0, + "step": 4983 + }, + { + "epoch": 0.6340160284951024, + "ewc_loss": 0.044643811881542206, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017544202273711562, + "grad_norm": 5.417550086975098, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8616639375686646, + "num_tokens": 190184272.0, + "step": 4984 + }, + { + "epoch": 0.6341432387736929, + "ewc_loss": 0.04459064453840256, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.000174910353962332, + "grad_norm": 5.400880813598633, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8576046228408813, + "num_tokens": 190221483.0, + "step": 4985 + }, + { + "epoch": 0.6342704490522835, + "ewc_loss": 0.04457949101924896, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017479884263593704, + "grad_norm": 5.3111982345581055, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8704251646995544, + "num_tokens": 190266430.0, + "step": 4986 + }, + { + "epoch": 0.6343976593308739, + "ewc_loss": 0.044580888003110886, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001748127833707258, + "grad_norm": 5.354398727416992, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8708208799362183, + "num_tokens": 190308575.0, + "step": 4987 + }, + { + "epoch": 0.6345248696094644, + "ewc_loss": 0.044582221657037735, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017482612747699022, + "grad_norm": 5.347148895263672, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8681079149246216, + "num_tokens": 190344584.0, + "step": 4988 + }, + { + "epoch": 0.6346520798880549, + "ewc_loss": 0.04459936171770096, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001749975053826347, + "grad_norm": 5.421868801116943, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8671976327896118, + "num_tokens": 190376497.0, + "step": 4989 + }, + { + "epoch": 0.6347792901666455, + "ewc_loss": 0.044630542397499084, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017530932382214814, + "grad_norm": 5.392088890075684, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.848060131072998, + "num_tokens": 190416879.0, + "step": 4990 + }, + { + "epoch": 0.634906500445236, + "ewc_loss": 0.044564783573150635, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017465173732489347, + "grad_norm": 5.342103958129883, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.879511833190918, + "num_tokens": 190454684.0, + "step": 4991 + }, + { + "epoch": 0.6350337107238265, + "ewc_loss": 0.044540852308273315, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017441240197513252, + "grad_norm": 5.482726573944092, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8510969877243042, + "num_tokens": 190490524.0, + "step": 4992 + }, + { + "epoch": 0.635160921002417, + "ewc_loss": 0.044542908668518066, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017443299293518066, + "grad_norm": 5.337422847747803, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8653489947319031, + "num_tokens": 190526558.0, + "step": 4993 + }, + { + "epoch": 0.6352881312810075, + "ewc_loss": 0.0442889928817749, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001743352331686765, + "grad_norm": 5.353374004364014, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8594026565551758, + "num_tokens": 190571206.0, + "step": 4994 + }, + { + "epoch": 0.635415341559598, + "ewc_loss": 0.04428618401288986, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.0001743071770761162, + "grad_norm": 5.32106876373291, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8474839329719543, + "num_tokens": 190611480.0, + "step": 4995 + }, + { + "epoch": 0.6355425518381885, + "ewc_loss": 0.044281646609306335, + "ewc_loss_diag": 2.682209014892578e-05, + "ewc_loss_parallel": 0.00017426178965251893, + "grad_norm": 5.433285713195801, + "learning_rate": 1e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.8312118053436279, + "num_tokens": 190647699.0, + "step": 4996 + }, + { + "epoch": 0.6356697621167791, + "ewc_loss": 0.04457789659500122, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017478289373684675, + "grad_norm": 5.353052139282227, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8598030805587769, + "num_tokens": 190684863.0, + "step": 4997 + }, + { + "epoch": 0.6357969723953696, + "ewc_loss": 0.044566720724105835, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017467112047597766, + "grad_norm": 5.405806541442871, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8435274958610535, + "num_tokens": 190725006.0, + "step": 4998 + }, + { + "epoch": 0.63592418267396, + "ewc_loss": 0.04462224990129471, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017522642156109214, + "grad_norm": 5.424312114715576, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8557803630828857, + "num_tokens": 190757431.0, + "step": 4999 + }, + { + "epoch": 0.6360513929525505, + "ewc_loss": 0.044562697410583496, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017463088443037122, + "grad_norm": 5.384906768798828, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8634811043739319, + "num_tokens": 190793118.0, + "step": 5000 + }, + { + "epoch": 0.6361786032311411, + "ewc_loss": 0.04457196220755577, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.000174723521922715, + "grad_norm": 5.313136100769043, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.867638111114502, + "num_tokens": 190837105.0, + "step": 5001 + }, + { + "epoch": 0.6363058135097316, + "ewc_loss": 0.04460645839571953, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017506848962511867, + "grad_norm": 5.45651388168335, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8531608581542969, + "num_tokens": 190875372.0, + "step": 5002 + }, + { + "epoch": 0.6364330237883221, + "ewc_loss": 0.04460133984684944, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001750173105392605, + "grad_norm": 5.342552661895752, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8634631633758545, + "num_tokens": 190921307.0, + "step": 5003 + }, + { + "epoch": 0.6365602340669126, + "ewc_loss": 0.04457809031009674, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001747848145896569, + "grad_norm": 5.3144659996032715, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.859244704246521, + "num_tokens": 190967919.0, + "step": 5004 + }, + { + "epoch": 0.6366874443455031, + "ewc_loss": 0.044635094702243805, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017535484221298248, + "grad_norm": 5.372855186462402, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8578363060951233, + "num_tokens": 191005668.0, + "step": 5005 + }, + { + "epoch": 0.6368146546240936, + "ewc_loss": 0.04458830505609512, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017488693993072957, + "grad_norm": 5.327171325683594, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8476365804672241, + "num_tokens": 191044455.0, + "step": 5006 + }, + { + "epoch": 0.6369418649026841, + "ewc_loss": 0.04462829977273941, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001752869284246117, + "grad_norm": 5.342491626739502, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.863761305809021, + "num_tokens": 191086369.0, + "step": 5007 + }, + { + "epoch": 0.6370690751812746, + "ewc_loss": 0.04467125982046127, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017571653006598353, + "grad_norm": 5.390233993530273, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8537930250167847, + "num_tokens": 191127070.0, + "step": 5008 + }, + { + "epoch": 0.6371962854598652, + "ewc_loss": 0.04454245790839195, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017564918380230665, + "grad_norm": 5.328702449798584, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8730294704437256, + "num_tokens": 191163623.0, + "step": 5009 + }, + { + "epoch": 0.6373234957384557, + "ewc_loss": 0.04456295073032379, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017585411842446774, + "grad_norm": 5.4232096672058105, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8547087907791138, + "num_tokens": 191196790.0, + "step": 5010 + }, + { + "epoch": 0.6374507060170462, + "ewc_loss": 0.044547103345394135, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.0001756956335157156, + "grad_norm": 5.39207649230957, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8549861907958984, + "num_tokens": 191230265.0, + "step": 5011 + }, + { + "epoch": 0.6375779162956366, + "ewc_loss": 0.044637151062488556, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001753754186211154, + "grad_norm": 5.3678297996521, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8506900072097778, + "num_tokens": 191266781.0, + "step": 5012 + }, + { + "epoch": 0.6377051265742272, + "ewc_loss": 0.044527895748615265, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017550354823470116, + "grad_norm": 5.338446617126465, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8489031791687012, + "num_tokens": 191305869.0, + "step": 5013 + }, + { + "epoch": 0.6378323368528177, + "ewc_loss": 0.04457857459783554, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017601033323444426, + "grad_norm": 5.460192680358887, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8468515276908875, + "num_tokens": 191343498.0, + "step": 5014 + }, + { + "epoch": 0.6379595471314082, + "ewc_loss": 0.04455719143152237, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.0001757965364959091, + "grad_norm": 5.399372100830078, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8433648943901062, + "num_tokens": 191383260.0, + "step": 5015 + }, + { + "epoch": 0.6380867574099988, + "ewc_loss": 0.04454634338617325, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.0001756880374159664, + "grad_norm": 5.3929948806762695, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8494300246238708, + "num_tokens": 191417350.0, + "step": 5016 + }, + { + "epoch": 0.6382139676885893, + "ewc_loss": 0.044830381870269775, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017608700727578253, + "grad_norm": 5.3529372215271, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8597290515899658, + "num_tokens": 191454336.0, + "step": 5017 + }, + { + "epoch": 0.6383411779671797, + "ewc_loss": 0.04459312558174133, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017615585238672793, + "grad_norm": 5.348685264587402, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8603084087371826, + "num_tokens": 191489977.0, + "step": 5018 + }, + { + "epoch": 0.6384683882457702, + "ewc_loss": 0.044874224811792374, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.000176525441929698, + "grad_norm": 5.433620452880859, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.848883867263794, + "num_tokens": 191524622.0, + "step": 5019 + }, + { + "epoch": 0.6385955985243608, + "ewc_loss": 0.0448589101433754, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017637228302191943, + "grad_norm": 5.355646133422852, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8476250171661377, + "num_tokens": 191565389.0, + "step": 5020 + }, + { + "epoch": 0.6387228088029513, + "ewc_loss": 0.04484295845031738, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017621276492718607, + "grad_norm": 5.34567403793335, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8778688311576843, + "num_tokens": 191604640.0, + "step": 5021 + }, + { + "epoch": 0.6388500190815418, + "ewc_loss": 0.04489175230264664, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017670070519670844, + "grad_norm": 5.338164806365967, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8566306829452515, + "num_tokens": 191651029.0, + "step": 5022 + }, + { + "epoch": 0.6389772293601323, + "ewc_loss": 0.04493432119488716, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001771264214767143, + "grad_norm": 5.428707599639893, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8606698513031006, + "num_tokens": 191690524.0, + "step": 5023 + }, + { + "epoch": 0.6391044396387228, + "ewc_loss": 0.044940970838069916, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017719290917739272, + "grad_norm": 5.36733341217041, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8538631200790405, + "num_tokens": 191733992.0, + "step": 5024 + }, + { + "epoch": 0.6392316499173133, + "ewc_loss": 0.04481720179319382, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017717589798849076, + "grad_norm": 5.3875203132629395, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8691264390945435, + "num_tokens": 191769717.0, + "step": 5025 + }, + { + "epoch": 0.6393588601959038, + "ewc_loss": 0.04494862258434296, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017726940859574825, + "grad_norm": 5.363579273223877, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8441952466964722, + "num_tokens": 191813334.0, + "step": 5026 + }, + { + "epoch": 0.6394860704744944, + "ewc_loss": 0.044821880757808685, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001772226969478652, + "grad_norm": 5.4393134117126465, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.860331118106842, + "num_tokens": 191848295.0, + "step": 5027 + }, + { + "epoch": 0.6396132807530849, + "ewc_loss": 0.04481707513332367, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017717464652378112, + "grad_norm": 5.384220600128174, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.851871132850647, + "num_tokens": 191885329.0, + "step": 5028 + }, + { + "epoch": 0.6397404910316754, + "ewc_loss": 0.04478343203663826, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.0001768382207956165, + "grad_norm": 5.450050354003906, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8710416555404663, + "num_tokens": 191918070.0, + "step": 5029 + }, + { + "epoch": 0.6398677013102658, + "ewc_loss": 0.04479457437992096, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017694965936243534, + "grad_norm": 5.340160369873047, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8622463941574097, + "num_tokens": 191966084.0, + "step": 5030 + }, + { + "epoch": 0.6399949115888564, + "ewc_loss": 0.04472213238477707, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017622520681470633, + "grad_norm": 5.452345848083496, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8576064109802246, + "num_tokens": 192004907.0, + "step": 5031 + }, + { + "epoch": 0.6401221218674469, + "ewc_loss": 0.045049965381622314, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017706214566715062, + "grad_norm": 5.428176403045654, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8404399752616882, + "num_tokens": 192042942.0, + "step": 5032 + }, + { + "epoch": 0.6402493321460374, + "ewc_loss": 0.04470646381378174, + "ewc_loss_diag": 2.7060508728027344e-05, + "ewc_loss_parallel": 0.00017606854089535773, + "grad_norm": 5.357554912567139, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.859682023525238, + "num_tokens": 192081133.0, + "step": 5033 + }, + { + "epoch": 0.6403765424246279, + "ewc_loss": 0.04485858604311943, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017636906704865396, + "grad_norm": 5.403750896453857, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8586990833282471, + "num_tokens": 192120171.0, + "step": 5034 + }, + { + "epoch": 0.6405037527032185, + "ewc_loss": 0.04483519494533539, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017613513045944273, + "grad_norm": 5.392974853515625, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8589831590652466, + "num_tokens": 192159583.0, + "step": 5035 + }, + { + "epoch": 0.6406309629818089, + "ewc_loss": 0.04486599564552307, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017644317995291203, + "grad_norm": 5.432289123535156, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8567008972167969, + "num_tokens": 192199513.0, + "step": 5036 + }, + { + "epoch": 0.6407581732603994, + "ewc_loss": 0.04486645758152008, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001764477783581242, + "grad_norm": 5.462756156921387, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8609507083892822, + "num_tokens": 192228352.0, + "step": 5037 + }, + { + "epoch": 0.6408853835389899, + "ewc_loss": 0.044844236224889755, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001762255560606718, + "grad_norm": 5.3722825050354, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8536379337310791, + "num_tokens": 192264480.0, + "step": 5038 + }, + { + "epoch": 0.6410125938175805, + "ewc_loss": 0.04485144466161728, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017629764624871314, + "grad_norm": 5.392927169799805, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8572355508804321, + "num_tokens": 192303307.0, + "step": 5039 + }, + { + "epoch": 0.641139804096171, + "ewc_loss": 0.04490964859724045, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001768796646501869, + "grad_norm": 5.384252071380615, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8609090447425842, + "num_tokens": 192342064.0, + "step": 5040 + }, + { + "epoch": 0.6412670143747615, + "ewc_loss": 0.04487569257616997, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001765401248121634, + "grad_norm": 5.371614456176758, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8651549816131592, + "num_tokens": 192379297.0, + "step": 5041 + }, + { + "epoch": 0.6413942246533519, + "ewc_loss": 0.04487321525812149, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017651537200435996, + "grad_norm": 5.395259380340576, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8620387315750122, + "num_tokens": 192416087.0, + "step": 5042 + }, + { + "epoch": 0.6415214349319425, + "ewc_loss": 0.04493061080574989, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017708931409288198, + "grad_norm": 5.3503642082214355, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.863732635974884, + "num_tokens": 192454421.0, + "step": 5043 + }, + { + "epoch": 0.641648645210533, + "ewc_loss": 0.044938139617443085, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017716459115035832, + "grad_norm": 5.375521659851074, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8556764721870422, + "num_tokens": 192493860.0, + "step": 5044 + }, + { + "epoch": 0.6417758554891235, + "ewc_loss": 0.04495672136545181, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001773504336597398, + "grad_norm": 5.374996185302734, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8495668768882751, + "num_tokens": 192533672.0, + "step": 5045 + }, + { + "epoch": 0.641903065767714, + "ewc_loss": 0.04493160545825958, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001770992821548134, + "grad_norm": 5.426258563995361, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8460121154785156, + "num_tokens": 192569569.0, + "step": 5046 + }, + { + "epoch": 0.6420302760463046, + "ewc_loss": 0.044999945908784866, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017778265464585274, + "grad_norm": 5.387705326080322, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8644610643386841, + "num_tokens": 192610716.0, + "step": 5047 + }, + { + "epoch": 0.642157486324895, + "ewc_loss": 0.04497915506362915, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017757473688106984, + "grad_norm": 5.40415096282959, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8512608408927917, + "num_tokens": 192648040.0, + "step": 5048 + }, + { + "epoch": 0.6422846966034855, + "ewc_loss": 0.04501168429851532, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001779000594979152, + "grad_norm": 5.43743371963501, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8574105501174927, + "num_tokens": 192690063.0, + "step": 5049 + }, + { + "epoch": 0.6424119068820761, + "ewc_loss": 0.04493912309408188, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017717445734888315, + "grad_norm": 5.409693717956543, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8540937304496765, + "num_tokens": 192732067.0, + "step": 5050 + }, + { + "epoch": 0.6425391171606666, + "ewc_loss": 0.04496832564473152, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017746645607985556, + "grad_norm": 5.429717063903809, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8673231601715088, + "num_tokens": 192768598.0, + "step": 5051 + }, + { + "epoch": 0.6426663274392571, + "ewc_loss": 0.04494991898536682, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001772824180079624, + "grad_norm": 5.493818283081055, + "learning_rate": 1e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8291541337966919, + "num_tokens": 192805420.0, + "step": 5052 + }, + { + "epoch": 0.6427935377178476, + "ewc_loss": 0.0449209064245224, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017699228192213923, + "grad_norm": 5.411098480224609, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8531103134155273, + "num_tokens": 192852082.0, + "step": 5053 + }, + { + "epoch": 0.6429207479964381, + "ewc_loss": 0.04484640061855316, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017624722386244684, + "grad_norm": 5.397014141082764, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8588809370994568, + "num_tokens": 192892846.0, + "step": 5054 + }, + { + "epoch": 0.6430479582750286, + "ewc_loss": 0.044891126453876495, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017669446242507547, + "grad_norm": 5.3751115798950195, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8764084577560425, + "num_tokens": 192932091.0, + "step": 5055 + }, + { + "epoch": 0.6431751685536191, + "ewc_loss": 0.04493524134159088, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001771356037352234, + "grad_norm": 5.5236992835998535, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8595573306083679, + "num_tokens": 192960628.0, + "step": 5056 + }, + { + "epoch": 0.6433023788322096, + "ewc_loss": 0.04490984231233597, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017688160005491227, + "grad_norm": 5.387044906616211, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8517787456512451, + "num_tokens": 193002075.0, + "step": 5057 + }, + { + "epoch": 0.6434295891108002, + "ewc_loss": 0.04488808661699295, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017666407802607864, + "grad_norm": 5.430765151977539, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8480541706085205, + "num_tokens": 193039720.0, + "step": 5058 + }, + { + "epoch": 0.6435567993893907, + "ewc_loss": 0.044888392090797424, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017666714848019183, + "grad_norm": 5.470836162567139, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8593245148658752, + "num_tokens": 193072806.0, + "step": 5059 + }, + { + "epoch": 0.6436840096679812, + "ewc_loss": 0.04466285929083824, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017685320926830173, + "grad_norm": 5.427033424377441, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8670939803123474, + "num_tokens": 193108478.0, + "step": 5060 + }, + { + "epoch": 0.6438112199465716, + "ewc_loss": 0.044863585382699966, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001764190528774634, + "grad_norm": 5.365452766418457, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8625211119651794, + "num_tokens": 193150663.0, + "step": 5061 + }, + { + "epoch": 0.6439384302251622, + "ewc_loss": 0.04489832744002342, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017676647985354066, + "grad_norm": 5.413099765777588, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8616763949394226, + "num_tokens": 193191499.0, + "step": 5062 + }, + { + "epoch": 0.6440656405037527, + "ewc_loss": 0.04489009082317352, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017668413056526333, + "grad_norm": 5.379480361938477, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8491899371147156, + "num_tokens": 193236103.0, + "step": 5063 + }, + { + "epoch": 0.6441928507823432, + "ewc_loss": 0.044660747051239014, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017683207988739014, + "grad_norm": 5.482495307922363, + "learning_rate": 1e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.8367738723754883, + "num_tokens": 193274095.0, + "step": 5064 + }, + { + "epoch": 0.6443200610609338, + "ewc_loss": 0.044606566429138184, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017629026842769235, + "grad_norm": 5.348298072814941, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8635642528533936, + "num_tokens": 193316988.0, + "step": 5065 + }, + { + "epoch": 0.6444472713395243, + "ewc_loss": 0.04490568861365318, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017684008344076574, + "grad_norm": 5.415311813354492, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8561720848083496, + "num_tokens": 193357430.0, + "step": 5066 + }, + { + "epoch": 0.6445744816181147, + "ewc_loss": 0.044885069131851196, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.000176633897353895, + "grad_norm": 5.401206970214844, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8667545914649963, + "num_tokens": 193390392.0, + "step": 5067 + }, + { + "epoch": 0.6447016918967052, + "ewc_loss": 0.04489073529839516, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017669054795987904, + "grad_norm": 5.374063968658447, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8562508225440979, + "num_tokens": 193430961.0, + "step": 5068 + }, + { + "epoch": 0.6448289021752958, + "ewc_loss": 0.044666796922683716, + "ewc_loss_diag": 2.6941299438476562e-05, + "ewc_loss_parallel": 0.00017689260130282491, + "grad_norm": 5.376476287841797, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8562943935394287, + "num_tokens": 193473767.0, + "step": 5069 + }, + { + "epoch": 0.6449561124538863, + "ewc_loss": 0.04490039497613907, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017678715812508017, + "grad_norm": 5.421017646789551, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8470736145973206, + "num_tokens": 193511375.0, + "step": 5070 + }, + { + "epoch": 0.6450833227324768, + "ewc_loss": 0.04494420439004898, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017722522898111492, + "grad_norm": 5.341014385223389, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8689147233963013, + "num_tokens": 193551296.0, + "step": 5071 + }, + { + "epoch": 0.6452105330110673, + "ewc_loss": 0.044974036514759064, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017752358689904213, + "grad_norm": 5.412433624267578, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8616035580635071, + "num_tokens": 193596484.0, + "step": 5072 + }, + { + "epoch": 0.6453377432896578, + "ewc_loss": 0.04495249316096306, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017730813124217093, + "grad_norm": 5.450399875640869, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8644340634346008, + "num_tokens": 193637513.0, + "step": 5073 + }, + { + "epoch": 0.6454649535682483, + "ewc_loss": 0.04498705267906189, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017765372467692941, + "grad_norm": 5.4376397132873535, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8526439666748047, + "num_tokens": 193672808.0, + "step": 5074 + }, + { + "epoch": 0.6455921638468388, + "ewc_loss": 0.0449647530913353, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017743074567988515, + "grad_norm": 5.395505428314209, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8732237815856934, + "num_tokens": 193713018.0, + "step": 5075 + }, + { + "epoch": 0.6457193741254293, + "ewc_loss": 0.04497262090444565, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001775094133336097, + "grad_norm": 5.378966331481934, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.860038161277771, + "num_tokens": 193757649.0, + "step": 5076 + }, + { + "epoch": 0.6458465844040199, + "ewc_loss": 0.04498264566063881, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017760966147761792, + "grad_norm": 5.3940043449401855, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8554606437683105, + "num_tokens": 193792390.0, + "step": 5077 + }, + { + "epoch": 0.6459737946826104, + "ewc_loss": 0.04498092830181122, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017759250476956367, + "grad_norm": 5.387497901916504, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8658665418624878, + "num_tokens": 193827721.0, + "step": 5078 + }, + { + "epoch": 0.6461010049612008, + "ewc_loss": 0.04500233009457588, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017780650523491204, + "grad_norm": 5.424454212188721, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8482061624526978, + "num_tokens": 193867217.0, + "step": 5079 + }, + { + "epoch": 0.6462282152397913, + "ewc_loss": 0.04496530443429947, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001774362608557567, + "grad_norm": 5.389834403991699, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8660271167755127, + "num_tokens": 193900128.0, + "step": 5080 + }, + { + "epoch": 0.6463554255183819, + "ewc_loss": 0.04500570893287659, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017784026567824185, + "grad_norm": 5.5002593994140625, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8691834807395935, + "num_tokens": 193935744.0, + "step": 5081 + }, + { + "epoch": 0.6464826357969724, + "ewc_loss": 0.04498206824064255, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017760386981535703, + "grad_norm": 5.351296424865723, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8556974530220032, + "num_tokens": 193977344.0, + "step": 5082 + }, + { + "epoch": 0.6466098460755629, + "ewc_loss": 0.04493267834186554, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017711000691633672, + "grad_norm": 5.430802822113037, + "learning_rate": 1e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.831775426864624, + "num_tokens": 194014295.0, + "step": 5083 + }, + { + "epoch": 0.6467370563541535, + "ewc_loss": 0.044999681413173676, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001777800207491964, + "grad_norm": 5.364141464233398, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8423776030540466, + "num_tokens": 194055245.0, + "step": 5084 + }, + { + "epoch": 0.6468642666327439, + "ewc_loss": 0.044975437223911285, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017753758584149182, + "grad_norm": 5.450906753540039, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8511155247688293, + "num_tokens": 194089313.0, + "step": 5085 + }, + { + "epoch": 0.6469914769113344, + "ewc_loss": 0.04500044509768486, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001777876605046913, + "grad_norm": 5.348636150360107, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8661792874336243, + "num_tokens": 194128293.0, + "step": 5086 + }, + { + "epoch": 0.6471186871899249, + "ewc_loss": 0.04494956135749817, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017727879458107054, + "grad_norm": 5.377068042755127, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8517305850982666, + "num_tokens": 194166632.0, + "step": 5087 + }, + { + "epoch": 0.6472458974685155, + "ewc_loss": 0.045176275074481964, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017832525190897286, + "grad_norm": 5.384030342102051, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8447158932685852, + "num_tokens": 194209176.0, + "step": 5088 + }, + { + "epoch": 0.647373107747106, + "ewc_loss": 0.04503773897886276, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017816056788433343, + "grad_norm": 5.424889087677002, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8481565713882446, + "num_tokens": 194245617.0, + "step": 5089 + }, + { + "epoch": 0.6475003180256965, + "ewc_loss": 0.04545081406831741, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00017862925597000867, + "grad_norm": 5.389344215393066, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8646421432495117, + "num_tokens": 194285983.0, + "step": 5090 + }, + { + "epoch": 0.6476275283042869, + "ewc_loss": 0.045176707208156586, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017832955927588046, + "grad_norm": 5.4550628662109375, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8543723821640015, + "num_tokens": 194317088.0, + "step": 5091 + }, + { + "epoch": 0.6477547385828775, + "ewc_loss": 0.04517870023846626, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017834949539974332, + "grad_norm": 12.586749076843262, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8625065684318542, + "num_tokens": 194354025.0, + "step": 5092 + }, + { + "epoch": 0.647881948861468, + "ewc_loss": 0.053361423313617706, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00026017671916633844, + "grad_norm": 6.723464012145996, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8619328737258911, + "num_tokens": 194388539.0, + "step": 5093 + }, + { + "epoch": 0.6480091591400585, + "ewc_loss": 0.04385732486844063, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00016391504323109984, + "grad_norm": 4.776307582855225, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8588983416557312, + "num_tokens": 194424518.0, + "step": 5094 + }, + { + "epoch": 0.648136369418649, + "ewc_loss": 0.047164611518383026, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00019820862507913262, + "grad_norm": 6.043126583099365, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8551343679428101, + "num_tokens": 194463310.0, + "step": 5095 + }, + { + "epoch": 0.6482635796972396, + "ewc_loss": 0.04700745642185211, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00019663704733829945, + "grad_norm": 5.497504234313965, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8570149540901184, + "num_tokens": 194499911.0, + "step": 5096 + }, + { + "epoch": 0.64839078997583, + "ewc_loss": 0.045823097229003906, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00018479349091649055, + "grad_norm": 5.6466965675354, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8491290211677551, + "num_tokens": 194536731.0, + "step": 5097 + }, + { + "epoch": 0.6485180002544205, + "ewc_loss": 0.04636542871594429, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00019021678599528968, + "grad_norm": 5.506872653961182, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8477382659912109, + "num_tokens": 194574014.0, + "step": 5098 + }, + { + "epoch": 0.648645210533011, + "ewc_loss": 0.0457829087972641, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00018439159612171352, + "grad_norm": 5.527527809143066, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8495900630950928, + "num_tokens": 194611944.0, + "step": 5099 + }, + { + "epoch": 0.6487724208116016, + "ewc_loss": 0.04591910168528557, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.0001857535244198516, + "grad_norm": 5.496778964996338, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.867572546005249, + "num_tokens": 194649734.0, + "step": 5100 + }, + { + "epoch": 0.6488996310901921, + "ewc_loss": 0.04566573351621628, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.0001832198176998645, + "grad_norm": 5.445245265960693, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8655983805656433, + "num_tokens": 194688344.0, + "step": 5101 + }, + { + "epoch": 0.6490268413687826, + "ewc_loss": 0.04564507305622101, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00018301322415936738, + "grad_norm": 5.455165386199951, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8446265459060669, + "num_tokens": 194728582.0, + "step": 5102 + }, + { + "epoch": 0.649154051647373, + "ewc_loss": 0.04574134573340416, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001827552478061989, + "grad_norm": 5.51556396484375, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.857840895652771, + "num_tokens": 194762753.0, + "step": 5103 + }, + { + "epoch": 0.6492812619259636, + "ewc_loss": 0.04565202817320824, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001818620803533122, + "grad_norm": 5.442556858062744, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8707863092422485, + "num_tokens": 194802325.0, + "step": 5104 + }, + { + "epoch": 0.6494084722045541, + "ewc_loss": 0.045582689344882965, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018116869614459574, + "grad_norm": 5.414942264556885, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8574154376983643, + "num_tokens": 194846349.0, + "step": 5105 + }, + { + "epoch": 0.6495356824831446, + "ewc_loss": 0.045535579323768616, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018069760699290782, + "grad_norm": 5.393815517425537, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8586692810058594, + "num_tokens": 194880334.0, + "step": 5106 + }, + { + "epoch": 0.6496628927617352, + "ewc_loss": 0.04548494890332222, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018019128765445203, + "grad_norm": 6.066845893859863, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8486628532409668, + "num_tokens": 194919343.0, + "step": 5107 + }, + { + "epoch": 0.6497901030403257, + "ewc_loss": 0.04575241357088089, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001828659587772563, + "grad_norm": 5.380277633666992, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8591831922531128, + "num_tokens": 194958800.0, + "step": 5108 + }, + { + "epoch": 0.6499173133189162, + "ewc_loss": 0.0452364906668663, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017770669364836067, + "grad_norm": 5.466797351837158, + "learning_rate": 1e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8321001529693604, + "num_tokens": 194997249.0, + "step": 5109 + }, + { + "epoch": 0.6500445235975066, + "ewc_loss": 0.04545419290661812, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001798837329261005, + "grad_norm": 5.409463882446289, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8600834608078003, + "num_tokens": 195035879.0, + "step": 5110 + }, + { + "epoch": 0.6501717338760972, + "ewc_loss": 0.04537951201200485, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017913691408466548, + "grad_norm": 5.393834590911865, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8528828024864197, + "num_tokens": 195075925.0, + "step": 5111 + }, + { + "epoch": 0.6502989441546877, + "ewc_loss": 0.04539782553911209, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001793200644897297, + "grad_norm": 5.431655406951904, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8598056435585022, + "num_tokens": 195110185.0, + "step": 5112 + }, + { + "epoch": 0.6504261544332782, + "ewc_loss": 0.045422933995723724, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017957115778699517, + "grad_norm": 5.356913089752197, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8493238687515259, + "num_tokens": 195154666.0, + "step": 5113 + }, + { + "epoch": 0.6505533647118688, + "ewc_loss": 0.04527066648006439, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017926916189026088, + "grad_norm": 5.465190410614014, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8685865998268127, + "num_tokens": 195188998.0, + "step": 5114 + }, + { + "epoch": 0.6506805749904593, + "ewc_loss": 0.045266129076480865, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.0001792237744666636, + "grad_norm": 5.356362819671631, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.865266740322113, + "num_tokens": 195224124.0, + "step": 5115 + }, + { + "epoch": 0.6508077852690497, + "ewc_loss": 0.04524815082550049, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017904400010593235, + "grad_norm": 5.495250225067139, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8590313196182251, + "num_tokens": 195253601.0, + "step": 5116 + }, + { + "epoch": 0.6509349955476402, + "ewc_loss": 0.04528385400772095, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.0001794010604498908, + "grad_norm": 5.3921403884887695, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8506232500076294, + "num_tokens": 195289045.0, + "step": 5117 + }, + { + "epoch": 0.6510622058262308, + "ewc_loss": 0.045220550149679184, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017876799392979592, + "grad_norm": 5.389382839202881, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8589683175086975, + "num_tokens": 195326983.0, + "step": 5118 + }, + { + "epoch": 0.6511894161048213, + "ewc_loss": 0.04527828097343445, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017934532661456615, + "grad_norm": 5.495537281036377, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8693891763687134, + "num_tokens": 195360425.0, + "step": 5119 + }, + { + "epoch": 0.6513166263834118, + "ewc_loss": 0.045241691172122955, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.0001789793896023184, + "grad_norm": 5.351245880126953, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8468003273010254, + "num_tokens": 195402648.0, + "step": 5120 + }, + { + "epoch": 0.6514438366620023, + "ewc_loss": 0.045400723814964294, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017934903735294938, + "grad_norm": 5.474792957305908, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8752033114433289, + "num_tokens": 195440945.0, + "step": 5121 + }, + { + "epoch": 0.6515710469405928, + "ewc_loss": 0.04527093842625618, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017927188309840858, + "grad_norm": 5.355285167694092, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8609325885772705, + "num_tokens": 195481872.0, + "step": 5122 + }, + { + "epoch": 0.6516982572191833, + "ewc_loss": 0.045244231820106506, + "ewc_loss_diag": 2.7298927307128906e-05, + "ewc_loss_parallel": 0.00017900481179822236, + "grad_norm": 5.375618934631348, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8634756207466125, + "num_tokens": 195525460.0, + "step": 5123 + }, + { + "epoch": 0.6518254674977738, + "ewc_loss": 0.04541947692632675, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017953655333258212, + "grad_norm": 5.7310590744018555, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8601686358451843, + "num_tokens": 195563522.0, + "step": 5124 + }, + { + "epoch": 0.6519526777763643, + "ewc_loss": 0.04543772339820862, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001797190197976306, + "grad_norm": 5.363499164581299, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8675102591514587, + "num_tokens": 195598114.0, + "step": 5125 + }, + { + "epoch": 0.6520798880549549, + "ewc_loss": 0.045218177139759064, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017752358689904213, + "grad_norm": 5.4170427322387695, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8532295227050781, + "num_tokens": 195639826.0, + "step": 5126 + }, + { + "epoch": 0.6522070983335454, + "ewc_loss": 0.04536084830760956, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017895028577186167, + "grad_norm": 5.441141128540039, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8652714490890503, + "num_tokens": 195682047.0, + "step": 5127 + }, + { + "epoch": 0.6523343086121358, + "ewc_loss": 0.045287251472473145, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017821432265918702, + "grad_norm": 5.387712001800537, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8623855113983154, + "num_tokens": 195722453.0, + "step": 5128 + }, + { + "epoch": 0.6524615188907263, + "ewc_loss": 0.045281440019607544, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017815620230976492, + "grad_norm": 5.412428855895996, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8534326553344727, + "num_tokens": 195759182.0, + "step": 5129 + }, + { + "epoch": 0.6525887291693169, + "ewc_loss": 0.04531942307949066, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001785360072972253, + "grad_norm": 5.40358304977417, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.862297534942627, + "num_tokens": 195795641.0, + "step": 5130 + }, + { + "epoch": 0.6527159394479074, + "ewc_loss": 0.045363783836364746, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017897960788104683, + "grad_norm": 5.393132209777832, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8683832883834839, + "num_tokens": 195831200.0, + "step": 5131 + }, + { + "epoch": 0.6528431497264979, + "ewc_loss": 0.045356396585702896, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017890575691126287, + "grad_norm": 5.374948024749756, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.860525906085968, + "num_tokens": 195875110.0, + "step": 5132 + }, + { + "epoch": 0.6529703600050885, + "ewc_loss": 0.04533457010984421, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017868750728666782, + "grad_norm": 5.380441188812256, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8614413738250732, + "num_tokens": 195916724.0, + "step": 5133 + }, + { + "epoch": 0.6530975702836789, + "ewc_loss": 0.045353151857852936, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017887332069221884, + "grad_norm": 5.418230056762695, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8521436452865601, + "num_tokens": 195956219.0, + "step": 5134 + }, + { + "epoch": 0.6532247805622694, + "ewc_loss": 0.04532536491751671, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017859543731901795, + "grad_norm": 5.436361312866211, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.861388087272644, + "num_tokens": 195991563.0, + "step": 5135 + }, + { + "epoch": 0.6533519908408599, + "ewc_loss": 0.04540121927857399, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017935398500412703, + "grad_norm": 5.462978363037109, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8653357028961182, + "num_tokens": 196031864.0, + "step": 5136 + }, + { + "epoch": 0.6534792011194505, + "ewc_loss": 0.04534171149134636, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017875891353469342, + "grad_norm": 5.6070685386657715, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8503053188323975, + "num_tokens": 196067208.0, + "step": 5137 + }, + { + "epoch": 0.653606411398041, + "ewc_loss": 0.045330826193094254, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017865006520878524, + "grad_norm": 5.393986701965332, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8622455596923828, + "num_tokens": 196103934.0, + "step": 5138 + }, + { + "epoch": 0.6537336216766315, + "ewc_loss": 0.04531228914856911, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017846468836069107, + "grad_norm": 5.44018030166626, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8537698984146118, + "num_tokens": 196140886.0, + "step": 5139 + }, + { + "epoch": 0.6538608319552219, + "ewc_loss": 0.04527795687317848, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001781213650247082, + "grad_norm": 5.441754341125488, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.867141842842102, + "num_tokens": 196178940.0, + "step": 5140 + }, + { + "epoch": 0.6539880422338125, + "ewc_loss": 0.045349106192588806, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017883285181596875, + "grad_norm": 5.3867716789245605, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8614851236343384, + "num_tokens": 196229461.0, + "step": 5141 + }, + { + "epoch": 0.654115252512403, + "ewc_loss": 0.0453101322054863, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017844309331849217, + "grad_norm": 5.3874616622924805, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8708820343017578, + "num_tokens": 196265793.0, + "step": 5142 + }, + { + "epoch": 0.6542424627909935, + "ewc_loss": 0.04537838324904442, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017912562179844826, + "grad_norm": 5.419346332550049, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8652776479721069, + "num_tokens": 196304919.0, + "step": 5143 + }, + { + "epoch": 0.654369673069584, + "ewc_loss": 0.045176148414611816, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017954467330127954, + "grad_norm": 5.407952785491943, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8738078474998474, + "num_tokens": 196344761.0, + "step": 5144 + }, + { + "epoch": 0.6544968833481746, + "ewc_loss": 0.04510728642344475, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017885607667267323, + "grad_norm": 5.446925163269043, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8735452890396118, + "num_tokens": 196382625.0, + "step": 5145 + }, + { + "epoch": 0.654624093626765, + "ewc_loss": 0.04516588896512985, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017944208229891956, + "grad_norm": 5.377084732055664, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.869767427444458, + "num_tokens": 196422279.0, + "step": 5146 + }, + { + "epoch": 0.6547513039053555, + "ewc_loss": 0.045176561921834946, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017954882059711963, + "grad_norm": 5.475566864013672, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.854926347732544, + "num_tokens": 196463684.0, + "step": 5147 + }, + { + "epoch": 0.654878514183946, + "ewc_loss": 0.04512768238782883, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001790600217645988, + "grad_norm": 5.347592353820801, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8626204133033752, + "num_tokens": 196506319.0, + "step": 5148 + }, + { + "epoch": 0.6550057244625366, + "ewc_loss": 0.045449480414390564, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001798365992726758, + "grad_norm": 5.460923671722412, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8526797294616699, + "num_tokens": 196546367.0, + "step": 5149 + }, + { + "epoch": 0.6551329347411271, + "ewc_loss": 0.04539519175887108, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017929371097125113, + "grad_norm": 5.4890666007995605, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8444830775260925, + "num_tokens": 196576598.0, + "step": 5150 + }, + { + "epoch": 0.6552601450197176, + "ewc_loss": 0.04515586793422699, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.00017934190691448748, + "grad_norm": 5.439350128173828, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8526287078857422, + "num_tokens": 196610633.0, + "step": 5151 + }, + { + "epoch": 0.655387355298308, + "ewc_loss": 0.0451744869351387, + "ewc_loss_diag": 2.7179718017578125e-05, + "ewc_loss_parallel": 0.0001795280841179192, + "grad_norm": 5.509439945220947, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8501681685447693, + "num_tokens": 196646183.0, + "step": 5152 + }, + { + "epoch": 0.6555145655768986, + "ewc_loss": 0.04538407549262047, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017918254889082164, + "grad_norm": 5.358654499053955, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8692052364349365, + "num_tokens": 196689454.0, + "step": 5153 + }, + { + "epoch": 0.6556417758554891, + "ewc_loss": 0.04546511173248291, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001799929013941437, + "grad_norm": 5.471818923950195, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8437204957008362, + "num_tokens": 196731820.0, + "step": 5154 + }, + { + "epoch": 0.6557689861340796, + "ewc_loss": 0.045418597757816315, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017952779307961464, + "grad_norm": 5.470498085021973, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.849433422088623, + "num_tokens": 196768542.0, + "step": 5155 + }, + { + "epoch": 0.6558961964126702, + "ewc_loss": 0.04539338871836662, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017927568114828318, + "grad_norm": 5.401920795440674, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8434093594551086, + "num_tokens": 196810300.0, + "step": 5156 + }, + { + "epoch": 0.6560234066912607, + "ewc_loss": 0.04540615528821945, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017940333054866642, + "grad_norm": 5.45159912109375, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.843036413192749, + "num_tokens": 196847271.0, + "step": 5157 + }, + { + "epoch": 0.6561506169698512, + "ewc_loss": 0.045442380011081696, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017976561503019184, + "grad_norm": 5.425758361816406, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.873914897441864, + "num_tokens": 196880439.0, + "step": 5158 + }, + { + "epoch": 0.6562778272484416, + "ewc_loss": 0.04548107087612152, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018015252135228366, + "grad_norm": 5.533701419830322, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8522585034370422, + "num_tokens": 196914911.0, + "step": 5159 + }, + { + "epoch": 0.6564050375270322, + "ewc_loss": 0.045454010367393494, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001798818848328665, + "grad_norm": 5.4583587646484375, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.858241617679596, + "num_tokens": 196949419.0, + "step": 5160 + }, + { + "epoch": 0.6565322478056227, + "ewc_loss": 0.04541344195604324, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017947620654013008, + "grad_norm": 5.443924427032471, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8518201112747192, + "num_tokens": 196989401.0, + "step": 5161 + }, + { + "epoch": 0.6566594580842132, + "ewc_loss": 0.045488566160202026, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018022747826762497, + "grad_norm": 5.44957971572876, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8514626026153564, + "num_tokens": 197028377.0, + "step": 5162 + }, + { + "epoch": 0.6567866683628037, + "ewc_loss": 0.045457422733306885, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.000179916009074077, + "grad_norm": 5.494993686676025, + "learning_rate": 1e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.838279128074646, + "num_tokens": 197068334.0, + "step": 5163 + }, + { + "epoch": 0.6569138786413943, + "ewc_loss": 0.04542867839336395, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001796285796444863, + "grad_norm": 5.445291519165039, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8611438274383545, + "num_tokens": 197105978.0, + "step": 5164 + }, + { + "epoch": 0.6570410889199847, + "ewc_loss": 0.04541877657175064, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001795295684132725, + "grad_norm": 5.423164367675781, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8409276008605957, + "num_tokens": 197148641.0, + "step": 5165 + }, + { + "epoch": 0.6571682991985752, + "ewc_loss": 0.045399244874715805, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017933425260707736, + "grad_norm": 5.495828628540039, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8391450643539429, + "num_tokens": 197188820.0, + "step": 5166 + }, + { + "epoch": 0.6572955094771658, + "ewc_loss": 0.045445457100868225, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017979634867515415, + "grad_norm": 5.4177374839782715, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8460968732833862, + "num_tokens": 197227109.0, + "step": 5167 + }, + { + "epoch": 0.6574227197557563, + "ewc_loss": 0.04541882872581482, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001795300777303055, + "grad_norm": 5.443793296813965, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8645145893096924, + "num_tokens": 197264083.0, + "step": 5168 + }, + { + "epoch": 0.6575499300343468, + "ewc_loss": 0.04542196914553642, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017956149531528354, + "grad_norm": 5.459176540374756, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8535767793655396, + "num_tokens": 197301349.0, + "step": 5169 + }, + { + "epoch": 0.6576771403129373, + "ewc_loss": 0.04543347656726837, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017967657186090946, + "grad_norm": 5.372735023498535, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.84800124168396, + "num_tokens": 197348176.0, + "step": 5170 + }, + { + "epoch": 0.6578043505915278, + "ewc_loss": 0.04545610398054123, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017990285414271057, + "grad_norm": 5.513171672821045, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8507774472236633, + "num_tokens": 197380490.0, + "step": 5171 + }, + { + "epoch": 0.6579315608701183, + "ewc_loss": 0.04548975080251694, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018023932352662086, + "grad_norm": 5.451839447021484, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8453216552734375, + "num_tokens": 197423075.0, + "step": 5172 + }, + { + "epoch": 0.6580587711487088, + "ewc_loss": 0.04544569551944733, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017979876429308206, + "grad_norm": 5.449045658111572, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8615214228630066, + "num_tokens": 197461730.0, + "step": 5173 + }, + { + "epoch": 0.6581859814272993, + "ewc_loss": 0.04548586905002594, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018020049901679158, + "grad_norm": 5.442263603210449, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8474482297897339, + "num_tokens": 197505801.0, + "step": 5174 + }, + { + "epoch": 0.6583131917058899, + "ewc_loss": 0.04549282789230347, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018027005717158318, + "grad_norm": 5.48836088180542, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8744657039642334, + "num_tokens": 197534643.0, + "step": 5175 + }, + { + "epoch": 0.6584404019844804, + "ewc_loss": 0.045458756387233734, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00017992936773225665, + "grad_norm": 5.375047206878662, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8631577491760254, + "num_tokens": 197576490.0, + "step": 5176 + }, + { + "epoch": 0.6585676122630708, + "ewc_loss": 0.04548010975122452, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018014288798440248, + "grad_norm": 5.490079879760742, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8714420795440674, + "num_tokens": 197612745.0, + "step": 5177 + }, + { + "epoch": 0.6586948225416613, + "ewc_loss": 0.04552600160241127, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018060181173495948, + "grad_norm": 5.4238128662109375, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8545731902122498, + "num_tokens": 197653756.0, + "step": 5178 + }, + { + "epoch": 0.6588220328202519, + "ewc_loss": 0.0455254465341568, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018059623835142702, + "grad_norm": 5.443585395812988, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8828590512275696, + "num_tokens": 197694417.0, + "step": 5179 + }, + { + "epoch": 0.6589492430988424, + "ewc_loss": 0.045519474893808365, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018053654639516026, + "grad_norm": 5.433859348297119, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8533498048782349, + "num_tokens": 197732738.0, + "step": 5180 + }, + { + "epoch": 0.6590764533774329, + "ewc_loss": 0.045575596392154694, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001810977846616879, + "grad_norm": 5.539781093597412, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8507131934165955, + "num_tokens": 197759408.0, + "step": 5181 + }, + { + "epoch": 0.6592036636560235, + "ewc_loss": 0.0455801859498024, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001811436377465725, + "grad_norm": 5.432713031768799, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8613021373748779, + "num_tokens": 197794334.0, + "step": 5182 + }, + { + "epoch": 0.6593308739346139, + "ewc_loss": 0.04563138633966446, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018043497402686626, + "grad_norm": 5.410772800445557, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8684455752372742, + "num_tokens": 197832497.0, + "step": 5183 + }, + { + "epoch": 0.6594580842132044, + "ewc_loss": 0.04567570239305496, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018087813805323094, + "grad_norm": 5.446896076202393, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8500003814697266, + "num_tokens": 197869021.0, + "step": 5184 + }, + { + "epoch": 0.6595852944917949, + "ewc_loss": 0.04570737108588219, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018119480228051543, + "grad_norm": 5.436467170715332, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8443439602851868, + "num_tokens": 197906365.0, + "step": 5185 + }, + { + "epoch": 0.6597125047703855, + "ewc_loss": 0.0457150861620903, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018127197108697146, + "grad_norm": 5.402475357055664, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8610315322875977, + "num_tokens": 197950725.0, + "step": 5186 + }, + { + "epoch": 0.659839715048976, + "ewc_loss": 0.0457022488117218, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001811435940908268, + "grad_norm": 5.419923305511475, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8657732605934143, + "num_tokens": 197991178.0, + "step": 5187 + }, + { + "epoch": 0.6599669253275665, + "ewc_loss": 0.045716702938079834, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018128812371287495, + "grad_norm": 5.419037818908691, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8488734364509583, + "num_tokens": 198030176.0, + "step": 5188 + }, + { + "epoch": 0.6600941356061569, + "ewc_loss": 0.04569489508867264, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018107003415934741, + "grad_norm": 5.440106391906738, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8528879880905151, + "num_tokens": 198067169.0, + "step": 5189 + }, + { + "epoch": 0.6602213458847475, + "ewc_loss": 0.0458463653922081, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018136404105462134, + "grad_norm": 5.424143314361572, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8481186628341675, + "num_tokens": 198104994.0, + "step": 5190 + }, + { + "epoch": 0.660348556163338, + "ewc_loss": 0.045672331005334854, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001808444067137316, + "grad_norm": 5.464722633361816, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8621432185173035, + "num_tokens": 198142791.0, + "step": 5191 + }, + { + "epoch": 0.6604757664419285, + "ewc_loss": 0.04567936062812805, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018091467791236937, + "grad_norm": 5.475652694702148, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8549354672431946, + "num_tokens": 198179300.0, + "step": 5192 + }, + { + "epoch": 0.660602976720519, + "ewc_loss": 0.04567912966012955, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001809123787097633, + "grad_norm": 5.425126075744629, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8722646236419678, + "num_tokens": 198220230.0, + "step": 5193 + }, + { + "epoch": 0.6607301869991096, + "ewc_loss": 0.04575725272297859, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.0001804729108698666, + "grad_norm": 5.53314208984375, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8454129099845886, + "num_tokens": 198251067.0, + "step": 5194 + }, + { + "epoch": 0.6608573972777, + "ewc_loss": 0.04583552107214928, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018125560018233955, + "grad_norm": 5.479875564575195, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8664196729660034, + "num_tokens": 198282130.0, + "step": 5195 + }, + { + "epoch": 0.6609846075562905, + "ewc_loss": 0.04577144235372543, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018061483569908887, + "grad_norm": 5.417721748352051, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8680230379104614, + "num_tokens": 198325609.0, + "step": 5196 + }, + { + "epoch": 0.661111817834881, + "ewc_loss": 0.045789167284965515, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018079206347465515, + "grad_norm": 5.447951793670654, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8609626293182373, + "num_tokens": 198365116.0, + "step": 5197 + }, + { + "epoch": 0.6612390281134716, + "ewc_loss": 0.04579685255885124, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018086891213897616, + "grad_norm": 5.423852443695068, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8597855567932129, + "num_tokens": 198403467.0, + "step": 5198 + }, + { + "epoch": 0.6613662383920621, + "ewc_loss": 0.04569455236196518, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018106662901118398, + "grad_norm": 5.41982889175415, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8649464249610901, + "num_tokens": 198440676.0, + "step": 5199 + }, + { + "epoch": 0.6614934486706526, + "ewc_loss": 0.045825839042663574, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018115875718649477, + "grad_norm": 5.4629621505737305, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8599755764007568, + "num_tokens": 198479082.0, + "step": 5200 + }, + { + "epoch": 0.661620658949243, + "ewc_loss": 0.04581063985824585, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018100676243193448, + "grad_norm": 5.389442443847656, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.854447603225708, + "num_tokens": 198521741.0, + "step": 5201 + }, + { + "epoch": 0.6617478692278336, + "ewc_loss": 0.04580282047390938, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.0001809285895433277, + "grad_norm": 5.437571048736572, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8515041470527649, + "num_tokens": 198562177.0, + "step": 5202 + }, + { + "epoch": 0.6618750795064241, + "ewc_loss": 0.04571198672056198, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018124094640370458, + "grad_norm": 5.4478912353515625, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8585351705551147, + "num_tokens": 198600726.0, + "step": 5203 + }, + { + "epoch": 0.6620022897850146, + "ewc_loss": 0.045710138976573944, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018122246547136456, + "grad_norm": 5.510158538818359, + "learning_rate": 1e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.8302015066146851, + "num_tokens": 198638156.0, + "step": 5204 + }, + { + "epoch": 0.6621295000636052, + "ewc_loss": 0.04565129429101944, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001806340296752751, + "grad_norm": 5.5325517654418945, + "learning_rate": 1e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8405477404594421, + "num_tokens": 198680312.0, + "step": 5205 + }, + { + "epoch": 0.6622567103421957, + "ewc_loss": 0.0456184446811676, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018030553474090993, + "grad_norm": 5.357337474822998, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8759632110595703, + "num_tokens": 198723886.0, + "step": 5206 + }, + { + "epoch": 0.6623839206207861, + "ewc_loss": 0.04575662314891815, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018046660989057273, + "grad_norm": 5.480297565460205, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8579574823379517, + "num_tokens": 198765331.0, + "step": 5207 + }, + { + "epoch": 0.6625111308993766, + "ewc_loss": 0.04560914635658264, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018021254800260067, + "grad_norm": 5.419557094573975, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.861708402633667, + "num_tokens": 198802421.0, + "step": 5208 + }, + { + "epoch": 0.6626383411779672, + "ewc_loss": 0.045671891421079636, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018084001203533262, + "grad_norm": 5.415698528289795, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8683422803878784, + "num_tokens": 198843335.0, + "step": 5209 + }, + { + "epoch": 0.6627655514565577, + "ewc_loss": 0.0456349141895771, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001804702333174646, + "grad_norm": 5.447547435760498, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8653342723846436, + "num_tokens": 198880925.0, + "step": 5210 + }, + { + "epoch": 0.6628927617351482, + "ewc_loss": 0.04569009318947792, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018102202739100903, + "grad_norm": 5.497634410858154, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8573691248893738, + "num_tokens": 198915448.0, + "step": 5211 + }, + { + "epoch": 0.6630199720137387, + "ewc_loss": 0.04570437967777252, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018116491264663637, + "grad_norm": 5.424262046813965, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8687237501144409, + "num_tokens": 198952254.0, + "step": 5212 + }, + { + "epoch": 0.6631471822923293, + "ewc_loss": 0.04556255042552948, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018096729763783514, + "grad_norm": 5.470888614654541, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8628454208374023, + "num_tokens": 198990841.0, + "step": 5213 + }, + { + "epoch": 0.6632743925709197, + "ewc_loss": 0.0455581396818161, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001809232053346932, + "grad_norm": 5.449977397918701, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8609769940376282, + "num_tokens": 199029383.0, + "step": 5214 + }, + { + "epoch": 0.6634016028495102, + "ewc_loss": 0.04554150253534317, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018075680418405682, + "grad_norm": 5.484439373016357, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8583543300628662, + "num_tokens": 199066731.0, + "step": 5215 + }, + { + "epoch": 0.6635288131281007, + "ewc_loss": 0.04558083042502403, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001811500987969339, + "grad_norm": 5.592194557189941, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.870354175567627, + "num_tokens": 199108752.0, + "step": 5216 + }, + { + "epoch": 0.6636560234066913, + "ewc_loss": 0.045659735798835754, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001807184744393453, + "grad_norm": 5.400177478790283, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8630678653717041, + "num_tokens": 199146438.0, + "step": 5217 + }, + { + "epoch": 0.6637832336852818, + "ewc_loss": 0.04565770924091339, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018069817451760173, + "grad_norm": 5.5594563484191895, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8600591421127319, + "num_tokens": 199186367.0, + "step": 5218 + }, + { + "epoch": 0.6639104439638723, + "ewc_loss": 0.04565328732132912, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018065396579913795, + "grad_norm": 5.394326686859131, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8714881539344788, + "num_tokens": 199225785.0, + "step": 5219 + }, + { + "epoch": 0.6640376542424627, + "ewc_loss": 0.045664120465517044, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001807622902560979, + "grad_norm": 5.4929375648498535, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8620575666427612, + "num_tokens": 199262574.0, + "step": 5220 + }, + { + "epoch": 0.6641648645210533, + "ewc_loss": 0.045671332627534866, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018083442409988493, + "grad_norm": 5.425841331481934, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8632981777191162, + "num_tokens": 199299093.0, + "step": 5221 + }, + { + "epoch": 0.6642920747996438, + "ewc_loss": 0.04553912580013275, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001807330409064889, + "grad_norm": 5.447487831115723, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8643231987953186, + "num_tokens": 199337446.0, + "step": 5222 + }, + { + "epoch": 0.6644192850782343, + "ewc_loss": 0.045716263353824615, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018128371448256075, + "grad_norm": 5.463757514953613, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8607057332992554, + "num_tokens": 199377321.0, + "step": 5223 + }, + { + "epoch": 0.6645464953568249, + "ewc_loss": 0.0455867163836956, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018120896129403263, + "grad_norm": 5.487314701080322, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.869235634803772, + "num_tokens": 199413057.0, + "step": 5224 + }, + { + "epoch": 0.6646737056354154, + "ewc_loss": 0.04573536664247513, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018147475202567875, + "grad_norm": 5.455984115600586, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8651700019836426, + "num_tokens": 199452730.0, + "step": 5225 + }, + { + "epoch": 0.6648009159140058, + "ewc_loss": 0.04568100720643997, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001821518671931699, + "grad_norm": 5.476676940917969, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8693803548812866, + "num_tokens": 199487860.0, + "step": 5226 + }, + { + "epoch": 0.6649281261925963, + "ewc_loss": 0.04573611542582512, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018148224626202136, + "grad_norm": 5.419984817504883, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8666879534721375, + "num_tokens": 199532790.0, + "step": 5227 + }, + { + "epoch": 0.6650553364711869, + "ewc_loss": 0.04577374458312988, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018185851513408124, + "grad_norm": 5.546677589416504, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8785558938980103, + "num_tokens": 199561072.0, + "step": 5228 + }, + { + "epoch": 0.6651825467497774, + "ewc_loss": 0.045613206923007965, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018147389346268028, + "grad_norm": 5.480773448944092, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8652681112289429, + "num_tokens": 199595820.0, + "step": 5229 + }, + { + "epoch": 0.6653097570283679, + "ewc_loss": 0.04556906223297119, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001810324174584821, + "grad_norm": 5.4319047927856445, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.847649097442627, + "num_tokens": 199640101.0, + "step": 5230 + }, + { + "epoch": 0.6654369673069584, + "ewc_loss": 0.045630909502506256, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018165090295951813, + "grad_norm": 5.464846134185791, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8555880784988403, + "num_tokens": 199680287.0, + "step": 5231 + }, + { + "epoch": 0.6655641775855489, + "ewc_loss": 0.04561479017138481, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001814896968426183, + "grad_norm": 5.446226119995117, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8593230843544006, + "num_tokens": 199721958.0, + "step": 5232 + }, + { + "epoch": 0.6656913878641394, + "ewc_loss": 0.045759838074445724, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001817194715840742, + "grad_norm": 5.49594783782959, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8647990822792053, + "num_tokens": 199756021.0, + "step": 5233 + }, + { + "epoch": 0.6658185981427299, + "ewc_loss": 0.04563544690608978, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018169627583120018, + "grad_norm": 5.532460689544678, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8639464378356934, + "num_tokens": 199793586.0, + "step": 5234 + }, + { + "epoch": 0.6659458084213205, + "ewc_loss": 0.045592300593853, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018126478244084865, + "grad_norm": 5.431495189666748, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8460632562637329, + "num_tokens": 199835728.0, + "step": 5235 + }, + { + "epoch": 0.666073018699911, + "ewc_loss": 0.045575372874736786, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018109552911482751, + "grad_norm": 5.467376232147217, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8682728409767151, + "num_tokens": 199873348.0, + "step": 5236 + }, + { + "epoch": 0.6662002289785015, + "ewc_loss": 0.04565934091806412, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018193520372733474, + "grad_norm": 5.49424409866333, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8603130578994751, + "num_tokens": 199908223.0, + "step": 5237 + }, + { + "epoch": 0.6663274392570919, + "ewc_loss": 0.045659348368644714, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018193530559074134, + "grad_norm": 5.508474349975586, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8558315634727478, + "num_tokens": 199947017.0, + "step": 5238 + }, + { + "epoch": 0.6664546495356825, + "ewc_loss": 0.04557349905371666, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018107678624801338, + "grad_norm": 5.438987731933594, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8678090572357178, + "num_tokens": 199984806.0, + "step": 5239 + }, + { + "epoch": 0.666581859814273, + "ewc_loss": 0.04562851041555405, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018162692140322179, + "grad_norm": 5.545626640319824, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8525253534317017, + "num_tokens": 200022600.0, + "step": 5240 + }, + { + "epoch": 0.6667090700928635, + "ewc_loss": 0.04560635983943939, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018140541214961559, + "grad_norm": 5.465429782867432, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8688659071922302, + "num_tokens": 200058407.0, + "step": 5241 + }, + { + "epoch": 0.666836280371454, + "ewc_loss": 0.045556701719760895, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001809087989386171, + "grad_norm": 5.4482269287109375, + "learning_rate": 1e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8324726223945618, + "num_tokens": 200101871.0, + "step": 5242 + }, + { + "epoch": 0.6669634906500446, + "ewc_loss": 0.04576710984110832, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018179218750447035, + "grad_norm": 5.537775993347168, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8595784902572632, + "num_tokens": 200138501.0, + "step": 5243 + }, + { + "epoch": 0.667090700928635, + "ewc_loss": 0.04557085409760475, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.0001810503308661282, + "grad_norm": 5.466838359832764, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8578518033027649, + "num_tokens": 200176639.0, + "step": 5244 + }, + { + "epoch": 0.6672179112072255, + "ewc_loss": 0.04575177654623985, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018163886852562428, + "grad_norm": 5.495095729827881, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8543505072593689, + "num_tokens": 200211873.0, + "step": 5245 + }, + { + "epoch": 0.667345121485816, + "ewc_loss": 0.04561154544353485, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018145724607165903, + "grad_norm": 5.472534656524658, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8551839590072632, + "num_tokens": 200246596.0, + "step": 5246 + }, + { + "epoch": 0.6674723317644066, + "ewc_loss": 0.04563963785767555, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018173817079514265, + "grad_norm": 5.465329647064209, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8743667006492615, + "num_tokens": 200282761.0, + "step": 5247 + }, + { + "epoch": 0.6675995420429971, + "ewc_loss": 0.04582510516047478, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018237213953398168, + "grad_norm": 5.526041507720947, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8442375659942627, + "num_tokens": 200321821.0, + "step": 5248 + }, + { + "epoch": 0.6677267523215876, + "ewc_loss": 0.045796506106853485, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018208617984782904, + "grad_norm": 5.441497325897217, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8429532051086426, + "num_tokens": 200369066.0, + "step": 5249 + }, + { + "epoch": 0.667853962600178, + "ewc_loss": 0.045777641236782074, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018189751426689327, + "grad_norm": 5.489675045013428, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8528761863708496, + "num_tokens": 200408067.0, + "step": 5250 + }, + { + "epoch": 0.6679811728787686, + "ewc_loss": 0.045696817338466644, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018230994464829564, + "grad_norm": 5.45928430557251, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8769420385360718, + "num_tokens": 200446693.0, + "step": 5251 + }, + { + "epoch": 0.6681083831573591, + "ewc_loss": 0.045791711658239365, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018203821673523635, + "grad_norm": 5.483681678771973, + "learning_rate": 1e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8406636714935303, + "num_tokens": 200485340.0, + "step": 5252 + }, + { + "epoch": 0.6682355934359496, + "ewc_loss": 0.04572809487581253, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018262272351421416, + "grad_norm": 5.498873710632324, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8553325533866882, + "num_tokens": 200521978.0, + "step": 5253 + }, + { + "epoch": 0.6683628037145402, + "ewc_loss": 0.04582227021455765, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018234380695503205, + "grad_norm": 5.480568885803223, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8597866296768188, + "num_tokens": 200560604.0, + "step": 5254 + }, + { + "epoch": 0.6684900139931307, + "ewc_loss": 0.04583727940917015, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001824938808567822, + "grad_norm": 5.433426380157471, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8559939861297607, + "num_tokens": 200601718.0, + "step": 5255 + }, + { + "epoch": 0.6686172242717211, + "ewc_loss": 0.045869380235672, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018281488155480474, + "grad_norm": 5.463799476623535, + "learning_rate": 1e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8355236053466797, + "num_tokens": 200646186.0, + "step": 5256 + }, + { + "epoch": 0.6687444345503116, + "ewc_loss": 0.045815128833055496, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018227238615509123, + "grad_norm": 5.511177062988281, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8613181710243225, + "num_tokens": 200684849.0, + "step": 5257 + }, + { + "epoch": 0.6688716448289022, + "ewc_loss": 0.045853935182094574, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018266044207848608, + "grad_norm": 5.446027755737305, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8651787638664246, + "num_tokens": 200719542.0, + "step": 5258 + }, + { + "epoch": 0.6689988551074927, + "ewc_loss": 0.04578061401844025, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018192724382970482, + "grad_norm": 5.478088855743408, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8617223501205444, + "num_tokens": 200756212.0, + "step": 5259 + }, + { + "epoch": 0.6691260653860832, + "ewc_loss": 0.04583774507045746, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001824985520215705, + "grad_norm": 5.508089542388916, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8625948429107666, + "num_tokens": 200791184.0, + "step": 5260 + }, + { + "epoch": 0.6692532756646737, + "ewc_loss": 0.04580436646938324, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018216473108623177, + "grad_norm": 5.501043796539307, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8424800634384155, + "num_tokens": 200823966.0, + "step": 5261 + }, + { + "epoch": 0.6693804859432643, + "ewc_loss": 0.04584992676973343, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018262036610394716, + "grad_norm": 5.494767189025879, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8634309768676758, + "num_tokens": 200862718.0, + "step": 5262 + }, + { + "epoch": 0.6695076962218547, + "ewc_loss": 0.04590649530291557, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018196534074377269, + "grad_norm": 5.430717945098877, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8539919853210449, + "num_tokens": 200901755.0, + "step": 5263 + }, + { + "epoch": 0.6696349065004452, + "ewc_loss": 0.04597732424736023, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018267365521751344, + "grad_norm": 5.512216567993164, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8711796998977661, + "num_tokens": 200935788.0, + "step": 5264 + }, + { + "epoch": 0.6697621167790357, + "ewc_loss": 0.045815855264663696, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001822796621127054, + "grad_norm": 5.425137519836426, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8732749223709106, + "num_tokens": 200975159.0, + "step": 5265 + }, + { + "epoch": 0.6698893270576263, + "ewc_loss": 0.04581001028418541, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018222119251731783, + "grad_norm": 5.478475570678711, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8711035847663879, + "num_tokens": 201010813.0, + "step": 5266 + }, + { + "epoch": 0.6700165373362168, + "ewc_loss": 0.04582519084215164, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018237299809698015, + "grad_norm": 5.482996940612793, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.858243465423584, + "num_tokens": 201051937.0, + "step": 5267 + }, + { + "epoch": 0.6701437476148073, + "ewc_loss": 0.04582733288407326, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001823944185161963, + "grad_norm": 5.477159023284912, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8654383420944214, + "num_tokens": 201090926.0, + "step": 5268 + }, + { + "epoch": 0.6702709578933977, + "ewc_loss": 0.04578234255313873, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018194453150499612, + "grad_norm": 5.5268354415893555, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8399041295051575, + "num_tokens": 201126086.0, + "step": 5269 + }, + { + "epoch": 0.6703981681719883, + "ewc_loss": 0.04581734910607338, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018229459237772971, + "grad_norm": 5.449706077575684, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8487874269485474, + "num_tokens": 201165933.0, + "step": 5270 + }, + { + "epoch": 0.6705253784505788, + "ewc_loss": 0.04577655345201492, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018188664398621768, + "grad_norm": 5.46390962600708, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8632378578186035, + "num_tokens": 201198794.0, + "step": 5271 + }, + { + "epoch": 0.6706525887291693, + "ewc_loss": 0.04580835998058319, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018220471974927932, + "grad_norm": 5.530913352966309, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8503488898277283, + "num_tokens": 201234081.0, + "step": 5272 + }, + { + "epoch": 0.6707797990077599, + "ewc_loss": 0.045800067484378815, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001821217592805624, + "grad_norm": 5.457287788391113, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8561100959777832, + "num_tokens": 201271796.0, + "step": 5273 + }, + { + "epoch": 0.6709070092863504, + "ewc_loss": 0.04582240432500839, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018234514573123306, + "grad_norm": 5.494231224060059, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8405951261520386, + "num_tokens": 201309526.0, + "step": 5274 + }, + { + "epoch": 0.6710342195649408, + "ewc_loss": 0.04579164832830429, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001820375764509663, + "grad_norm": 5.49162483215332, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8661128878593445, + "num_tokens": 201344015.0, + "step": 5275 + }, + { + "epoch": 0.6711614298435313, + "ewc_loss": 0.04590007662773132, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018190113769378513, + "grad_norm": 5.431628227233887, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.859825611114502, + "num_tokens": 201385201.0, + "step": 5276 + }, + { + "epoch": 0.6712886401221219, + "ewc_loss": 0.04577868431806564, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018190793343819678, + "grad_norm": 5.4558424949646, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8645563721656799, + "num_tokens": 201422370.0, + "step": 5277 + }, + { + "epoch": 0.6714158504007124, + "ewc_loss": 0.045921411365270615, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018211451242677867, + "grad_norm": 5.434453964233398, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8439714908599854, + "num_tokens": 201467085.0, + "step": 5278 + }, + { + "epoch": 0.6715430606793029, + "ewc_loss": 0.045962296426296234, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018252334848511964, + "grad_norm": 5.552478313446045, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.872860312461853, + "num_tokens": 201503432.0, + "step": 5279 + }, + { + "epoch": 0.6716702709578934, + "ewc_loss": 0.04592297971248627, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018213018483947963, + "grad_norm": 5.454549312591553, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8665120601654053, + "num_tokens": 201541358.0, + "step": 5280 + }, + { + "epoch": 0.6717974812364839, + "ewc_loss": 0.04581543803215027, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001822754566092044, + "grad_norm": 5.507612705230713, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.854243814945221, + "num_tokens": 201580235.0, + "step": 5281 + }, + { + "epoch": 0.6719246915150744, + "ewc_loss": 0.04581797495484352, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001823008497012779, + "grad_norm": 5.408751010894775, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8597167730331421, + "num_tokens": 201627703.0, + "step": 5282 + }, + { + "epoch": 0.6720519017936649, + "ewc_loss": 0.04583953320980072, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001825164072215557, + "grad_norm": 5.525378227233887, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8616726398468018, + "num_tokens": 201667182.0, + "step": 5283 + }, + { + "epoch": 0.6721791120722554, + "ewc_loss": 0.04584871977567673, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001826082734623924, + "grad_norm": 5.479668617248535, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8607555627822876, + "num_tokens": 201699876.0, + "step": 5284 + }, + { + "epoch": 0.672306322350846, + "ewc_loss": 0.04577801376581192, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018190123955719173, + "grad_norm": 5.544759750366211, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8484988212585449, + "num_tokens": 201732770.0, + "step": 5285 + }, + { + "epoch": 0.6724335326294365, + "ewc_loss": 0.045724086463451385, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018258264753967524, + "grad_norm": 5.489214897155762, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8613876104354858, + "num_tokens": 201771388.0, + "step": 5286 + }, + { + "epoch": 0.6725607429080269, + "ewc_loss": 0.04575475677847862, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018166867084801197, + "grad_norm": 5.4731831550598145, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8546183705329895, + "num_tokens": 201809283.0, + "step": 5287 + }, + { + "epoch": 0.6726879531866174, + "ewc_loss": 0.04579947143793106, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018211582209914923, + "grad_norm": 5.465071201324463, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8652637004852295, + "num_tokens": 201854950.0, + "step": 5288 + }, + { + "epoch": 0.672815163465208, + "ewc_loss": 0.04575178027153015, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001816388830775395, + "grad_norm": 5.494053840637207, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8636969327926636, + "num_tokens": 201895638.0, + "step": 5289 + }, + { + "epoch": 0.6729423737437985, + "ewc_loss": 0.045789651572704315, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018201758211944252, + "grad_norm": 5.529083251953125, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8456450700759888, + "num_tokens": 201928243.0, + "step": 5290 + }, + { + "epoch": 0.673069584022389, + "ewc_loss": 0.04575579985976219, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018167909001931548, + "grad_norm": 5.428976058959961, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8591879606246948, + "num_tokens": 201964687.0, + "step": 5291 + }, + { + "epoch": 0.6731967943009796, + "ewc_loss": 0.04580305889248848, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018215167801827192, + "grad_norm": 5.500486850738525, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8727271556854248, + "num_tokens": 201997166.0, + "step": 5292 + }, + { + "epoch": 0.67332400457957, + "ewc_loss": 0.04582566022872925, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018237766926176846, + "grad_norm": 5.478235244750977, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8615775108337402, + "num_tokens": 202030771.0, + "step": 5293 + }, + { + "epoch": 0.6734512148581605, + "ewc_loss": 0.04568440467119217, + "ewc_loss_diag": 2.7418136596679688e-05, + "ewc_loss_parallel": 0.00018218584591522813, + "grad_norm": 5.431529521942139, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8517296314239502, + "num_tokens": 202077280.0, + "step": 5294 + }, + { + "epoch": 0.673578425136751, + "ewc_loss": 0.04575002193450928, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001816212898120284, + "grad_norm": 5.517869472503662, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8514547944068909, + "num_tokens": 202115273.0, + "step": 5295 + }, + { + "epoch": 0.6737056354153416, + "ewc_loss": 0.0458187460899353, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001823085331125185, + "grad_norm": 5.470950603485107, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8698883056640625, + "num_tokens": 202145347.0, + "step": 5296 + }, + { + "epoch": 0.6738328456939321, + "ewc_loss": 0.04583106189966202, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018243171507492661, + "grad_norm": 5.437906742095947, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8634978532791138, + "num_tokens": 202184621.0, + "step": 5297 + }, + { + "epoch": 0.6739600559725226, + "ewc_loss": 0.045831866562366486, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018243973318021744, + "grad_norm": 5.446914196014404, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8618242144584656, + "num_tokens": 202230099.0, + "step": 5298 + }, + { + "epoch": 0.674087266251113, + "ewc_loss": 0.04582749679684639, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018239606288261712, + "grad_norm": 5.474869728088379, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8519173264503479, + "num_tokens": 202267872.0, + "step": 5299 + }, + { + "epoch": 0.6742144765297036, + "ewc_loss": 0.04588877782225609, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001830088731367141, + "grad_norm": 5.460522174835205, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8568138480186462, + "num_tokens": 202305517.0, + "step": 5300 + }, + { + "epoch": 0.6743416868082941, + "ewc_loss": 0.045860640704631805, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018272751185577363, + "grad_norm": 5.509335041046143, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8452253341674805, + "num_tokens": 202340671.0, + "step": 5301 + }, + { + "epoch": 0.6744688970868846, + "ewc_loss": 0.0458499938249588, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001826210063882172, + "grad_norm": 5.42834997177124, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8676289319992065, + "num_tokens": 202379552.0, + "step": 5302 + }, + { + "epoch": 0.6745961073654752, + "ewc_loss": 0.045912206172943115, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001832431589718908, + "grad_norm": 5.472841739654541, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.853874146938324, + "num_tokens": 202421404.0, + "step": 5303 + }, + { + "epoch": 0.6747233176440657, + "ewc_loss": 0.04584335535764694, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001825546205509454, + "grad_norm": 5.439961910247803, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8450475335121155, + "num_tokens": 202463765.0, + "step": 5304 + }, + { + "epoch": 0.6748505279226561, + "ewc_loss": 0.04594508558511734, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001835719303926453, + "grad_norm": 5.465064525604248, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8534829020500183, + "num_tokens": 202505872.0, + "step": 5305 + }, + { + "epoch": 0.6749777382012466, + "ewc_loss": 0.04590203985571861, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001831414847401902, + "grad_norm": 5.471458435058594, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8579274415969849, + "num_tokens": 202544085.0, + "step": 5306 + }, + { + "epoch": 0.6751049484798372, + "ewc_loss": 0.04585389792919159, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018266006372869015, + "grad_norm": 5.448258399963379, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8555750846862793, + "num_tokens": 202588237.0, + "step": 5307 + }, + { + "epoch": 0.6752321587584277, + "ewc_loss": 0.046095408499240875, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.0001838544849306345, + "grad_norm": 5.496095657348633, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.852728009223938, + "num_tokens": 202628462.0, + "step": 5308 + }, + { + "epoch": 0.6753593690370182, + "ewc_loss": 0.04589029401540756, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018302403623238206, + "grad_norm": 5.471665859222412, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8613197207450867, + "num_tokens": 202668149.0, + "step": 5309 + }, + { + "epoch": 0.6754865793156087, + "ewc_loss": 0.04602634906768799, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018316386558581144, + "grad_norm": 5.459490776062012, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.874106764793396, + "num_tokens": 202708651.0, + "step": 5310 + }, + { + "epoch": 0.6756137895941993, + "ewc_loss": 0.04591989517211914, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018332006584387273, + "grad_norm": 5.526559352874756, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8641688823699951, + "num_tokens": 202738613.0, + "step": 5311 + }, + { + "epoch": 0.6757409998727897, + "ewc_loss": 0.04588562250137329, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018297729548066854, + "grad_norm": 5.4966888427734375, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8550934791564941, + "num_tokens": 202779279.0, + "step": 5312 + }, + { + "epoch": 0.6758682101513802, + "ewc_loss": 0.04591410234570503, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018326212011743337, + "grad_norm": 5.587893962860107, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8588649034500122, + "num_tokens": 202812095.0, + "step": 5313 + }, + { + "epoch": 0.6759954204299707, + "ewc_loss": 0.045937515795230865, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018349623132962734, + "grad_norm": 5.500970363616943, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.854253351688385, + "num_tokens": 202855705.0, + "step": 5314 + }, + { + "epoch": 0.6761226307085613, + "ewc_loss": 0.04584083706140518, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018252944573760033, + "grad_norm": 5.597156524658203, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8422369956970215, + "num_tokens": 202894180.0, + "step": 5315 + }, + { + "epoch": 0.6762498409871518, + "ewc_loss": 0.045857299119234085, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018269408610649407, + "grad_norm": 5.560330867767334, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8661671876907349, + "num_tokens": 202925842.0, + "step": 5316 + }, + { + "epoch": 0.6763770512657423, + "ewc_loss": 0.04578914865851402, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018201257626060396, + "grad_norm": 5.4974470138549805, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8651309013366699, + "num_tokens": 202959103.0, + "step": 5317 + }, + { + "epoch": 0.6765042615443327, + "ewc_loss": 0.04577694833278656, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018189058755524457, + "grad_norm": 5.516563892364502, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8586964011192322, + "num_tokens": 203000393.0, + "step": 5318 + }, + { + "epoch": 0.6766314718229233, + "ewc_loss": 0.045740239322185516, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018152350094169378, + "grad_norm": 5.480660438537598, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8601881265640259, + "num_tokens": 203041613.0, + "step": 5319 + }, + { + "epoch": 0.6767586821015138, + "ewc_loss": 0.04579758644104004, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018209694826509804, + "grad_norm": 5.5711588859558105, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8735511302947998, + "num_tokens": 203073744.0, + "step": 5320 + }, + { + "epoch": 0.6768858923801043, + "ewc_loss": 0.04578721523284912, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001819932513171807, + "grad_norm": 5.536879062652588, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8438668251037598, + "num_tokens": 203108350.0, + "step": 5321 + }, + { + "epoch": 0.6770131026586949, + "ewc_loss": 0.045719802379608154, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018131911929231137, + "grad_norm": 5.499627590179443, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8622078895568848, + "num_tokens": 203139149.0, + "step": 5322 + }, + { + "epoch": 0.6771403129372854, + "ewc_loss": 0.04575154185295105, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018163651111535728, + "grad_norm": 5.531442642211914, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8419279456138611, + "num_tokens": 203173953.0, + "step": 5323 + }, + { + "epoch": 0.6772675232158758, + "ewc_loss": 0.045764923095703125, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001817703159758821, + "grad_norm": 5.486773490905762, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8689166307449341, + "num_tokens": 203208915.0, + "step": 5324 + }, + { + "epoch": 0.6773947334944663, + "ewc_loss": 0.045810483396053314, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018222592188976705, + "grad_norm": 5.426462173461914, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8473830223083496, + "num_tokens": 203251906.0, + "step": 5325 + }, + { + "epoch": 0.6775219437730569, + "ewc_loss": 0.04582935944199562, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018241468933410943, + "grad_norm": 5.530764102935791, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8589772582054138, + "num_tokens": 203289460.0, + "step": 5326 + }, + { + "epoch": 0.6776491540516474, + "ewc_loss": 0.04583004117012024, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018242152873426676, + "grad_norm": 5.483554840087891, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8670673370361328, + "num_tokens": 203325647.0, + "step": 5327 + }, + { + "epoch": 0.6777763643302379, + "ewc_loss": 0.04580417275428772, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018216281023342162, + "grad_norm": 5.471724987030029, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8546406030654907, + "num_tokens": 203364972.0, + "step": 5328 + }, + { + "epoch": 0.6779035746088284, + "ewc_loss": 0.04607196897268295, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001823993952712044, + "grad_norm": 5.46109676361084, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8593583106994629, + "num_tokens": 203407778.0, + "step": 5329 + }, + { + "epoch": 0.6780307848874189, + "ewc_loss": 0.04608047753572464, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018248446576762944, + "grad_norm": 5.437411785125732, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.859817385673523, + "num_tokens": 203443820.0, + "step": 5330 + }, + { + "epoch": 0.6781579951660094, + "ewc_loss": 0.04612702131271362, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018294989422429353, + "grad_norm": 5.561317443847656, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8640339970588684, + "num_tokens": 203475742.0, + "step": 5331 + }, + { + "epoch": 0.6782852054445999, + "ewc_loss": 0.0461312010884285, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018299171642865986, + "grad_norm": 5.5054545402526855, + "learning_rate": 1e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8327267169952393, + "num_tokens": 203512617.0, + "step": 5332 + }, + { + "epoch": 0.6784124157231904, + "ewc_loss": 0.04612639546394348, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018294365145266056, + "grad_norm": 5.505248546600342, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8440631031990051, + "num_tokens": 203546778.0, + "step": 5333 + }, + { + "epoch": 0.678539626001781, + "ewc_loss": 0.04599916189908981, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.0001828920067055151, + "grad_norm": 5.492603302001953, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8605349659919739, + "num_tokens": 203585222.0, + "step": 5334 + }, + { + "epoch": 0.6786668362803715, + "ewc_loss": 0.045977622270584106, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018267659470438957, + "grad_norm": 5.448929786682129, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8529918193817139, + "num_tokens": 203627014.0, + "step": 5335 + }, + { + "epoch": 0.6787940465589619, + "ewc_loss": 0.04604378715157509, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.0001833382702898234, + "grad_norm": 5.559998035430908, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8641606569290161, + "num_tokens": 203659189.0, + "step": 5336 + }, + { + "epoch": 0.6789212568375524, + "ewc_loss": 0.04617934674024582, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018347316654399037, + "grad_norm": 5.513433933258057, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8695060610771179, + "num_tokens": 203695355.0, + "step": 5337 + }, + { + "epoch": 0.679048467116143, + "ewc_loss": 0.04609481990337372, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018262790399603546, + "grad_norm": 5.530646800994873, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8670706748962402, + "num_tokens": 203733446.0, + "step": 5338 + }, + { + "epoch": 0.6791756773947335, + "ewc_loss": 0.046137239784002304, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018305209232494235, + "grad_norm": 5.455873966217041, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8423430919647217, + "num_tokens": 203775387.0, + "step": 5339 + }, + { + "epoch": 0.679302887673324, + "ewc_loss": 0.04612881690263748, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018296786583960056, + "grad_norm": 5.540839195251465, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8536167740821838, + "num_tokens": 203810713.0, + "step": 5340 + }, + { + "epoch": 0.6794300979519146, + "ewc_loss": 0.04618844389915466, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018356410146225244, + "grad_norm": 5.488029956817627, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.870577335357666, + "num_tokens": 203847144.0, + "step": 5341 + }, + { + "epoch": 0.679557308230505, + "ewc_loss": 0.04614321142435074, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018311181338503957, + "grad_norm": 5.478586196899414, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8646680116653442, + "num_tokens": 203889716.0, + "step": 5342 + }, + { + "epoch": 0.6796845185090955, + "ewc_loss": 0.04612744599580765, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018295417248737067, + "grad_norm": 5.4817094802856445, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8447613716125488, + "num_tokens": 203927692.0, + "step": 5343 + }, + { + "epoch": 0.679811728787686, + "ewc_loss": 0.04593238979578018, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018344500858802348, + "grad_norm": 5.551797866821289, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8464722633361816, + "num_tokens": 203960180.0, + "step": 5344 + }, + { + "epoch": 0.6799389390662766, + "ewc_loss": 0.046156421303749084, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018324388656765223, + "grad_norm": 5.483085632324219, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8702981472015381, + "num_tokens": 203997375.0, + "step": 5345 + }, + { + "epoch": 0.6800661493448671, + "ewc_loss": 0.04612363874912262, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001829160755733028, + "grad_norm": 5.526138782501221, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8549325466156006, + "num_tokens": 204037588.0, + "step": 5346 + }, + { + "epoch": 0.6801933596234576, + "ewc_loss": 0.04590626806020737, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001831838017096743, + "grad_norm": 5.4657111167907715, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8519198298454285, + "num_tokens": 204080993.0, + "step": 5347 + }, + { + "epoch": 0.680320569902048, + "ewc_loss": 0.045855894684791565, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018268001440446824, + "grad_norm": 5.540689468383789, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8606024980545044, + "num_tokens": 204116863.0, + "step": 5348 + }, + { + "epoch": 0.6804477801806386, + "ewc_loss": 0.045888643711805344, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001830075343605131, + "grad_norm": 5.505366802215576, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8538854122161865, + "num_tokens": 204156551.0, + "step": 5349 + }, + { + "epoch": 0.6805749904592291, + "ewc_loss": 0.04587402567267418, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018286134582012892, + "grad_norm": 5.486569881439209, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8523622155189514, + "num_tokens": 204196210.0, + "step": 5350 + }, + { + "epoch": 0.6807022007378196, + "ewc_loss": 0.04588795080780983, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001830006076488644, + "grad_norm": 5.539803981781006, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8570150136947632, + "num_tokens": 204234791.0, + "step": 5351 + }, + { + "epoch": 0.6808294110164101, + "ewc_loss": 0.04587458074092865, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018286689009983093, + "grad_norm": 5.519542217254639, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8625501394271851, + "num_tokens": 204271631.0, + "step": 5352 + }, + { + "epoch": 0.6809566212950007, + "ewc_loss": 0.045866407454013824, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018278518109582365, + "grad_norm": 5.443613052368164, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8694162964820862, + "num_tokens": 204311879.0, + "step": 5353 + }, + { + "epoch": 0.6810838315735911, + "ewc_loss": 0.04587036743760109, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001828247623052448, + "grad_norm": 5.527930736541748, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8621380925178528, + "num_tokens": 204357123.0, + "step": 5354 + }, + { + "epoch": 0.6812110418521816, + "ewc_loss": 0.04582003131508827, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001823214115574956, + "grad_norm": 5.509566307067871, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8564269542694092, + "num_tokens": 204397196.0, + "step": 5355 + }, + { + "epoch": 0.6813382521307721, + "ewc_loss": 0.045872654765844345, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.0001828476379159838, + "grad_norm": 5.529631614685059, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8483788371086121, + "num_tokens": 204434654.0, + "step": 5356 + }, + { + "epoch": 0.6814654624093627, + "ewc_loss": 0.04584837704896927, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018260486831422895, + "grad_norm": 5.5189528465271, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8807514309883118, + "num_tokens": 204466581.0, + "step": 5357 + }, + { + "epoch": 0.6815926726879532, + "ewc_loss": 0.04582911729812622, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018241224461235106, + "grad_norm": 5.5408759117126465, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8711252212524414, + "num_tokens": 204503890.0, + "step": 5358 + }, + { + "epoch": 0.6817198829665437, + "ewc_loss": 0.04588090255856514, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018293011817149818, + "grad_norm": 5.613236904144287, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8546198606491089, + "num_tokens": 204537160.0, + "step": 5359 + }, + { + "epoch": 0.6818470932451343, + "ewc_loss": 0.045807383954524994, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018219495541416109, + "grad_norm": 5.550291061401367, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8683251738548279, + "num_tokens": 204570077.0, + "step": 5360 + }, + { + "epoch": 0.6819743035237247, + "ewc_loss": 0.04579629376530647, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018208404071629047, + "grad_norm": 5.451243877410889, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8746890425682068, + "num_tokens": 204603638.0, + "step": 5361 + }, + { + "epoch": 0.6821015138023152, + "ewc_loss": 0.04594924673438072, + "ewc_loss_diag": 2.765655517578125e-05, + "ewc_loss_parallel": 0.00018239286146126688, + "grad_norm": 5.519856929779053, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8559722900390625, + "num_tokens": 204644584.0, + "step": 5362 + }, + { + "epoch": 0.6822287240809057, + "ewc_loss": 0.04586505889892578, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018277167691849172, + "grad_norm": 5.565598487854004, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8447413444519043, + "num_tokens": 204678459.0, + "step": 5363 + }, + { + "epoch": 0.6823559343594963, + "ewc_loss": 0.045832470059394836, + "ewc_loss_diag": 2.753734588623047e-05, + "ewc_loss_parallel": 0.00018244580132886767, + "grad_norm": 5.470010280609131, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8564237356185913, + "num_tokens": 204712923.0, + "step": 5364 + }, + { + "epoch": 0.6824831446380868, + "ewc_loss": 0.04613184183835983, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018299811927136034, + "grad_norm": 5.558048725128174, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8419180512428284, + "num_tokens": 204751303.0, + "step": 5365 + }, + { + "epoch": 0.6826103549166773, + "ewc_loss": 0.046160824596881866, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018328790611121804, + "grad_norm": 5.485162258148193, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8588706851005554, + "num_tokens": 204791685.0, + "step": 5366 + }, + { + "epoch": 0.6827375651952677, + "ewc_loss": 0.046118587255477905, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018286556587554514, + "grad_norm": 5.510327339172363, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8593878746032715, + "num_tokens": 204827748.0, + "step": 5367 + }, + { + "epoch": 0.6828647754738583, + "ewc_loss": 0.0461573451757431, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018325314158573747, + "grad_norm": 5.487801551818848, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8758261203765869, + "num_tokens": 204866568.0, + "step": 5368 + }, + { + "epoch": 0.6829919857524488, + "ewc_loss": 0.04618588462471962, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001835385337471962, + "grad_norm": 5.5456085205078125, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8575825691223145, + "num_tokens": 204908334.0, + "step": 5369 + }, + { + "epoch": 0.6831191960310393, + "ewc_loss": 0.04622352495789528, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018391493358649313, + "grad_norm": 5.520050048828125, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8614538908004761, + "num_tokens": 204949929.0, + "step": 5370 + }, + { + "epoch": 0.6832464063096299, + "ewc_loss": 0.0461413636803627, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018309330334886909, + "grad_norm": 5.522420883178711, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8492413759231567, + "num_tokens": 204991435.0, + "step": 5371 + }, + { + "epoch": 0.6833736165882204, + "ewc_loss": 0.046199798583984375, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018367767916060984, + "grad_norm": 5.54577112197876, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8647198677062988, + "num_tokens": 205028434.0, + "step": 5372 + }, + { + "epoch": 0.6835008268668108, + "ewc_loss": 0.046112820506095886, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018280786753166467, + "grad_norm": 5.604781150817871, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.853001594543457, + "num_tokens": 205064138.0, + "step": 5373 + }, + { + "epoch": 0.6836280371454013, + "ewc_loss": 0.046097494661808014, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018265462131239474, + "grad_norm": 5.476868152618408, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8526160717010498, + "num_tokens": 205104058.0, + "step": 5374 + }, + { + "epoch": 0.6837552474239919, + "ewc_loss": 0.04611043632030487, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018278404604643583, + "grad_norm": 5.5284247398376465, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8582056760787964, + "num_tokens": 205142609.0, + "step": 5375 + }, + { + "epoch": 0.6838824577025824, + "ewc_loss": 0.04611283540725708, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018280802760273218, + "grad_norm": 5.536202907562256, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8682892322540283, + "num_tokens": 205180407.0, + "step": 5376 + }, + { + "epoch": 0.6840096679811729, + "ewc_loss": 0.04612339287996292, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018291363085154444, + "grad_norm": 5.529565334320068, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8802974224090576, + "num_tokens": 205217340.0, + "step": 5377 + }, + { + "epoch": 0.6841368782597634, + "ewc_loss": 0.04609701409935951, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001826498337322846, + "grad_norm": 5.490121364593506, + "learning_rate": 1e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8462021946907043, + "num_tokens": 205263671.0, + "step": 5378 + }, + { + "epoch": 0.6842640885383539, + "ewc_loss": 0.046149566769599915, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018317534704692662, + "grad_norm": 5.538723468780518, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8560483455657959, + "num_tokens": 205302970.0, + "step": 5379 + }, + { + "epoch": 0.6843912988169444, + "ewc_loss": 0.04613907262682915, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018307041318621486, + "grad_norm": 5.562762260437012, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8623230457305908, + "num_tokens": 205342247.0, + "step": 5380 + }, + { + "epoch": 0.6845185090955349, + "ewc_loss": 0.04611058533191681, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001827855157898739, + "grad_norm": 5.569056987762451, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8441959619522095, + "num_tokens": 205374897.0, + "step": 5381 + }, + { + "epoch": 0.6846457193741254, + "ewc_loss": 0.04611663892865181, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018284608086105436, + "grad_norm": 5.53606653213501, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8559213876724243, + "num_tokens": 205411645.0, + "step": 5382 + }, + { + "epoch": 0.684772929652716, + "ewc_loss": 0.04608786851167679, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001825583603931591, + "grad_norm": 5.504973411560059, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8589330911636353, + "num_tokens": 205450972.0, + "step": 5383 + }, + { + "epoch": 0.6849001399313065, + "ewc_loss": 0.04620969295501709, + "ewc_loss_diag": 2.7894973754882812e-05, + "ewc_loss_parallel": 0.0001825559011194855, + "grad_norm": 5.459195613861084, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8605083227157593, + "num_tokens": 205494314.0, + "step": 5384 + }, + { + "epoch": 0.6850273502098969, + "ewc_loss": 0.04615401476621628, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018321984680369496, + "grad_norm": 5.52426290512085, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.846285343170166, + "num_tokens": 205534696.0, + "step": 5385 + }, + { + "epoch": 0.6851545604884874, + "ewc_loss": 0.04615321755409241, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018321187235414982, + "grad_norm": 5.5189995765686035, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8707683682441711, + "num_tokens": 205575579.0, + "step": 5386 + }, + { + "epoch": 0.685281770767078, + "ewc_loss": 0.04615980386734009, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001832777343224734, + "grad_norm": 5.515679359436035, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8626603484153748, + "num_tokens": 205609249.0, + "step": 5387 + }, + { + "epoch": 0.6854089810456685, + "ewc_loss": 0.046145208179950714, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018313176406081766, + "grad_norm": 5.5622100830078125, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8557648658752441, + "num_tokens": 205644135.0, + "step": 5388 + }, + { + "epoch": 0.685536191324259, + "ewc_loss": 0.046131446957588196, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018299417570233345, + "grad_norm": 5.540765762329102, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8522807359695435, + "num_tokens": 205685312.0, + "step": 5389 + }, + { + "epoch": 0.6856634016028496, + "ewc_loss": 0.046140991151332855, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018308960716240108, + "grad_norm": 5.5172600746154785, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8520878553390503, + "num_tokens": 205720525.0, + "step": 5390 + }, + { + "epoch": 0.68579061188144, + "ewc_loss": 0.04614568129181862, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001831365079851821, + "grad_norm": 5.5168681144714355, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8392666578292847, + "num_tokens": 205763358.0, + "step": 5391 + }, + { + "epoch": 0.6859178221600305, + "ewc_loss": 0.046144068241119385, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018312038446310908, + "grad_norm": 5.48717737197876, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8610877990722656, + "num_tokens": 205801413.0, + "step": 5392 + }, + { + "epoch": 0.686045032438621, + "ewc_loss": 0.04616337642073631, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018331345927435905, + "grad_norm": 5.524117469787598, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8600713014602661, + "num_tokens": 205840442.0, + "step": 5393 + }, + { + "epoch": 0.6861722427172116, + "ewc_loss": 0.04618486762046814, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018352839106228203, + "grad_norm": 5.51624870300293, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8710877895355225, + "num_tokens": 205877319.0, + "step": 5394 + }, + { + "epoch": 0.6862994529958021, + "ewc_loss": 0.04615246504545212, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001832043199101463, + "grad_norm": 5.535242080688477, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8563677072525024, + "num_tokens": 205916186.0, + "step": 5395 + }, + { + "epoch": 0.6864266632743926, + "ewc_loss": 0.04621797055006027, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018385938892606646, + "grad_norm": 5.514577865600586, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8478258848190308, + "num_tokens": 205956336.0, + "step": 5396 + }, + { + "epoch": 0.686553873552983, + "ewc_loss": 0.046163395047187805, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001833136338973418, + "grad_norm": 5.569627285003662, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8572831153869629, + "num_tokens": 205989570.0, + "step": 5397 + }, + { + "epoch": 0.6866810838315736, + "ewc_loss": 0.046118348836898804, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018286319391336292, + "grad_norm": 5.421355724334717, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8508734107017517, + "num_tokens": 206026139.0, + "step": 5398 + }, + { + "epoch": 0.6868082941101641, + "ewc_loss": 0.04623320326209068, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.000184011718374677, + "grad_norm": 5.533620834350586, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8531904816627502, + "num_tokens": 206070059.0, + "step": 5399 + }, + { + "epoch": 0.6869355043887546, + "ewc_loss": 0.04636184871196747, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018407746392767876, + "grad_norm": 5.570917129516602, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8621759414672852, + "num_tokens": 206104645.0, + "step": 5400 + }, + { + "epoch": 0.6870627146673451, + "ewc_loss": 0.046265751123428345, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018433718651067466, + "grad_norm": 5.479119300842285, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8559631109237671, + "num_tokens": 206141220.0, + "step": 5401 + }, + { + "epoch": 0.6871899249459357, + "ewc_loss": 0.04621826857328415, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001838623866206035, + "grad_norm": 5.531103134155273, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8559328317642212, + "num_tokens": 206177917.0, + "step": 5402 + }, + { + "epoch": 0.6873171352245261, + "ewc_loss": 0.04626913368701935, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018437100516166538, + "grad_norm": 5.522294044494629, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8567934632301331, + "num_tokens": 206213419.0, + "step": 5403 + }, + { + "epoch": 0.6874443455031166, + "ewc_loss": 0.04632246494293213, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001849043183028698, + "grad_norm": 5.503957748413086, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8428642749786377, + "num_tokens": 206252421.0, + "step": 5404 + }, + { + "epoch": 0.6875715557817071, + "ewc_loss": 0.04628647118806839, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018454439123161137, + "grad_norm": 5.497550964355469, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8630945086479187, + "num_tokens": 206287195.0, + "step": 5405 + }, + { + "epoch": 0.6876987660602977, + "ewc_loss": 0.046305350959300995, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001847331877797842, + "grad_norm": 5.512982368469238, + "learning_rate": 1e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8389633893966675, + "num_tokens": 206328339.0, + "step": 5406 + }, + { + "epoch": 0.6878259763388882, + "ewc_loss": 0.046323783695697784, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018491754599381238, + "grad_norm": 5.4851837158203125, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.843716025352478, + "num_tokens": 206369523.0, + "step": 5407 + }, + { + "epoch": 0.6879531866174787, + "ewc_loss": 0.04635462164878845, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001852259156294167, + "grad_norm": 5.522274017333984, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8775501251220703, + "num_tokens": 206402864.0, + "step": 5408 + }, + { + "epoch": 0.6880803968960693, + "ewc_loss": 0.04630112648010254, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018469092901796103, + "grad_norm": 5.513166904449463, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8469741344451904, + "num_tokens": 206439004.0, + "step": 5409 + }, + { + "epoch": 0.6882076071746597, + "ewc_loss": 0.04633617773652077, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018504145555198193, + "grad_norm": 5.602386951446533, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8602950572967529, + "num_tokens": 206472320.0, + "step": 5410 + }, + { + "epoch": 0.6883348174532502, + "ewc_loss": 0.04633191600441933, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018499884754419327, + "grad_norm": 5.438041687011719, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8500639200210571, + "num_tokens": 206518649.0, + "step": 5411 + }, + { + "epoch": 0.6884620277318407, + "ewc_loss": 0.04632556438446045, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018493532843422145, + "grad_norm": 5.554621696472168, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8734109401702881, + "num_tokens": 206551318.0, + "step": 5412 + }, + { + "epoch": 0.6885892380104313, + "ewc_loss": 0.04635871946811676, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018526689382269979, + "grad_norm": 5.485376358032227, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8461492657661438, + "num_tokens": 206588173.0, + "step": 5413 + }, + { + "epoch": 0.6887164482890218, + "ewc_loss": 0.046306077390909195, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018474046373739839, + "grad_norm": 5.581518173217773, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8571799993515015, + "num_tokens": 206624013.0, + "step": 5414 + }, + { + "epoch": 0.6888436585676123, + "ewc_loss": 0.04634448140859604, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018512451788410544, + "grad_norm": 5.449594497680664, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8645808100700378, + "num_tokens": 206668180.0, + "step": 5415 + }, + { + "epoch": 0.6889708688462027, + "ewc_loss": 0.04628780856728554, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018455777899362147, + "grad_norm": 5.514168739318848, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8532590866088867, + "num_tokens": 206710619.0, + "step": 5416 + }, + { + "epoch": 0.6890980791247933, + "ewc_loss": 0.046323128044605255, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018491095397621393, + "grad_norm": 5.481950759887695, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8633297681808472, + "num_tokens": 206750121.0, + "step": 5417 + }, + { + "epoch": 0.6892252894033838, + "ewc_loss": 0.046309128403663635, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018477094999980181, + "grad_norm": 5.645726203918457, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8689174056053162, + "num_tokens": 206789538.0, + "step": 5418 + }, + { + "epoch": 0.6893524996819743, + "ewc_loss": 0.046295057982206345, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001846302766352892, + "grad_norm": 5.605731964111328, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8589776158332825, + "num_tokens": 206827513.0, + "step": 5419 + }, + { + "epoch": 0.6894797099605648, + "ewc_loss": 0.046230290085077286, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001839825854403898, + "grad_norm": 5.498153209686279, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8642086386680603, + "num_tokens": 206866563.0, + "step": 5420 + }, + { + "epoch": 0.6896069202391554, + "ewc_loss": 0.04627700522542, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018444973102305084, + "grad_norm": 5.508561134338379, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8692551255226135, + "num_tokens": 206906281.0, + "step": 5421 + }, + { + "epoch": 0.6897341305177458, + "ewc_loss": 0.046389076858758926, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018434974481351674, + "grad_norm": 5.527976989746094, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8395552635192871, + "num_tokens": 206944055.0, + "step": 5422 + }, + { + "epoch": 0.6898613407963363, + "ewc_loss": 0.04663214832544327, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018433906370773911, + "grad_norm": 5.545706272125244, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8590812683105469, + "num_tokens": 206984421.0, + "step": 5423 + }, + { + "epoch": 0.6899885510749268, + "ewc_loss": 0.046228066086769104, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018396035011392087, + "grad_norm": 5.51648473739624, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8474446535110474, + "num_tokens": 207020319.0, + "step": 5424 + }, + { + "epoch": 0.6901157613535174, + "ewc_loss": 0.04625430330634117, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018422272114548832, + "grad_norm": 5.515192985534668, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8614052534103394, + "num_tokens": 207061487.0, + "step": 5425 + }, + { + "epoch": 0.6902429716321079, + "ewc_loss": 0.04623798280954361, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018405949231237173, + "grad_norm": 5.538466453552246, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8599560260772705, + "num_tokens": 207098343.0, + "step": 5426 + }, + { + "epoch": 0.6903701819106984, + "ewc_loss": 0.046294935047626495, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018462905427441, + "grad_norm": 5.4983601570129395, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8549190759658813, + "num_tokens": 207140381.0, + "step": 5427 + }, + { + "epoch": 0.6904973921892888, + "ewc_loss": 0.04624020308256149, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001840817421907559, + "grad_norm": 5.512636184692383, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8715788722038269, + "num_tokens": 207173923.0, + "step": 5428 + }, + { + "epoch": 0.6906246024678794, + "ewc_loss": 0.046263035386800766, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018431004718877375, + "grad_norm": 5.520100116729736, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8650145530700684, + "num_tokens": 207214840.0, + "step": 5429 + }, + { + "epoch": 0.6907518127464699, + "ewc_loss": 0.04621531814336777, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018383286078460515, + "grad_norm": 5.518004417419434, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8572426438331604, + "num_tokens": 207247850.0, + "step": 5430 + }, + { + "epoch": 0.6908790230250604, + "ewc_loss": 0.04627867788076401, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018446646572556347, + "grad_norm": 5.491296768188477, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.858759880065918, + "num_tokens": 207286698.0, + "step": 5431 + }, + { + "epoch": 0.691006233303651, + "ewc_loss": 0.04639082029461861, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018436717800796032, + "grad_norm": 5.489589214324951, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8533538579940796, + "num_tokens": 207322470.0, + "step": 5432 + }, + { + "epoch": 0.6911334435822415, + "ewc_loss": 0.0462760329246521, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.00018444003944750875, + "grad_norm": 5.521073818206787, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8444845676422119, + "num_tokens": 207361517.0, + "step": 5433 + }, + { + "epoch": 0.6912606538608319, + "ewc_loss": 0.04632490500807762, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.000184928736416623, + "grad_norm": 5.587980270385742, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8682931661605835, + "num_tokens": 207394504.0, + "step": 5434 + }, + { + "epoch": 0.6913878641394224, + "ewc_loss": 0.04644910246133804, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.0001849499822128564, + "grad_norm": 5.576694011688232, + "learning_rate": 1e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8407766222953796, + "num_tokens": 207430148.0, + "step": 5435 + }, + { + "epoch": 0.691515074418013, + "ewc_loss": 0.0463174432516098, + "ewc_loss_diag": 2.777576446533203e-05, + "ewc_loss_parallel": 0.0001848540996434167, + "grad_norm": 5.546387195587158, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8564011454582214, + "num_tokens": 207461094.0, + "step": 5436 + }, + { + "epoch": 0.6916422846966035, + "ewc_loss": 0.04641370475292206, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018459603597875684, + "grad_norm": 5.534495830535889, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8648428916931152, + "num_tokens": 207500572.0, + "step": 5437 + }, + { + "epoch": 0.691769494975194, + "ewc_loss": 0.04642677307128906, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018472671217750758, + "grad_norm": 5.5450520515441895, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8517885208129883, + "num_tokens": 207534977.0, + "step": 5438 + }, + { + "epoch": 0.6918967052537845, + "ewc_loss": 0.04644281044602394, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.0001848870888352394, + "grad_norm": 5.529984474182129, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8637670874595642, + "num_tokens": 207569809.0, + "step": 5439 + }, + { + "epoch": 0.692023915532375, + "ewc_loss": 0.04647214710712433, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018518045544624329, + "grad_norm": 5.546899318695068, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8604615330696106, + "num_tokens": 207606785.0, + "step": 5440 + }, + { + "epoch": 0.6921511258109655, + "ewc_loss": 0.04645398259162903, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018499878933653235, + "grad_norm": 5.461845874786377, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8577901124954224, + "num_tokens": 207648344.0, + "step": 5441 + }, + { + "epoch": 0.692278336089556, + "ewc_loss": 0.046405114233493805, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018451012147124857, + "grad_norm": 5.545485973358154, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.866595983505249, + "num_tokens": 207680962.0, + "step": 5442 + }, + { + "epoch": 0.6924055463681466, + "ewc_loss": 0.04648333415389061, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018529233057051897, + "grad_norm": 5.547313690185547, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8902297616004944, + "num_tokens": 207709162.0, + "step": 5443 + }, + { + "epoch": 0.6925327566467371, + "ewc_loss": 0.0463654026389122, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018411299970466644, + "grad_norm": 5.488370418548584, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8549705743789673, + "num_tokens": 207746089.0, + "step": 5444 + }, + { + "epoch": 0.6926599669253276, + "ewc_loss": 0.04645693674683571, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.0001850283588282764, + "grad_norm": 5.532160758972168, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8700496554374695, + "num_tokens": 207791235.0, + "step": 5445 + }, + { + "epoch": 0.692787177203918, + "ewc_loss": 0.04650714248418808, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.0001843097124947235, + "grad_norm": 5.55684757232666, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8513028621673584, + "num_tokens": 207824527.0, + "step": 5446 + }, + { + "epoch": 0.6929143874825086, + "ewc_loss": 0.046559929847717285, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018483756866771728, + "grad_norm": 5.506918907165527, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8690474629402161, + "num_tokens": 207863535.0, + "step": 5447 + }, + { + "epoch": 0.6930415977610991, + "ewc_loss": 0.04654301702976227, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018466847541276366, + "grad_norm": 5.477080821990967, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8528370261192322, + "num_tokens": 207899694.0, + "step": 5448 + }, + { + "epoch": 0.6931688080396896, + "ewc_loss": 0.04659486562013626, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018518694560043514, + "grad_norm": 5.572502136230469, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8507038354873657, + "num_tokens": 207931440.0, + "step": 5449 + }, + { + "epoch": 0.6932960183182801, + "ewc_loss": 0.04650777950882912, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.0001855367881944403, + "grad_norm": 5.562629222869873, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8569306135177612, + "num_tokens": 207963758.0, + "step": 5450 + }, + { + "epoch": 0.6934232285968707, + "ewc_loss": 0.04669984057545662, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018501597514841706, + "grad_norm": 5.518222332000732, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8634002208709717, + "num_tokens": 208002383.0, + "step": 5451 + }, + { + "epoch": 0.6935504388754611, + "ewc_loss": 0.046785663813352585, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001846535160439089, + "grad_norm": 5.504331111907959, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8626198768615723, + "num_tokens": 208038723.0, + "step": 5452 + }, + { + "epoch": 0.6936776491540516, + "ewc_loss": 0.04665764421224594, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018459399871062487, + "grad_norm": 5.521334171295166, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8636492490768433, + "num_tokens": 208071851.0, + "step": 5453 + }, + { + "epoch": 0.6938048594326421, + "ewc_loss": 0.046651750802993774, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018453509255778044, + "grad_norm": 5.535617351531982, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8408316373825073, + "num_tokens": 208106916.0, + "step": 5454 + }, + { + "epoch": 0.6939320697112327, + "ewc_loss": 0.046571455895900726, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018495283438824117, + "grad_norm": 5.523775100708008, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8680633902549744, + "num_tokens": 208146883.0, + "step": 5455 + }, + { + "epoch": 0.6940592799898232, + "ewc_loss": 0.046561338007450104, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018485166947357357, + "grad_norm": 5.496166706085205, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8602088093757629, + "num_tokens": 208187364.0, + "step": 5456 + }, + { + "epoch": 0.6941864902684137, + "ewc_loss": 0.04655558988451958, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.0001847941748565063, + "grad_norm": 5.5036468505859375, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8660762310028076, + "num_tokens": 208226087.0, + "step": 5457 + }, + { + "epoch": 0.6943137005470043, + "ewc_loss": 0.04667717590928078, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018478932906873524, + "grad_norm": 5.575103282928467, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8563174605369568, + "num_tokens": 208265193.0, + "step": 5458 + }, + { + "epoch": 0.6944409108255947, + "ewc_loss": 0.0467897392809391, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018469427595846355, + "grad_norm": 5.4759368896484375, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.872355580329895, + "num_tokens": 208305163.0, + "step": 5459 + }, + { + "epoch": 0.6945681211041852, + "ewc_loss": 0.04669173061847687, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018493490642867982, + "grad_norm": 5.567694664001465, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8486032485961914, + "num_tokens": 208342229.0, + "step": 5460 + }, + { + "epoch": 0.6946953313827757, + "ewc_loss": 0.04662936180830002, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018431118223816156, + "grad_norm": 5.470991134643555, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8487008213996887, + "num_tokens": 208384606.0, + "step": 5461 + }, + { + "epoch": 0.6948225416613663, + "ewc_loss": 0.04668048769235611, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018482247833162546, + "grad_norm": 5.552656650543213, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8774769306182861, + "num_tokens": 208416777.0, + "step": 5462 + }, + { + "epoch": 0.6949497519399568, + "ewc_loss": 0.04670267552137375, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018504433683119714, + "grad_norm": 5.524109363555908, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8437816500663757, + "num_tokens": 208456101.0, + "step": 5463 + }, + { + "epoch": 0.6950769622185473, + "ewc_loss": 0.046565908938646317, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018489737703930587, + "grad_norm": 5.58104133605957, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8685586452484131, + "num_tokens": 208489201.0, + "step": 5464 + }, + { + "epoch": 0.6952041724971377, + "ewc_loss": 0.04657690227031708, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018500730220694095, + "grad_norm": 5.5006608963012695, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8740081787109375, + "num_tokens": 208525006.0, + "step": 5465 + }, + { + "epoch": 0.6953313827757283, + "ewc_loss": 0.04659124091267586, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018515068222768605, + "grad_norm": 5.5187458992004395, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8558792471885681, + "num_tokens": 208568997.0, + "step": 5466 + }, + { + "epoch": 0.6954585930543188, + "ewc_loss": 0.046691298484802246, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.000184930584509857, + "grad_norm": 5.556685924530029, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8499927520751953, + "num_tokens": 208605925.0, + "step": 5467 + }, + { + "epoch": 0.6955858033329093, + "ewc_loss": 0.04651034250855446, + "ewc_loss_diag": 2.8014183044433594e-05, + "ewc_loss_parallel": 0.00018556241411715746, + "grad_norm": 6.054507732391357, + "learning_rate": 1e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8340389728546143, + "num_tokens": 208646290.0, + "step": 5468 + }, + { + "epoch": 0.6957130136114998, + "ewc_loss": 0.04698077589273453, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018660463683772832, + "grad_norm": 5.482957363128662, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8597676753997803, + "num_tokens": 208686424.0, + "step": 5469 + }, + { + "epoch": 0.6958402238900904, + "ewc_loss": 0.046740494668483734, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001842018245952204, + "grad_norm": 5.571157932281494, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8608601093292236, + "num_tokens": 208719643.0, + "step": 5470 + }, + { + "epoch": 0.6959674341686808, + "ewc_loss": 0.0467531755566597, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018554933194536716, + "grad_norm": 5.493180751800537, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8436945676803589, + "num_tokens": 208760804.0, + "step": 5471 + }, + { + "epoch": 0.6960946444472713, + "ewc_loss": 0.0466989167034626, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018500674923416227, + "grad_norm": 5.546234607696533, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.85410475730896, + "num_tokens": 208803022.0, + "step": 5472 + }, + { + "epoch": 0.6962218547258618, + "ewc_loss": 0.04681455343961716, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018616308807395399, + "grad_norm": 5.51829195022583, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.860315203666687, + "num_tokens": 208844205.0, + "step": 5473 + }, + { + "epoch": 0.6963490650044524, + "ewc_loss": 0.046716392040252686, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001851814886322245, + "grad_norm": 5.479921817779541, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8698448538780212, + "num_tokens": 208886882.0, + "step": 5474 + }, + { + "epoch": 0.6964762752830429, + "ewc_loss": 0.04679729416966438, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018599051691126078, + "grad_norm": 5.57123327255249, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8562710285186768, + "num_tokens": 208926649.0, + "step": 5475 + }, + { + "epoch": 0.6966034855616334, + "ewc_loss": 0.0467512309551239, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018552987603470683, + "grad_norm": 5.477367401123047, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8521175384521484, + "num_tokens": 208963403.0, + "step": 5476 + }, + { + "epoch": 0.6967306958402238, + "ewc_loss": 0.04667883366346359, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.0001860266347648576, + "grad_norm": 5.60517692565918, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8515912890434265, + "num_tokens": 209002475.0, + "step": 5477 + }, + { + "epoch": 0.6968579061188144, + "ewc_loss": 0.04668952897191048, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018613357678987086, + "grad_norm": 5.483917236328125, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8583523631095886, + "num_tokens": 209044216.0, + "step": 5478 + }, + { + "epoch": 0.6969851163974049, + "ewc_loss": 0.046668797731399536, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018592628475744277, + "grad_norm": 5.5916900634765625, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8484708666801453, + "num_tokens": 209080774.0, + "step": 5479 + }, + { + "epoch": 0.6971123266759954, + "ewc_loss": 0.04665163904428482, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018575467402115464, + "grad_norm": 5.504116058349609, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8648895025253296, + "num_tokens": 209117167.0, + "step": 5480 + }, + { + "epoch": 0.697239536954586, + "ewc_loss": 0.04668566584587097, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018609494145493954, + "grad_norm": 5.625617980957031, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8536878824234009, + "num_tokens": 209148872.0, + "step": 5481 + }, + { + "epoch": 0.6973667472331765, + "ewc_loss": 0.04674093797802925, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.0001866476668510586, + "grad_norm": 5.547508716583252, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8599178791046143, + "num_tokens": 209187912.0, + "step": 5482 + }, + { + "epoch": 0.6974939575117669, + "ewc_loss": 0.046706706285476685, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018630536214914173, + "grad_norm": 5.568139553070068, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8658641576766968, + "num_tokens": 209227055.0, + "step": 5483 + }, + { + "epoch": 0.6976211677903574, + "ewc_loss": 0.046717267483472824, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.000186410965397954, + "grad_norm": 5.601842880249023, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8722952604293823, + "num_tokens": 209255343.0, + "step": 5484 + }, + { + "epoch": 0.697748378068948, + "ewc_loss": 0.04683312028646469, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018634877051226795, + "grad_norm": 5.545608997344971, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8522621393203735, + "num_tokens": 209294621.0, + "step": 5485 + }, + { + "epoch": 0.6978755883475385, + "ewc_loss": 0.04684976860880852, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018651527352631092, + "grad_norm": 5.494441986083984, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8746486306190491, + "num_tokens": 209336519.0, + "step": 5486 + }, + { + "epoch": 0.698002798626129, + "ewc_loss": 0.04682226851582527, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018624025688041002, + "grad_norm": 5.587173938751221, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8562946319580078, + "num_tokens": 209372913.0, + "step": 5487 + }, + { + "epoch": 0.6981300089047195, + "ewc_loss": 0.047027602791786194, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018707290291786194, + "grad_norm": 5.564884185791016, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8507921695709229, + "num_tokens": 209417768.0, + "step": 5488 + }, + { + "epoch": 0.69825721918331, + "ewc_loss": 0.04683799296617508, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001863975339801982, + "grad_norm": 5.599389553070068, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8611650466918945, + "num_tokens": 209451221.0, + "step": 5489 + }, + { + "epoch": 0.6983844294619005, + "ewc_loss": 0.04708439111709595, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018642008944880217, + "grad_norm": 5.562441349029541, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8585900664329529, + "num_tokens": 209488710.0, + "step": 5490 + }, + { + "epoch": 0.698511639740491, + "ewc_loss": 0.047058768570423126, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018616387387737632, + "grad_norm": 5.558078765869141, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8552038669586182, + "num_tokens": 209525166.0, + "step": 5491 + }, + { + "epoch": 0.6986388500190815, + "ewc_loss": 0.04695216193795204, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018631848797667772, + "grad_norm": 5.523712635040283, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8614239692687988, + "num_tokens": 209564804.0, + "step": 5492 + }, + { + "epoch": 0.6987660602976721, + "ewc_loss": 0.04697873815894127, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001865842641564086, + "grad_norm": 5.594979763031006, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8621867895126343, + "num_tokens": 209607155.0, + "step": 5493 + }, + { + "epoch": 0.6988932705762626, + "ewc_loss": 0.04692596569657326, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018605652439873666, + "grad_norm": 5.545958995819092, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8489173650741577, + "num_tokens": 209641226.0, + "step": 5494 + }, + { + "epoch": 0.699020480854853, + "ewc_loss": 0.0469510555267334, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018630742852110416, + "grad_norm": 5.545006275177002, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8553227186203003, + "num_tokens": 209679180.0, + "step": 5495 + }, + { + "epoch": 0.6991476911334435, + "ewc_loss": 0.04693756625056267, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001861725322669372, + "grad_norm": 5.565740585327148, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8507646322250366, + "num_tokens": 209716412.0, + "step": 5496 + }, + { + "epoch": 0.6992749014120341, + "ewc_loss": 0.04694699868559837, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018626685778144747, + "grad_norm": 5.54138708114624, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8501092195510864, + "num_tokens": 209755697.0, + "step": 5497 + }, + { + "epoch": 0.6994021116906246, + "ewc_loss": 0.04678953438997269, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018591292609926313, + "grad_norm": 5.561650276184082, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8656130433082581, + "num_tokens": 209793654.0, + "step": 5498 + }, + { + "epoch": 0.6995293219692151, + "ewc_loss": 0.04682637378573418, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018628130783326924, + "grad_norm": 5.577047824859619, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8569003939628601, + "num_tokens": 209830703.0, + "step": 5499 + }, + { + "epoch": 0.6996565322478057, + "ewc_loss": 0.046672169119119644, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018595997244119644, + "grad_norm": 5.592262268066406, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8591394424438477, + "num_tokens": 209868994.0, + "step": 5500 + }, + { + "epoch": 0.6997837425263961, + "ewc_loss": 0.046794742345809937, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.000185964978300035, + "grad_norm": 5.602546215057373, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.868248701095581, + "num_tokens": 209909605.0, + "step": 5501 + }, + { + "epoch": 0.6999109528049866, + "ewc_loss": 0.046752139925956726, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001855389855336398, + "grad_norm": 5.555679798126221, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8644542694091797, + "num_tokens": 209945196.0, + "step": 5502 + }, + { + "epoch": 0.7000381630835771, + "ewc_loss": 0.04676470160484314, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018566459766589105, + "grad_norm": 5.5140461921691895, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8577514886856079, + "num_tokens": 209988585.0, + "step": 5503 + }, + { + "epoch": 0.7001653733621677, + "ewc_loss": 0.046662263572216034, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018586090300232172, + "grad_norm": 5.496143817901611, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8592671155929565, + "num_tokens": 210032084.0, + "step": 5504 + }, + { + "epoch": 0.7002925836407582, + "ewc_loss": 0.046816714107990265, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001861847413238138, + "grad_norm": 5.555124759674072, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8488565683364868, + "num_tokens": 210070639.0, + "step": 5505 + }, + { + "epoch": 0.7004197939193487, + "ewc_loss": 0.04678718373179436, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018588942475616932, + "grad_norm": 5.49484395980835, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8671570420265198, + "num_tokens": 210113793.0, + "step": 5506 + }, + { + "epoch": 0.7005470041979391, + "ewc_loss": 0.04682111740112305, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018622877541929483, + "grad_norm": 5.560950756072998, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8503974676132202, + "num_tokens": 210153517.0, + "step": 5507 + }, + { + "epoch": 0.7006742144765297, + "ewc_loss": 0.046823445707559586, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018625202937982976, + "grad_norm": 5.60690975189209, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8537813425064087, + "num_tokens": 210186818.0, + "step": 5508 + }, + { + "epoch": 0.7008014247551202, + "ewc_loss": 0.046916455030441284, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001859614421846345, + "grad_norm": 5.545857906341553, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8507980108261108, + "num_tokens": 210224283.0, + "step": 5509 + }, + { + "epoch": 0.7009286350337107, + "ewc_loss": 0.04692235216498375, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018602039199322462, + "grad_norm": 5.565913677215576, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8539144992828369, + "num_tokens": 210263439.0, + "step": 5510 + }, + { + "epoch": 0.7010558453123013, + "ewc_loss": 0.046943582594394684, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018623267533257604, + "grad_norm": 5.533301830291748, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8607856631278992, + "num_tokens": 210305887.0, + "step": 5511 + }, + { + "epoch": 0.7011830555908918, + "ewc_loss": 0.046950504183769226, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018630189879331738, + "grad_norm": 5.5987935066223145, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8582995533943176, + "num_tokens": 210340118.0, + "step": 5512 + }, + { + "epoch": 0.7013102658694823, + "ewc_loss": 0.04693184047937393, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001861152850324288, + "grad_norm": 5.539632797241211, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8597273230552673, + "num_tokens": 210376098.0, + "step": 5513 + }, + { + "epoch": 0.7014374761480727, + "ewc_loss": 0.04699781537055969, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018677505431696773, + "grad_norm": 5.58200216293335, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8622339367866516, + "num_tokens": 210418801.0, + "step": 5514 + }, + { + "epoch": 0.7015646864266633, + "ewc_loss": 0.04694311320781708, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001862280332716182, + "grad_norm": 5.602751731872559, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8478256464004517, + "num_tokens": 210452392.0, + "step": 5515 + }, + { + "epoch": 0.7016918967052538, + "ewc_loss": 0.04695512726902962, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018634814477991313, + "grad_norm": 5.670412063598633, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8660876750946045, + "num_tokens": 210489697.0, + "step": 5516 + }, + { + "epoch": 0.7018191069838443, + "ewc_loss": 0.04680798947811127, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001860974880401045, + "grad_norm": 5.553588390350342, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8430130481719971, + "num_tokens": 210530212.0, + "step": 5517 + }, + { + "epoch": 0.7019463172624348, + "ewc_loss": 0.04690716043114662, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001858684845501557, + "grad_norm": 5.611785888671875, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8651970624923706, + "num_tokens": 210564974.0, + "step": 5518 + }, + { + "epoch": 0.7020735275410254, + "ewc_loss": 0.046946026384830475, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001862571225501597, + "grad_norm": 5.587499141693115, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8587853312492371, + "num_tokens": 210601272.0, + "step": 5519 + }, + { + "epoch": 0.7022007378196158, + "ewc_loss": 0.04704560339450836, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018603219359647483, + "grad_norm": 5.553318023681641, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8613055944442749, + "num_tokens": 210636908.0, + "step": 5520 + }, + { + "epoch": 0.7023279480982063, + "ewc_loss": 0.046866148710250854, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001854583533713594, + "grad_norm": 5.5582098960876465, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8556299209594727, + "num_tokens": 210672629.0, + "step": 5521 + }, + { + "epoch": 0.7024551583767968, + "ewc_loss": 0.046821966767311096, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018623725918587297, + "grad_norm": 5.63420295715332, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8733496069908142, + "num_tokens": 210703531.0, + "step": 5522 + }, + { + "epoch": 0.7025823686553874, + "ewc_loss": 0.046887874603271484, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018567562801763415, + "grad_norm": 5.5346598625183105, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8713669180870056, + "num_tokens": 210742946.0, + "step": 5523 + }, + { + "epoch": 0.7027095789339779, + "ewc_loss": 0.04689469560980797, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001857438328443095, + "grad_norm": 5.5501298904418945, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8679393529891968, + "num_tokens": 210778345.0, + "step": 5524 + }, + { + "epoch": 0.7028367892125684, + "ewc_loss": 0.047028545290231705, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001858616160461679, + "grad_norm": 5.607310771942139, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8650352358818054, + "num_tokens": 210821544.0, + "step": 5525 + }, + { + "epoch": 0.7029639994911588, + "ewc_loss": 0.04679587483406067, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018597629969008267, + "grad_norm": 5.592801094055176, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8638672828674316, + "num_tokens": 210858060.0, + "step": 5526 + }, + { + "epoch": 0.7030912097697494, + "ewc_loss": 0.046858493238687515, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001853818102972582, + "grad_norm": 5.5187458992004395, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8536803126335144, + "num_tokens": 210900365.0, + "step": 5527 + }, + { + "epoch": 0.7032184200483399, + "ewc_loss": 0.04694471135735512, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001862439967226237, + "grad_norm": 5.598055362701416, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8535293340682983, + "num_tokens": 210941118.0, + "step": 5528 + }, + { + "epoch": 0.7033456303269304, + "ewc_loss": 0.04676346853375435, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018565227219369262, + "grad_norm": 5.530432224273682, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8668323755264282, + "num_tokens": 210979865.0, + "step": 5529 + }, + { + "epoch": 0.703472840605521, + "ewc_loss": 0.046811170876026154, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018612931307870895, + "grad_norm": 5.595401763916016, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.858873724937439, + "num_tokens": 211016385.0, + "step": 5530 + }, + { + "epoch": 0.7036000508841115, + "ewc_loss": 0.04682675749063492, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001862851349869743, + "grad_norm": 5.596158981323242, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8465851545333862, + "num_tokens": 211059071.0, + "step": 5531 + }, + { + "epoch": 0.7037272611627019, + "ewc_loss": 0.04677898436784744, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001858074392657727, + "grad_norm": 5.588659763336182, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8617374300956726, + "num_tokens": 211094189.0, + "step": 5532 + }, + { + "epoch": 0.7038544714412924, + "ewc_loss": 0.04679195582866669, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018593711138237268, + "grad_norm": 5.58392333984375, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8736710548400879, + "num_tokens": 211130524.0, + "step": 5533 + }, + { + "epoch": 0.703981681719883, + "ewc_loss": 0.04678448289632797, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.000185862387297675, + "grad_norm": 5.593195915222168, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8524352312088013, + "num_tokens": 211172335.0, + "step": 5534 + }, + { + "epoch": 0.7041088919984735, + "ewc_loss": 0.04691444709897041, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018594134598970413, + "grad_norm": 5.704329013824463, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8607048988342285, + "num_tokens": 211205406.0, + "step": 5535 + }, + { + "epoch": 0.704236102277064, + "ewc_loss": 0.04676804691553116, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018569803796708584, + "grad_norm": 5.523881435394287, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8543017506599426, + "num_tokens": 211246443.0, + "step": 5536 + }, + { + "epoch": 0.7043633125556545, + "ewc_loss": 0.046783171594142914, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018584927602205426, + "grad_norm": 5.593142032623291, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8737541437149048, + "num_tokens": 211286707.0, + "step": 5537 + }, + { + "epoch": 0.704490522834245, + "ewc_loss": 0.04666970670223236, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.00018593535060063004, + "grad_norm": 5.565921783447266, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8696821928024292, + "num_tokens": 211319448.0, + "step": 5538 + }, + { + "epoch": 0.7046177331128355, + "ewc_loss": 0.04666149616241455, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.0001858532486949116, + "grad_norm": 5.562626838684082, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8541039824485779, + "num_tokens": 211363747.0, + "step": 5539 + }, + { + "epoch": 0.704744943391426, + "ewc_loss": 0.0467156246304512, + "ewc_loss_diag": 2.8133392333984375e-05, + "ewc_loss_parallel": 0.0001863945071818307, + "grad_norm": 5.568972587585449, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8645669221878052, + "num_tokens": 211408080.0, + "step": 5540 + }, + { + "epoch": 0.7048721536700165, + "ewc_loss": 0.046819426119327545, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018621185154188424, + "grad_norm": 5.571007251739502, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8482540845870972, + "num_tokens": 211445656.0, + "step": 5541 + }, + { + "epoch": 0.7049993639486071, + "ewc_loss": 0.04673948511481285, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018541242752689868, + "grad_norm": 5.507838249206543, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8674793243408203, + "num_tokens": 211484342.0, + "step": 5542 + }, + { + "epoch": 0.7051265742271976, + "ewc_loss": 0.04687485471367836, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001867661194410175, + "grad_norm": 5.620431900024414, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8534190654754639, + "num_tokens": 211518535.0, + "step": 5543 + }, + { + "epoch": 0.705253784505788, + "ewc_loss": 0.046813055872917175, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018614812870509923, + "grad_norm": 5.557149887084961, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8532183170318604, + "num_tokens": 211557236.0, + "step": 5544 + }, + { + "epoch": 0.7053809947843785, + "ewc_loss": 0.046887461096048355, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018689218268264085, + "grad_norm": 5.548967361450195, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8704749941825867, + "num_tokens": 211599216.0, + "step": 5545 + }, + { + "epoch": 0.7055082050629691, + "ewc_loss": 0.04689110070466995, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018692860612645745, + "grad_norm": 5.597220420837402, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.84162437915802, + "num_tokens": 211638930.0, + "step": 5546 + }, + { + "epoch": 0.7056354153415596, + "ewc_loss": 0.04680955410003662, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018611310224514455, + "grad_norm": 5.53676700592041, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8520811796188354, + "num_tokens": 211680765.0, + "step": 5547 + }, + { + "epoch": 0.7057626256201501, + "ewc_loss": 0.046914227306842804, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018715986516326666, + "grad_norm": 5.576366901397705, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8731021881103516, + "num_tokens": 211719037.0, + "step": 5548 + }, + { + "epoch": 0.7058898358987407, + "ewc_loss": 0.047102272510528564, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018659890338312835, + "grad_norm": 5.615207672119141, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8534972667694092, + "num_tokens": 211759655.0, + "step": 5549 + }, + { + "epoch": 0.7060170461773311, + "ewc_loss": 0.04685720056295395, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018658960470929742, + "grad_norm": 5.549500942230225, + "learning_rate": 1e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8409116864204407, + "num_tokens": 211800405.0, + "step": 5550 + }, + { + "epoch": 0.7061442564559216, + "ewc_loss": 0.04692413657903671, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018725896370597184, + "grad_norm": 5.571094989776611, + "learning_rate": 1e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.841334342956543, + "num_tokens": 211846057.0, + "step": 5551 + }, + { + "epoch": 0.7062714667345121, + "ewc_loss": 0.04717126488685608, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018728879513219, + "grad_norm": 5.580992221832275, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.856292724609375, + "num_tokens": 211885944.0, + "step": 5552 + }, + { + "epoch": 0.7063986770131027, + "ewc_loss": 0.0469633974134922, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018765155982691795, + "grad_norm": 5.554159164428711, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8632053136825562, + "num_tokens": 211928727.0, + "step": 5553 + }, + { + "epoch": 0.7065258872916932, + "ewc_loss": 0.04699352756142616, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018795285723172128, + "grad_norm": 5.617403030395508, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8488907814025879, + "num_tokens": 211967593.0, + "step": 5554 + }, + { + "epoch": 0.7066530975702837, + "ewc_loss": 0.04694793373346329, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018749690207187086, + "grad_norm": 5.631425380706787, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8532246947288513, + "num_tokens": 212001023.0, + "step": 5555 + }, + { + "epoch": 0.7067803078488741, + "ewc_loss": 0.0469878613948822, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018789620662573725, + "grad_norm": 5.581033229827881, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8498615026473999, + "num_tokens": 212045611.0, + "step": 5556 + }, + { + "epoch": 0.7069075181274647, + "ewc_loss": 0.04697263985872269, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001877439790405333, + "grad_norm": 5.649670124053955, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8543685674667358, + "num_tokens": 212087167.0, + "step": 5557 + }, + { + "epoch": 0.7070347284060552, + "ewc_loss": 0.046973586082458496, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018775342323351651, + "grad_norm": 5.550340175628662, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8599671721458435, + "num_tokens": 212124254.0, + "step": 5558 + }, + { + "epoch": 0.7071619386846457, + "ewc_loss": 0.04692239314317703, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018724153051152825, + "grad_norm": 5.619263648986816, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8528804779052734, + "num_tokens": 212162706.0, + "step": 5559 + }, + { + "epoch": 0.7072891489632362, + "ewc_loss": 0.04701581224799156, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001881757052615285, + "grad_norm": 5.583176136016846, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8582958579063416, + "num_tokens": 212197359.0, + "step": 5560 + }, + { + "epoch": 0.7074163592418268, + "ewc_loss": 0.046928323805332184, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001873008004622534, + "grad_norm": 5.58705472946167, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8655349016189575, + "num_tokens": 212237300.0, + "step": 5561 + }, + { + "epoch": 0.7075435695204173, + "ewc_loss": 0.04699954763054848, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018801305850502104, + "grad_norm": 5.512441158294678, + "learning_rate": 1e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8392586708068848, + "num_tokens": 212283472.0, + "step": 5562 + }, + { + "epoch": 0.7076707797990077, + "ewc_loss": 0.0469767227768898, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001877848117146641, + "grad_norm": 5.617498874664307, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8533006906509399, + "num_tokens": 212321433.0, + "step": 5563 + }, + { + "epoch": 0.7077979900775982, + "ewc_loss": 0.04700176790356636, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018803526472765952, + "grad_norm": 5.544731140136719, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8557424545288086, + "num_tokens": 212362234.0, + "step": 5564 + }, + { + "epoch": 0.7079252003561888, + "ewc_loss": 0.04727061837911606, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018828237080015242, + "grad_norm": 5.685876846313477, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8487385511398315, + "num_tokens": 212399259.0, + "step": 5565 + }, + { + "epoch": 0.7080524106347793, + "ewc_loss": 0.047060348093509674, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018862106662709266, + "grad_norm": 5.588576793670654, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8586232662200928, + "num_tokens": 212438489.0, + "step": 5566 + }, + { + "epoch": 0.7081796209133698, + "ewc_loss": 0.04705535247921944, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018857110990211368, + "grad_norm": 5.686769485473633, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8504815101623535, + "num_tokens": 212482283.0, + "step": 5567 + }, + { + "epoch": 0.7083068311919604, + "ewc_loss": 0.04698598384857178, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001878774055512622, + "grad_norm": 5.571094512939453, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8533968925476074, + "num_tokens": 212522749.0, + "step": 5568 + }, + { + "epoch": 0.7084340414705508, + "ewc_loss": 0.04721762239933014, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018775240459945053, + "grad_norm": 5.635420322418213, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8497289419174194, + "num_tokens": 212559937.0, + "step": 5569 + }, + { + "epoch": 0.7085612517491413, + "ewc_loss": 0.04722503945231438, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018782656115945429, + "grad_norm": 14.318117141723633, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8632140159606934, + "num_tokens": 212595578.0, + "step": 5570 + }, + { + "epoch": 0.7086884620277318, + "ewc_loss": 0.05719277635216713, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0002875039353966713, + "grad_norm": 7.233639240264893, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8689894080162048, + "num_tokens": 212634085.0, + "step": 5571 + }, + { + "epoch": 0.7088156723063224, + "ewc_loss": 0.045665886253118515, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00017223504255525768, + "grad_norm": 4.867713451385498, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8468638062477112, + "num_tokens": 212673557.0, + "step": 5572 + }, + { + "epoch": 0.7089428825849129, + "ewc_loss": 0.050220027565956116, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00021777643996756524, + "grad_norm": 6.561371803283691, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8683397769927979, + "num_tokens": 212711142.0, + "step": 5573 + }, + { + "epoch": 0.7090700928635034, + "ewc_loss": 0.05031996965408325, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0002187758800573647, + "grad_norm": 5.785370349884033, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8584779500961304, + "num_tokens": 212745570.0, + "step": 5574 + }, + { + "epoch": 0.7091973031420938, + "ewc_loss": 0.04831275716423988, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019870373944286257, + "grad_norm": 5.9836812019348145, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8610590696334839, + "num_tokens": 212783048.0, + "step": 5575 + }, + { + "epoch": 0.7093245134206844, + "ewc_loss": 0.04918095842003822, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0002073857467621565, + "grad_norm": 5.85807991027832, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.875378429889679, + "num_tokens": 212821231.0, + "step": 5576 + }, + { + "epoch": 0.7094517236992749, + "ewc_loss": 0.048230525106191635, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001978814252652228, + "grad_norm": 5.735486030578613, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8551450967788696, + "num_tokens": 212861715.0, + "step": 5577 + }, + { + "epoch": 0.7095789339778654, + "ewc_loss": 0.04833812639117241, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019895743753295392, + "grad_norm": 5.879476547241211, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8483030796051025, + "num_tokens": 212900655.0, + "step": 5578 + }, + { + "epoch": 0.709706144256456, + "ewc_loss": 0.047842592000961304, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00019644350686576217, + "grad_norm": 5.73299503326416, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8502507209777832, + "num_tokens": 212939308.0, + "step": 5579 + }, + { + "epoch": 0.7098333545350465, + "ewc_loss": 0.047650210559368134, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00019451968546491116, + "grad_norm": 5.736408710479736, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8627362847328186, + "num_tokens": 212981174.0, + "step": 5580 + }, + { + "epoch": 0.7099605648136369, + "ewc_loss": 0.04753972589969635, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00019341486040502787, + "grad_norm": 5.724446773529053, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8676354885101318, + "num_tokens": 213016401.0, + "step": 5581 + }, + { + "epoch": 0.7100877750922274, + "ewc_loss": 0.04741707071661949, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00019218829402234405, + "grad_norm": 5.687509536743164, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8457499742507935, + "num_tokens": 213058480.0, + "step": 5582 + }, + { + "epoch": 0.710214985370818, + "ewc_loss": 0.04733741283416748, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00019139170763082802, + "grad_norm": 5.685940265655518, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8701013326644897, + "num_tokens": 213093527.0, + "step": 5583 + }, + { + "epoch": 0.7103421956494085, + "ewc_loss": 0.047279857099056244, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00019081615027971566, + "grad_norm": 5.672481536865234, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8565296530723572, + "num_tokens": 213126778.0, + "step": 5584 + }, + { + "epoch": 0.710469405927999, + "ewc_loss": 0.047221437096595764, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00019023196364287287, + "grad_norm": 5.608947277069092, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8614184260368347, + "num_tokens": 213170863.0, + "step": 5585 + }, + { + "epoch": 0.7105966162065895, + "ewc_loss": 0.0471767894923687, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001897854672279209, + "grad_norm": 5.668877601623535, + "learning_rate": 1e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.834345281124115, + "num_tokens": 213211336.0, + "step": 5586 + }, + { + "epoch": 0.71072382648518, + "ewc_loss": 0.04712700843811035, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018928767531178892, + "grad_norm": 5.605459213256836, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8649290800094604, + "num_tokens": 213245262.0, + "step": 5587 + }, + { + "epoch": 0.7108510367637705, + "ewc_loss": 0.04713909327983856, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018940851441584527, + "grad_norm": 5.644550800323486, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8490111827850342, + "num_tokens": 213282150.0, + "step": 5588 + }, + { + "epoch": 0.710978247042361, + "ewc_loss": 0.04714066535234451, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018942424503620714, + "grad_norm": 5.5947136878967285, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8631303906440735, + "num_tokens": 213325573.0, + "step": 5589 + }, + { + "epoch": 0.7111054573209515, + "ewc_loss": 0.0471126064658165, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001891436259029433, + "grad_norm": 5.595017433166504, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.867375373840332, + "num_tokens": 213363738.0, + "step": 5590 + }, + { + "epoch": 0.7112326675995421, + "ewc_loss": 0.04710439220070839, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001890614948933944, + "grad_norm": 5.565549850463867, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8637222051620483, + "num_tokens": 213404796.0, + "step": 5591 + }, + { + "epoch": 0.7113598778781326, + "ewc_loss": 0.0471176952123642, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001891945139504969, + "grad_norm": 5.646533966064453, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.850233793258667, + "num_tokens": 213436711.0, + "step": 5592 + }, + { + "epoch": 0.711487088156723, + "ewc_loss": 0.04708282649517059, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018884586461354047, + "grad_norm": 5.579102039337158, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8619123101234436, + "num_tokens": 213473458.0, + "step": 5593 + }, + { + "epoch": 0.7116142984353135, + "ewc_loss": 0.0471142940223217, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018916052067652345, + "grad_norm": 5.617921352386475, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8683210611343384, + "num_tokens": 213507355.0, + "step": 5594 + }, + { + "epoch": 0.7117415087139041, + "ewc_loss": 0.04716520011425018, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018966957577504218, + "grad_norm": 5.72054386138916, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8406336307525635, + "num_tokens": 213543771.0, + "step": 5595 + }, + { + "epoch": 0.7118687189924946, + "ewc_loss": 0.04709725081920624, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018899010319728404, + "grad_norm": 5.546991348266602, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8548832535743713, + "num_tokens": 213578317.0, + "step": 5596 + }, + { + "epoch": 0.7119959292710851, + "ewc_loss": 0.04714691638946533, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001894867600640282, + "grad_norm": 5.640585422515869, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8550890684127808, + "num_tokens": 213615795.0, + "step": 5597 + }, + { + "epoch": 0.7121231395496757, + "ewc_loss": 0.047130782157182693, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018932540842797607, + "grad_norm": 5.549459934234619, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8681416511535645, + "num_tokens": 213655452.0, + "step": 5598 + }, + { + "epoch": 0.7122503498282661, + "ewc_loss": 0.04715227335691452, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018954029656015337, + "grad_norm": 5.730783462524414, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8486610651016235, + "num_tokens": 213688079.0, + "step": 5599 + }, + { + "epoch": 0.7123775601068566, + "ewc_loss": 0.04712796211242676, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018929717771243304, + "grad_norm": 5.515308380126953, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8752843737602234, + "num_tokens": 213725310.0, + "step": 5600 + }, + { + "epoch": 0.7125047703854471, + "ewc_loss": 0.0471203476190567, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018922107119578868, + "grad_norm": 5.616515159606934, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.852331817150116, + "num_tokens": 213757950.0, + "step": 5601 + }, + { + "epoch": 0.7126319806640377, + "ewc_loss": 0.047182198613882065, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018983955669682473, + "grad_norm": 5.5850090980529785, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.846667468547821, + "num_tokens": 213794146.0, + "step": 5602 + }, + { + "epoch": 0.7127591909426282, + "ewc_loss": 0.04712792485952377, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.0001892968430183828, + "grad_norm": 5.552728176116943, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8627417683601379, + "num_tokens": 213834922.0, + "step": 5603 + }, + { + "epoch": 0.7128864012212187, + "ewc_loss": 0.047185130417346954, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018986890790984035, + "grad_norm": 6.083570957183838, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8769891262054443, + "num_tokens": 213879651.0, + "step": 5604 + }, + { + "epoch": 0.7130136114998091, + "ewc_loss": 0.04736388474702835, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019043571955990046, + "grad_norm": 5.492380619049072, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8579041957855225, + "num_tokens": 213921833.0, + "step": 5605 + }, + { + "epoch": 0.7131408217783997, + "ewc_loss": 0.04721439629793167, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018894085951615125, + "grad_norm": 5.57628870010376, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8639997243881226, + "num_tokens": 213962909.0, + "step": 5606 + }, + { + "epoch": 0.7132680320569902, + "ewc_loss": 0.04725009575486183, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018929783254861832, + "grad_norm": 5.516468048095703, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8608062267303467, + "num_tokens": 214002030.0, + "step": 5607 + }, + { + "epoch": 0.7133952423355807, + "ewc_loss": 0.04733513668179512, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019014824647456408, + "grad_norm": 5.658631324768066, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.852073073387146, + "num_tokens": 214037094.0, + "step": 5608 + }, + { + "epoch": 0.7135224526141712, + "ewc_loss": 0.04732533544301987, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001900502247735858, + "grad_norm": 5.593255043029785, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8628684282302856, + "num_tokens": 214076649.0, + "step": 5609 + }, + { + "epoch": 0.7136496628927618, + "ewc_loss": 0.047238413244485855, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.000189181009773165, + "grad_norm": 5.6187238693237305, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8544557094573975, + "num_tokens": 214114194.0, + "step": 5610 + }, + { + "epoch": 0.7137768731713523, + "ewc_loss": 0.04727579653263092, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018955481937155128, + "grad_norm": 5.6338276863098145, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8608871698379517, + "num_tokens": 214149652.0, + "step": 5611 + }, + { + "epoch": 0.7139040834499427, + "ewc_loss": 0.04725304991006851, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018932738748844713, + "grad_norm": 5.592535495758057, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8679265379905701, + "num_tokens": 214189068.0, + "step": 5612 + }, + { + "epoch": 0.7140312937285332, + "ewc_loss": 0.0471622496843338, + "ewc_loss_diag": 2.8252601623535156e-05, + "ewc_loss_parallel": 0.00018964004993904382, + "grad_norm": 5.674901008605957, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8470247983932495, + "num_tokens": 214220251.0, + "step": 5613 + }, + { + "epoch": 0.7141585040071238, + "ewc_loss": 0.04738585278391838, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001894346933113411, + "grad_norm": 5.651018142700195, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8562160730361938, + "num_tokens": 214256705.0, + "step": 5614 + }, + { + "epoch": 0.7142857142857143, + "ewc_loss": 0.04734661057591438, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018904228636529297, + "grad_norm": 5.619790077209473, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8714491128921509, + "num_tokens": 214292563.0, + "step": 5615 + }, + { + "epoch": 0.7144129245643048, + "ewc_loss": 0.047356363385915756, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018913981330115348, + "grad_norm": 5.5968017578125, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8522819876670837, + "num_tokens": 214333843.0, + "step": 5616 + }, + { + "epoch": 0.7145401348428954, + "ewc_loss": 0.04730777442455292, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018865391029976308, + "grad_norm": 5.582503318786621, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8694281578063965, + "num_tokens": 214372272.0, + "step": 5617 + }, + { + "epoch": 0.7146673451214858, + "ewc_loss": 0.04734581708908081, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018903434101957828, + "grad_norm": 5.5286078453063965, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8647593855857849, + "num_tokens": 214416364.0, + "step": 5618 + }, + { + "epoch": 0.7147945554000763, + "ewc_loss": 0.04737761989235878, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018935237312689424, + "grad_norm": 5.670059680938721, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8565946817398071, + "num_tokens": 214453102.0, + "step": 5619 + }, + { + "epoch": 0.7149217656786668, + "ewc_loss": 0.047246020287275314, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001892570871859789, + "grad_norm": 5.610619068145752, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8627396821975708, + "num_tokens": 214492374.0, + "step": 5620 + }, + { + "epoch": 0.7150489759572574, + "ewc_loss": 0.047182876616716385, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001886256504803896, + "grad_norm": 5.632662773132324, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8587526082992554, + "num_tokens": 214530918.0, + "step": 5621 + }, + { + "epoch": 0.7151761862358479, + "ewc_loss": 0.04722131788730621, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001890100393211469, + "grad_norm": 5.632604598999023, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8742437362670898, + "num_tokens": 214570227.0, + "step": 5622 + }, + { + "epoch": 0.7153033965144384, + "ewc_loss": 0.04716682434082031, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018846511375159025, + "grad_norm": 5.5707106590271, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8615609407424927, + "num_tokens": 214609519.0, + "step": 5623 + }, + { + "epoch": 0.7154306067930288, + "ewc_loss": 0.04720863327383995, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018888320482801646, + "grad_norm": 5.659508228302002, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8567105531692505, + "num_tokens": 214643564.0, + "step": 5624 + }, + { + "epoch": 0.7155578170716194, + "ewc_loss": 0.0472421795129776, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018921865557786077, + "grad_norm": 5.654647350311279, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.866889476776123, + "num_tokens": 214677632.0, + "step": 5625 + }, + { + "epoch": 0.7156850273502099, + "ewc_loss": 0.04718220606446266, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001886189274955541, + "grad_norm": 5.597856521606445, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8607248067855835, + "num_tokens": 214710363.0, + "step": 5626 + }, + { + "epoch": 0.7158122376288004, + "ewc_loss": 0.0472804456949234, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001896013563964516, + "grad_norm": 5.692677021026611, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8667305707931519, + "num_tokens": 214746982.0, + "step": 5627 + }, + { + "epoch": 0.715939447907391, + "ewc_loss": 0.04726079851388931, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018940487643703818, + "grad_norm": 5.601443290710449, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8496500849723816, + "num_tokens": 214786447.0, + "step": 5628 + }, + { + "epoch": 0.7160666581859815, + "ewc_loss": 0.04729683697223663, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018976524006575346, + "grad_norm": 5.652091026306152, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8665709495544434, + "num_tokens": 214824627.0, + "step": 5629 + }, + { + "epoch": 0.7161938684645719, + "ewc_loss": 0.047319408506155014, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018999095482286066, + "grad_norm": 5.667090892791748, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8408156633377075, + "num_tokens": 214863051.0, + "step": 5630 + }, + { + "epoch": 0.7163210787431624, + "ewc_loss": 0.04730096831917763, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001898065529530868, + "grad_norm": 5.698697090148926, + "learning_rate": 1e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8459620475769043, + "num_tokens": 214900524.0, + "step": 5631 + }, + { + "epoch": 0.716448289021753, + "ewc_loss": 0.047238945960998535, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018918633577413857, + "grad_norm": 5.598255157470703, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.864688515663147, + "num_tokens": 214943949.0, + "step": 5632 + }, + { + "epoch": 0.7165754993003435, + "ewc_loss": 0.04730268567800522, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001898237387649715, + "grad_norm": 5.674777030944824, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8630275726318359, + "num_tokens": 214979126.0, + "step": 5633 + }, + { + "epoch": 0.716702709578934, + "ewc_loss": 0.04722592607140541, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018905613978859037, + "grad_norm": 5.600131511688232, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8590871691703796, + "num_tokens": 215017740.0, + "step": 5634 + }, + { + "epoch": 0.7168299198575245, + "ewc_loss": 0.047258298844099045, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018937986169476062, + "grad_norm": 5.603230953216553, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8665704727172852, + "num_tokens": 215054472.0, + "step": 5635 + }, + { + "epoch": 0.716957130136115, + "ewc_loss": 0.047243811190128326, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018923499737866223, + "grad_norm": 5.631259441375732, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8620857000350952, + "num_tokens": 215086894.0, + "step": 5636 + }, + { + "epoch": 0.7170843404147055, + "ewc_loss": 0.047373805195093155, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018931421800516546, + "grad_norm": 5.587013244628906, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8597087860107422, + "num_tokens": 215128157.0, + "step": 5637 + }, + { + "epoch": 0.717211550693296, + "ewc_loss": 0.047280557453632355, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001896024332381785, + "grad_norm": 5.647223472595215, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8663710355758667, + "num_tokens": 215161008.0, + "step": 5638 + }, + { + "epoch": 0.7173387609718865, + "ewc_loss": 0.047318167984485626, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018997857114300132, + "grad_norm": 5.658824443817139, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8425847291946411, + "num_tokens": 215197157.0, + "step": 5639 + }, + { + "epoch": 0.7174659712504771, + "ewc_loss": 0.04726409912109375, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018943785107694566, + "grad_norm": 5.570572853088379, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8666290640830994, + "num_tokens": 215240270.0, + "step": 5640 + }, + { + "epoch": 0.7175931815290676, + "ewc_loss": 0.0472361296415329, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018915816326625645, + "grad_norm": 5.659282684326172, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8662974834442139, + "num_tokens": 215279828.0, + "step": 5641 + }, + { + "epoch": 0.717720391807658, + "ewc_loss": 0.04732639342546463, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001900607894640416, + "grad_norm": 5.662044048309326, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8541577458381653, + "num_tokens": 215315271.0, + "step": 5642 + }, + { + "epoch": 0.7178476020862485, + "ewc_loss": 0.04720047861337662, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018880168499890715, + "grad_norm": 5.605023384094238, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8605734705924988, + "num_tokens": 215352621.0, + "step": 5643 + }, + { + "epoch": 0.7179748123648391, + "ewc_loss": 0.04724147543311119, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018921162700280547, + "grad_norm": 5.624301910400391, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8614239692687988, + "num_tokens": 215390231.0, + "step": 5644 + }, + { + "epoch": 0.7181020226434296, + "ewc_loss": 0.047346822917461395, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018904439639300108, + "grad_norm": 5.5661396980285645, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8726233243942261, + "num_tokens": 215426812.0, + "step": 5645 + }, + { + "epoch": 0.7182292329220201, + "ewc_loss": 0.047262612730264664, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001894229935714975, + "grad_norm": 5.647113800048828, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.847987949848175, + "num_tokens": 215467551.0, + "step": 5646 + }, + { + "epoch": 0.7183564432006107, + "ewc_loss": 0.04725552722811699, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018935214029625058, + "grad_norm": 5.5583930015563965, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8669559955596924, + "num_tokens": 215508559.0, + "step": 5647 + }, + { + "epoch": 0.7184836534792011, + "ewc_loss": 0.04725564271211624, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018935331900138408, + "grad_norm": 5.660579204559326, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8531740307807922, + "num_tokens": 215548709.0, + "step": 5648 + }, + { + "epoch": 0.7186108637577916, + "ewc_loss": 0.04742661118507385, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018984229245688766, + "grad_norm": 5.581604957580566, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8595336675643921, + "num_tokens": 215593844.0, + "step": 5649 + }, + { + "epoch": 0.7187380740363821, + "ewc_loss": 0.047246698290109634, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018926385382656008, + "grad_norm": 5.603652477264404, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.864833414554596, + "num_tokens": 215633853.0, + "step": 5650 + }, + { + "epoch": 0.7188652843149727, + "ewc_loss": 0.0474092923104763, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018966909556183964, + "grad_norm": 5.747854232788086, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8579609394073486, + "num_tokens": 215667272.0, + "step": 5651 + }, + { + "epoch": 0.7189924945935632, + "ewc_loss": 0.04728902876377106, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018968716904055327, + "grad_norm": 5.6626410484313965, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8418423533439636, + "num_tokens": 215705889.0, + "step": 5652 + }, + { + "epoch": 0.7191197048721537, + "ewc_loss": 0.047296978533267975, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018854596419259906, + "grad_norm": 5.618457317352295, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8689348697662354, + "num_tokens": 215749143.0, + "step": 5653 + }, + { + "epoch": 0.7192469151507441, + "ewc_loss": 0.047378331422805786, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018935950356535614, + "grad_norm": 5.6598663330078125, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8543506264686584, + "num_tokens": 215787583.0, + "step": 5654 + }, + { + "epoch": 0.7193741254293347, + "ewc_loss": 0.047221239656209946, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001890092680696398, + "grad_norm": 5.661603927612305, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8590565919876099, + "num_tokens": 215825552.0, + "step": 5655 + }, + { + "epoch": 0.7195013357079252, + "ewc_loss": 0.04725554957985878, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018935237312689424, + "grad_norm": 5.712777614593506, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8684482574462891, + "num_tokens": 215858310.0, + "step": 5656 + }, + { + "epoch": 0.7196285459865157, + "ewc_loss": 0.04721379280090332, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018893479136750102, + "grad_norm": 5.6474456787109375, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8477133512496948, + "num_tokens": 215898403.0, + "step": 5657 + }, + { + "epoch": 0.7197557562651062, + "ewc_loss": 0.04720661789178848, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018886303587350994, + "grad_norm": 5.662144184112549, + "learning_rate": 1e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8368273973464966, + "num_tokens": 215935930.0, + "step": 5658 + }, + { + "epoch": 0.7198829665436968, + "ewc_loss": 0.04737953096628189, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018937146523967385, + "grad_norm": 5.713860034942627, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.856130838394165, + "num_tokens": 215971509.0, + "step": 5659 + }, + { + "epoch": 0.7200101768222873, + "ewc_loss": 0.0472571887075901, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018936875858344138, + "grad_norm": 5.660373210906982, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8656442761421204, + "num_tokens": 216008696.0, + "step": 5660 + }, + { + "epoch": 0.7201373871008777, + "ewc_loss": 0.04720291495323181, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018882601580116898, + "grad_norm": 5.637457370758057, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8594702482223511, + "num_tokens": 216045861.0, + "step": 5661 + }, + { + "epoch": 0.7202645973794682, + "ewc_loss": 0.047402769327163696, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001896038738777861, + "grad_norm": 5.684584140777588, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8686065673828125, + "num_tokens": 216081307.0, + "step": 5662 + }, + { + "epoch": 0.7203918076580588, + "ewc_loss": 0.04721030592918396, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018889991042669863, + "grad_norm": 5.6632561683654785, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8687803745269775, + "num_tokens": 216118506.0, + "step": 5663 + }, + { + "epoch": 0.7205190179366493, + "ewc_loss": 0.047331325709819794, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001888894330477342, + "grad_norm": 5.699686050415039, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8529438972473145, + "num_tokens": 216159000.0, + "step": 5664 + }, + { + "epoch": 0.7206462282152398, + "ewc_loss": 0.047346364706754684, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018903982709161937, + "grad_norm": 5.624113082885742, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8519062399864197, + "num_tokens": 216204712.0, + "step": 5665 + }, + { + "epoch": 0.7207734384938304, + "ewc_loss": 0.04714560508728027, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018825293227564543, + "grad_norm": 5.70986270904541, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8515963554382324, + "num_tokens": 216235001.0, + "step": 5666 + }, + { + "epoch": 0.7209006487724208, + "ewc_loss": 0.04719480127096176, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018874490342568606, + "grad_norm": 5.632843971252441, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.852450966835022, + "num_tokens": 216271383.0, + "step": 5667 + }, + { + "epoch": 0.7210278590510113, + "ewc_loss": 0.04720976576209068, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018889454076997936, + "grad_norm": 5.692481994628906, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8599733114242554, + "num_tokens": 216307923.0, + "step": 5668 + }, + { + "epoch": 0.7211550693296018, + "ewc_loss": 0.04732626676559448, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001888388505904004, + "grad_norm": 5.567021369934082, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8780300617218018, + "num_tokens": 216345147.0, + "step": 5669 + }, + { + "epoch": 0.7212822796081924, + "ewc_loss": 0.04734481871128082, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018902437295764685, + "grad_norm": 5.658054828643799, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8518394231796265, + "num_tokens": 216386991.0, + "step": 5670 + }, + { + "epoch": 0.7214094898867829, + "ewc_loss": 0.04736987501382828, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001892748987302184, + "grad_norm": 5.6757330894470215, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8621635437011719, + "num_tokens": 216425534.0, + "step": 5671 + }, + { + "epoch": 0.7215367001653734, + "ewc_loss": 0.047256529331207275, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018936219566967338, + "grad_norm": 5.612478256225586, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8596001863479614, + "num_tokens": 216466686.0, + "step": 5672 + }, + { + "epoch": 0.7216639104439638, + "ewc_loss": 0.04736985266208649, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018927468045148998, + "grad_norm": 5.6686577796936035, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8712747097015381, + "num_tokens": 216505245.0, + "step": 5673 + }, + { + "epoch": 0.7217911207225544, + "ewc_loss": 0.04738302528858185, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001894064189400524, + "grad_norm": 5.636201858520508, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.855632483959198, + "num_tokens": 216538024.0, + "step": 5674 + }, + { + "epoch": 0.7219183310011449, + "ewc_loss": 0.047381043434143066, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018938658467959613, + "grad_norm": 5.658790588378906, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8579164743423462, + "num_tokens": 216573722.0, + "step": 5675 + }, + { + "epoch": 0.7220455412797354, + "ewc_loss": 0.047233209013938904, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018912898667622358, + "grad_norm": 5.586302280426025, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8643949031829834, + "num_tokens": 216610362.0, + "step": 5676 + }, + { + "epoch": 0.7221727515583259, + "ewc_loss": 0.047260433435440063, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018940122390631586, + "grad_norm": 5.64892578125, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8515986204147339, + "num_tokens": 216648732.0, + "step": 5677 + }, + { + "epoch": 0.7222999618369165, + "ewc_loss": 0.04738245904445648, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018940074369311333, + "grad_norm": 5.672479629516602, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8681808710098267, + "num_tokens": 216680324.0, + "step": 5678 + }, + { + "epoch": 0.7224271721155069, + "ewc_loss": 0.04736768454313278, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018925302720163018, + "grad_norm": 5.632071495056152, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8622565865516663, + "num_tokens": 216712773.0, + "step": 5679 + }, + { + "epoch": 0.7225543823940974, + "ewc_loss": 0.047415606677532196, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018973226542584598, + "grad_norm": 5.735875606536865, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8519275784492493, + "num_tokens": 216752538.0, + "step": 5680 + }, + { + "epoch": 0.722681592672688, + "ewc_loss": 0.047438040375709534, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018995656864717603, + "grad_norm": 5.609714031219482, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8621882200241089, + "num_tokens": 216791795.0, + "step": 5681 + }, + { + "epoch": 0.7228088029512785, + "ewc_loss": 0.047336380928754807, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018893998640123755, + "grad_norm": 5.6060614585876465, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8564903736114502, + "num_tokens": 216831729.0, + "step": 5682 + }, + { + "epoch": 0.722936013229869, + "ewc_loss": 0.04744799807667732, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019005614740308374, + "grad_norm": 5.653533935546875, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8596757650375366, + "num_tokens": 216869494.0, + "step": 5683 + }, + { + "epoch": 0.7230632235084595, + "ewc_loss": 0.04741859436035156, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001897621259558946, + "grad_norm": 5.671147346496582, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8402172327041626, + "num_tokens": 216906672.0, + "step": 5684 + }, + { + "epoch": 0.72319043378705, + "ewc_loss": 0.047431930899620056, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018989549425896257, + "grad_norm": 5.650267601013184, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8661372661590576, + "num_tokens": 216941587.0, + "step": 5685 + }, + { + "epoch": 0.7233176440656405, + "ewc_loss": 0.0474093034863472, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.000189669182873331, + "grad_norm": 5.628523349761963, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8527224063873291, + "num_tokens": 216981812.0, + "step": 5686 + }, + { + "epoch": 0.723444854344231, + "ewc_loss": 0.047455962747335434, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001901357900351286, + "grad_norm": 5.656848907470703, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8693245649337769, + "num_tokens": 217024137.0, + "step": 5687 + }, + { + "epoch": 0.7235720646228215, + "ewc_loss": 0.047481469810009, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019039087055716664, + "grad_norm": 5.628856182098389, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8716949820518494, + "num_tokens": 217063753.0, + "step": 5688 + }, + { + "epoch": 0.7236992749014121, + "ewc_loss": 0.04741200804710388, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018969627853948623, + "grad_norm": 5.6328630447387695, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8576294183731079, + "num_tokens": 217105611.0, + "step": 5689 + }, + { + "epoch": 0.7238264851800026, + "ewc_loss": 0.04743800684809685, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.000189956248505041, + "grad_norm": 5.651052474975586, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8395032286643982, + "num_tokens": 217145989.0, + "step": 5690 + }, + { + "epoch": 0.723953695458593, + "ewc_loss": 0.04743274301290512, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018990358512382954, + "grad_norm": 5.662303447723389, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8778618574142456, + "num_tokens": 217183726.0, + "step": 5691 + }, + { + "epoch": 0.7240809057371835, + "ewc_loss": 0.04743301868438721, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018990636453963816, + "grad_norm": 5.634890079498291, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8571352362632751, + "num_tokens": 217226644.0, + "step": 5692 + }, + { + "epoch": 0.7242081160157741, + "ewc_loss": 0.04726417362689972, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.000189438636880368, + "grad_norm": 5.675182342529297, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8446225523948669, + "num_tokens": 217263327.0, + "step": 5693 + }, + { + "epoch": 0.7243353262943646, + "ewc_loss": 0.04730742424726486, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018987110524903983, + "grad_norm": 5.647279739379883, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8644662499427795, + "num_tokens": 217300595.0, + "step": 5694 + }, + { + "epoch": 0.7244625365729551, + "ewc_loss": 0.04741846024990082, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018976075807586312, + "grad_norm": 5.647026062011719, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8466535806655884, + "num_tokens": 217336385.0, + "step": 5695 + }, + { + "epoch": 0.7245897468515456, + "ewc_loss": 0.04742088168859482, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018978500156663358, + "grad_norm": 5.6460394859313965, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8665270209312439, + "num_tokens": 217374309.0, + "step": 5696 + }, + { + "epoch": 0.7247169571301361, + "ewc_loss": 0.04739764705300331, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018955265113618225, + "grad_norm": 5.653144359588623, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8496580719947815, + "num_tokens": 217408386.0, + "step": 5697 + }, + { + "epoch": 0.7248441674087266, + "ewc_loss": 0.04742242395877838, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001898003974929452, + "grad_norm": 5.707441806793213, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8452617526054382, + "num_tokens": 217442855.0, + "step": 5698 + }, + { + "epoch": 0.7249713776873171, + "ewc_loss": 0.04732039198279381, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019000079191755503, + "grad_norm": 5.6403021812438965, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8645901083946228, + "num_tokens": 217476892.0, + "step": 5699 + }, + { + "epoch": 0.7250985879659076, + "ewc_loss": 0.04739219322800636, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001894980960059911, + "grad_norm": 5.647506237030029, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8583544492721558, + "num_tokens": 217513569.0, + "step": 5700 + }, + { + "epoch": 0.7252257982444982, + "ewc_loss": 0.047266144305467606, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018945832562167197, + "grad_norm": 5.710901260375977, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8540654182434082, + "num_tokens": 217545585.0, + "step": 5701 + }, + { + "epoch": 0.7253530085230887, + "ewc_loss": 0.04745592549443245, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019013542623724788, + "grad_norm": 5.673801422119141, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8621326684951782, + "num_tokens": 217583156.0, + "step": 5702 + }, + { + "epoch": 0.7254802188016791, + "ewc_loss": 0.04752718284726143, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00018962728790938854, + "grad_norm": 5.645833969116211, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8599395751953125, + "num_tokens": 217619630.0, + "step": 5703 + }, + { + "epoch": 0.7256074290802697, + "ewc_loss": 0.0474349781870842, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018992595141753554, + "grad_norm": 5.656457424163818, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.869094967842102, + "num_tokens": 217653616.0, + "step": 5704 + }, + { + "epoch": 0.7257346393588602, + "ewc_loss": 0.047315843403339386, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001899553171824664, + "grad_norm": 5.6454596519470215, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8540927171707153, + "num_tokens": 217692758.0, + "step": 5705 + }, + { + "epoch": 0.7258618496374507, + "ewc_loss": 0.04728888347744942, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00018968571384903044, + "grad_norm": 5.630218982696533, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8434308767318726, + "num_tokens": 217734175.0, + "step": 5706 + }, + { + "epoch": 0.7259890599160412, + "ewc_loss": 0.04750858247280121, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019066200184170157, + "grad_norm": 5.786139488220215, + "learning_rate": 1e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8371773958206177, + "num_tokens": 217773372.0, + "step": 5707 + }, + { + "epoch": 0.7261162701946318, + "ewc_loss": 0.04738686978816986, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018944486510008574, + "grad_norm": 5.620987892150879, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8681267499923706, + "num_tokens": 217809174.0, + "step": 5708 + }, + { + "epoch": 0.7262434804732223, + "ewc_loss": 0.047426775097846985, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018984392227139324, + "grad_norm": 5.686269283294678, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.852537989616394, + "num_tokens": 217845882.0, + "step": 5709 + }, + { + "epoch": 0.7263706907518127, + "ewc_loss": 0.04736291989684105, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018920536967925727, + "grad_norm": 5.633202075958252, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8545070886611938, + "num_tokens": 217885798.0, + "step": 5710 + }, + { + "epoch": 0.7264979010304032, + "ewc_loss": 0.04740652069449425, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001896413741633296, + "grad_norm": 5.669549465179443, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8557960391044617, + "num_tokens": 217924967.0, + "step": 5711 + }, + { + "epoch": 0.7266251113089938, + "ewc_loss": 0.047370340675115585, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018927958444692194, + "grad_norm": 5.622639179229736, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.870794415473938, + "num_tokens": 217967381.0, + "step": 5712 + }, + { + "epoch": 0.7267523215875843, + "ewc_loss": 0.0473746582865715, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018932273087557405, + "grad_norm": 5.641185760498047, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8650426268577576, + "num_tokens": 218001097.0, + "step": 5713 + }, + { + "epoch": 0.7268795318661748, + "ewc_loss": 0.04742313176393509, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018980749882757664, + "grad_norm": 5.641758441925049, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8607658743858337, + "num_tokens": 218040705.0, + "step": 5714 + }, + { + "epoch": 0.7270067421447653, + "ewc_loss": 0.04742434248328209, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018981960602104664, + "grad_norm": 5.673430919647217, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8587448596954346, + "num_tokens": 218077549.0, + "step": 5715 + }, + { + "epoch": 0.7271339524233558, + "ewc_loss": 0.04744773358106613, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019005349895451218, + "grad_norm": 5.671090602874756, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8589584827423096, + "num_tokens": 218116251.0, + "step": 5716 + }, + { + "epoch": 0.7272611627019463, + "ewc_loss": 0.04741328954696655, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018970906967297196, + "grad_norm": 5.666350364685059, + "learning_rate": 1e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.832051157951355, + "num_tokens": 218155798.0, + "step": 5717 + }, + { + "epoch": 0.7273883729805368, + "ewc_loss": 0.047429561614990234, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018987178918905556, + "grad_norm": 5.651294708251953, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8650605082511902, + "num_tokens": 218191277.0, + "step": 5718 + }, + { + "epoch": 0.7275155832591274, + "ewc_loss": 0.04739692807197571, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018954547704197466, + "grad_norm": 5.603384017944336, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8648597598075867, + "num_tokens": 218233165.0, + "step": 5719 + }, + { + "epoch": 0.7276427935377179, + "ewc_loss": 0.04736702889204025, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019046715169679374, + "grad_norm": 5.67061710357666, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8391427397727966, + "num_tokens": 218271263.0, + "step": 5720 + }, + { + "epoch": 0.7277700038163084, + "ewc_loss": 0.047457583248615265, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.000190152000868693, + "grad_norm": 5.683823585510254, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8607521653175354, + "num_tokens": 218309081.0, + "step": 5721 + }, + { + "epoch": 0.7278972140948988, + "ewc_loss": 0.0474725142121315, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019030131807085127, + "grad_norm": 5.615505695343018, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8538460731506348, + "num_tokens": 218348314.0, + "step": 5722 + }, + { + "epoch": 0.7280244243734894, + "ewc_loss": 0.04742041975259781, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001897803449537605, + "grad_norm": 5.587974548339844, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8633161783218384, + "num_tokens": 218387556.0, + "step": 5723 + }, + { + "epoch": 0.7281516346520799, + "ewc_loss": 0.04752853512763977, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019086150859948248, + "grad_norm": 5.690566062927246, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.852074146270752, + "num_tokens": 218423130.0, + "step": 5724 + }, + { + "epoch": 0.7282788449306704, + "ewc_loss": 0.04751010611653328, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019067723769694567, + "grad_norm": 5.63646936416626, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8408995866775513, + "num_tokens": 218464079.0, + "step": 5725 + }, + { + "epoch": 0.7284060552092609, + "ewc_loss": 0.04753836244344711, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019095982133876532, + "grad_norm": 5.705136299133301, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8666752576828003, + "num_tokens": 218501307.0, + "step": 5726 + }, + { + "epoch": 0.7285332654878515, + "ewc_loss": 0.047512076795101166, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001906969555420801, + "grad_norm": 5.5956621170043945, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8682529926300049, + "num_tokens": 218537710.0, + "step": 5727 + }, + { + "epoch": 0.7286604757664419, + "ewc_loss": 0.04750668257474899, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019064301159232855, + "grad_norm": 5.661556243896484, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8549227118492126, + "num_tokens": 218576026.0, + "step": 5728 + }, + { + "epoch": 0.7287876860450324, + "ewc_loss": 0.047372281551361084, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001905196695588529, + "grad_norm": 5.707655429840088, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8553905487060547, + "num_tokens": 218612491.0, + "step": 5729 + }, + { + "epoch": 0.7289148963236229, + "ewc_loss": 0.047589391469955444, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019024938228540123, + "grad_norm": 5.73112678527832, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.867923378944397, + "num_tokens": 218642541.0, + "step": 5730 + }, + { + "epoch": 0.7290421066022135, + "ewc_loss": 0.04748261719942093, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001904023374663666, + "grad_norm": 5.654718399047852, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.845171332359314, + "num_tokens": 218678514.0, + "step": 5731 + }, + { + "epoch": 0.729169316880804, + "ewc_loss": 0.04762159287929535, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019057138706557453, + "grad_norm": 5.72004508972168, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8629963397979736, + "num_tokens": 218715415.0, + "step": 5732 + }, + { + "epoch": 0.7292965271593945, + "ewc_loss": 0.047620125114917755, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019055673328693956, + "grad_norm": 5.606342315673828, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8670886158943176, + "num_tokens": 218754422.0, + "step": 5733 + }, + { + "epoch": 0.7294237374379849, + "ewc_loss": 0.04764683544635773, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019082380458712578, + "grad_norm": 5.744997978210449, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.849436342716217, + "num_tokens": 218787141.0, + "step": 5734 + }, + { + "epoch": 0.7295509477165755, + "ewc_loss": 0.04779539257287979, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019108870765194297, + "grad_norm": 5.622399806976318, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8730044364929199, + "num_tokens": 218824764.0, + "step": 5735 + }, + { + "epoch": 0.729678157995166, + "ewc_loss": 0.04762659966945648, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001906214893097058, + "grad_norm": 5.90352725982666, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8572097420692444, + "num_tokens": 218866142.0, + "step": 5736 + }, + { + "epoch": 0.7298053682737565, + "ewc_loss": 0.04781497269868851, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019128451822325587, + "grad_norm": 5.629177093505859, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8533154726028442, + "num_tokens": 218909208.0, + "step": 5737 + }, + { + "epoch": 0.7299325785523471, + "ewc_loss": 0.04771105572581291, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001902453223010525, + "grad_norm": 5.716446399688721, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8665002584457397, + "num_tokens": 218942469.0, + "step": 5738 + }, + { + "epoch": 0.7300597888309376, + "ewc_loss": 0.04756729304790497, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001900284260045737, + "grad_norm": 5.7147626876831055, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8518055081367493, + "num_tokens": 218982934.0, + "step": 5739 + }, + { + "epoch": 0.730186999109528, + "ewc_loss": 0.04774188995361328, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019055364828091115, + "grad_norm": 5.6875457763671875, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8554022312164307, + "num_tokens": 219027055.0, + "step": 5740 + }, + { + "epoch": 0.7303142093881185, + "ewc_loss": 0.047716423869132996, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019029901886824518, + "grad_norm": 5.6711859703063965, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8599063158035278, + "num_tokens": 219069639.0, + "step": 5741 + }, + { + "epoch": 0.7304414196667091, + "ewc_loss": 0.04755236953496933, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00018987915245816112, + "grad_norm": 5.607831954956055, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8529276847839355, + "num_tokens": 219111774.0, + "step": 5742 + }, + { + "epoch": 0.7305686299452996, + "ewc_loss": 0.047595344483852386, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001903089287225157, + "grad_norm": 5.619897365570068, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8462613821029663, + "num_tokens": 219156962.0, + "step": 5743 + }, + { + "epoch": 0.7306958402238901, + "ewc_loss": 0.04762633889913559, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019061884086113423, + "grad_norm": 5.689164638519287, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8667234778404236, + "num_tokens": 219193352.0, + "step": 5744 + }, + { + "epoch": 0.7308230505024806, + "ewc_loss": 0.04759272187948227, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019028267706744373, + "grad_norm": 5.897594451904297, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8438644409179688, + "num_tokens": 219230041.0, + "step": 5745 + }, + { + "epoch": 0.7309502607810711, + "ewc_loss": 0.047614846378564835, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019050393893849105, + "grad_norm": 5.7014594078063965, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8359968662261963, + "num_tokens": 219262748.0, + "step": 5746 + }, + { + "epoch": 0.7310774710596616, + "ewc_loss": 0.04747467488050461, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001891022257041186, + "grad_norm": 5.7023773193359375, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.87110435962677, + "num_tokens": 219296556.0, + "step": 5747 + }, + { + "epoch": 0.7312046813382521, + "ewc_loss": 0.047540076076984406, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00018975624698214233, + "grad_norm": 5.690878868103027, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.845855712890625, + "num_tokens": 219330801.0, + "step": 5748 + }, + { + "epoch": 0.7313318916168426, + "ewc_loss": 0.047403596341609955, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00018961211026180536, + "grad_norm": 5.632338523864746, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8593356609344482, + "num_tokens": 219367470.0, + "step": 5749 + }, + { + "epoch": 0.7314591018954332, + "ewc_loss": 0.04755420982837677, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00018989757518284023, + "grad_norm": 5.637970924377441, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8456356525421143, + "num_tokens": 219405291.0, + "step": 5750 + }, + { + "epoch": 0.7315863121740237, + "ewc_loss": 0.04777076840400696, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019084247469436377, + "grad_norm": 5.621677875518799, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8610066175460815, + "num_tokens": 219443365.0, + "step": 5751 + }, + { + "epoch": 0.7317135224526141, + "ewc_loss": 0.047718800604343414, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001903227821458131, + "grad_norm": 5.613344669342041, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.860321044921875, + "num_tokens": 219477524.0, + "step": 5752 + }, + { + "epoch": 0.7318407327312046, + "ewc_loss": 0.047936953604221344, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019128360145259649, + "grad_norm": 5.67411470413208, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8489327430725098, + "num_tokens": 219514576.0, + "step": 5753 + }, + { + "epoch": 0.7319679430097952, + "ewc_loss": 0.04788510873913765, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019198586232960224, + "grad_norm": 5.640300750732422, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8587780594825745, + "num_tokens": 219558253.0, + "step": 5754 + }, + { + "epoch": 0.7320951532883857, + "ewc_loss": 0.047840047627687454, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001915352331707254, + "grad_norm": 5.649752616882324, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8540112972259521, + "num_tokens": 219601156.0, + "step": 5755 + }, + { + "epoch": 0.7322223635669762, + "ewc_loss": 0.04791826009750366, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019231736951041967, + "grad_norm": 5.703460216522217, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8487229347229004, + "num_tokens": 219638241.0, + "step": 5756 + }, + { + "epoch": 0.7323495738455668, + "ewc_loss": 0.04782724753022194, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019140724907629192, + "grad_norm": 5.598141670227051, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8599609136581421, + "num_tokens": 219678168.0, + "step": 5757 + }, + { + "epoch": 0.7324767841241572, + "ewc_loss": 0.047911107540130615, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019224583229515702, + "grad_norm": 5.700012683868408, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8558623790740967, + "num_tokens": 219718115.0, + "step": 5758 + }, + { + "epoch": 0.7326039944027477, + "ewc_loss": 0.0477365180850029, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019172066822648048, + "grad_norm": 5.642626762390137, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8584387302398682, + "num_tokens": 219754221.0, + "step": 5759 + }, + { + "epoch": 0.7327312046813382, + "ewc_loss": 0.04773544892668724, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001917099580168724, + "grad_norm": 5.701375961303711, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8637808561325073, + "num_tokens": 219786880.0, + "step": 5760 + }, + { + "epoch": 0.7328584149599288, + "ewc_loss": 0.04782448709011078, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019137961498927325, + "grad_norm": 5.625481128692627, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8613772392272949, + "num_tokens": 219830585.0, + "step": 5761 + }, + { + "epoch": 0.7329856252385193, + "ewc_loss": 0.04784321039915085, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019156686903443187, + "grad_norm": 5.671756267547607, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8617844581604004, + "num_tokens": 219867249.0, + "step": 5762 + }, + { + "epoch": 0.7331128355171098, + "ewc_loss": 0.04777657985687256, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019212128245271742, + "grad_norm": 5.699039936065674, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8499894142150879, + "num_tokens": 219906046.0, + "step": 5763 + }, + { + "epoch": 0.7332400457957003, + "ewc_loss": 0.0476701594889164, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019105705723632127, + "grad_norm": 5.6545867919921875, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8494665026664734, + "num_tokens": 219947466.0, + "step": 5764 + }, + { + "epoch": 0.7333672560742908, + "ewc_loss": 0.04780095815658569, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019114433962386101, + "grad_norm": 6.032394886016846, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8603564500808716, + "num_tokens": 219985698.0, + "step": 5765 + }, + { + "epoch": 0.7334944663528813, + "ewc_loss": 0.047735076397657394, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019170623272657394, + "grad_norm": 5.568026542663574, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8587781190872192, + "num_tokens": 220026158.0, + "step": 5766 + }, + { + "epoch": 0.7336216766314718, + "ewc_loss": 0.04757732152938843, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019012868870049715, + "grad_norm": 5.68945837020874, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8573371171951294, + "num_tokens": 220071849.0, + "step": 5767 + }, + { + "epoch": 0.7337488869100623, + "ewc_loss": 0.04785417020320892, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019167648861184716, + "grad_norm": 5.672128677368164, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8643242120742798, + "num_tokens": 220110255.0, + "step": 5768 + }, + { + "epoch": 0.7338760971886529, + "ewc_loss": 0.04778318107128143, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019096660253126174, + "grad_norm": 5.933164119720459, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8432135581970215, + "num_tokens": 220144962.0, + "step": 5769 + }, + { + "epoch": 0.7340033074672434, + "ewc_loss": 0.047843750566244125, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001915722677949816, + "grad_norm": 5.682445049285889, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8487018346786499, + "num_tokens": 220182118.0, + "step": 5770 + }, + { + "epoch": 0.7341305177458338, + "ewc_loss": 0.04759256914258003, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019028116366825998, + "grad_norm": 5.681987285614014, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8494715690612793, + "num_tokens": 220218954.0, + "step": 5771 + }, + { + "epoch": 0.7342577280244243, + "ewc_loss": 0.04765821248292923, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019093757146038115, + "grad_norm": 5.643564224243164, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8566850423812866, + "num_tokens": 220259116.0, + "step": 5772 + }, + { + "epoch": 0.7343849383030149, + "ewc_loss": 0.0476846843957901, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019120232900604606, + "grad_norm": 5.732757091522217, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8549208045005798, + "num_tokens": 220301742.0, + "step": 5773 + }, + { + "epoch": 0.7345121485816054, + "ewc_loss": 0.047779809683561325, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019093285663984716, + "grad_norm": 5.654195308685303, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8619772791862488, + "num_tokens": 220342572.0, + "step": 5774 + }, + { + "epoch": 0.7346393588601959, + "ewc_loss": 0.04772605001926422, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001916159817483276, + "grad_norm": 5.765035629272461, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8634017705917358, + "num_tokens": 220376373.0, + "step": 5775 + }, + { + "epoch": 0.7347665691387865, + "ewc_loss": 0.0476733073592186, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019108856213279068, + "grad_norm": 5.684655666351318, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8744581937789917, + "num_tokens": 220413290.0, + "step": 5776 + }, + { + "epoch": 0.7348937794173769, + "ewc_loss": 0.047674477100372314, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001911002182168886, + "grad_norm": 5.687112808227539, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.855098307132721, + "num_tokens": 220453219.0, + "step": 5777 + }, + { + "epoch": 0.7350209896959674, + "ewc_loss": 0.047618091106414795, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019053637515753508, + "grad_norm": 5.751513481140137, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8584501147270203, + "num_tokens": 220488369.0, + "step": 5778 + }, + { + "epoch": 0.7351481999745579, + "ewc_loss": 0.047640591859817505, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019076140597462654, + "grad_norm": 5.664389610290527, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8621591329574585, + "num_tokens": 220527798.0, + "step": 5779 + }, + { + "epoch": 0.7352754102531485, + "ewc_loss": 0.04763305187225342, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019068601250182837, + "grad_norm": 5.717931270599365, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8762061595916748, + "num_tokens": 220565398.0, + "step": 5780 + }, + { + "epoch": 0.735402620531739, + "ewc_loss": 0.04766279458999634, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019098339544143528, + "grad_norm": 5.68436336517334, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.868290901184082, + "num_tokens": 220603903.0, + "step": 5781 + }, + { + "epoch": 0.7355298308103295, + "ewc_loss": 0.04766085371375084, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019096399773843586, + "grad_norm": 5.700619697570801, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8402917385101318, + "num_tokens": 220644168.0, + "step": 5782 + }, + { + "epoch": 0.7356570410889199, + "ewc_loss": 0.04766182601451874, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019097374752163887, + "grad_norm": 5.784470558166504, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8718615770339966, + "num_tokens": 220679709.0, + "step": 5783 + }, + { + "epoch": 0.7357842513675105, + "ewc_loss": 0.04759209603071213, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019027641974389553, + "grad_norm": 5.623586654663086, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8515258431434631, + "num_tokens": 220721184.0, + "step": 5784 + }, + { + "epoch": 0.735911461646101, + "ewc_loss": 0.04774598777294159, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019059467012993991, + "grad_norm": 5.735348701477051, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8619717359542847, + "num_tokens": 220755027.0, + "step": 5785 + }, + { + "epoch": 0.7360386719246915, + "ewc_loss": 0.047625210136175156, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019060756312683225, + "grad_norm": 5.66717004776001, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.834701418876648, + "num_tokens": 220793597.0, + "step": 5786 + }, + { + "epoch": 0.736165882203282, + "ewc_loss": 0.04779963567852974, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019113112648483366, + "grad_norm": 5.681683540344238, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8574377298355103, + "num_tokens": 220833549.0, + "step": 5787 + }, + { + "epoch": 0.7362930924818726, + "ewc_loss": 0.047697123140096664, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001913267042255029, + "grad_norm": 5.702974796295166, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8647732734680176, + "num_tokens": 220865551.0, + "step": 5788 + }, + { + "epoch": 0.736420302760463, + "ewc_loss": 0.04766477644443512, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.000191003258805722, + "grad_norm": 5.623890399932861, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8706691265106201, + "num_tokens": 220911626.0, + "step": 5789 + }, + { + "epoch": 0.7365475130390535, + "ewc_loss": 0.047662001103162766, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019219618116039783, + "grad_norm": 5.7646002769470215, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8595985770225525, + "num_tokens": 220949839.0, + "step": 5790 + }, + { + "epoch": 0.736674723317644, + "ewc_loss": 0.047572970390319824, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001913058658828959, + "grad_norm": 5.6459150314331055, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8417066931724548, + "num_tokens": 220984069.0, + "step": 5791 + }, + { + "epoch": 0.7368019335962346, + "ewc_loss": 0.04779030755162239, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019225853611715138, + "grad_norm": 5.710310935974121, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.878150463104248, + "num_tokens": 221020999.0, + "step": 5792 + }, + { + "epoch": 0.7369291438748251, + "ewc_loss": 0.04772834852337837, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019163894467055798, + "grad_norm": 5.766402721405029, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8502515554428101, + "num_tokens": 221051082.0, + "step": 5793 + }, + { + "epoch": 0.7370563541534156, + "ewc_loss": 0.04778408631682396, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001921963266795501, + "grad_norm": 5.687830924987793, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8410067558288574, + "num_tokens": 221095444.0, + "step": 5794 + }, + { + "epoch": 0.737183564432006, + "ewc_loss": 0.04784305393695831, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019156528287567198, + "grad_norm": 5.740118503570557, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.860605001449585, + "num_tokens": 221136433.0, + "step": 5795 + }, + { + "epoch": 0.7373107747105966, + "ewc_loss": 0.04775235056877136, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019187894940841943, + "grad_norm": 5.7259440422058105, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8509632349014282, + "num_tokens": 221171278.0, + "step": 5796 + }, + { + "epoch": 0.7374379849891871, + "ewc_loss": 0.04768473282456398, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019120279466733336, + "grad_norm": 5.704735279083252, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8575257062911987, + "num_tokens": 221209380.0, + "step": 5797 + }, + { + "epoch": 0.7375651952677776, + "ewc_loss": 0.047714993357658386, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019150540174450725, + "grad_norm": 5.678010940551758, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8560462594032288, + "num_tokens": 221253076.0, + "step": 5798 + }, + { + "epoch": 0.7376924055463682, + "ewc_loss": 0.04783032834529877, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001914380700327456, + "grad_norm": 5.7267374992370605, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.853512704372406, + "num_tokens": 221290348.0, + "step": 5799 + }, + { + "epoch": 0.7378196158249587, + "ewc_loss": 0.04768119007349014, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019116738985758275, + "grad_norm": 5.637682914733887, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8646128177642822, + "num_tokens": 221329635.0, + "step": 5800 + }, + { + "epoch": 0.7379468261035491, + "ewc_loss": 0.047720782458782196, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019156330381520092, + "grad_norm": 5.671411037445068, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8652492761611938, + "num_tokens": 221368521.0, + "step": 5801 + }, + { + "epoch": 0.7380740363821396, + "ewc_loss": 0.047777291387319565, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019212838378734887, + "grad_norm": 5.726292610168457, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8695693612098694, + "num_tokens": 221407090.0, + "step": 5802 + }, + { + "epoch": 0.7382012466607302, + "ewc_loss": 0.04768127202987671, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019116820476483554, + "grad_norm": 5.59898042678833, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8675556182861328, + "num_tokens": 221450340.0, + "step": 5803 + }, + { + "epoch": 0.7383284569393207, + "ewc_loss": 0.04780888557434082, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019122364756185561, + "grad_norm": 5.753568649291992, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8589935302734375, + "num_tokens": 221491996.0, + "step": 5804 + }, + { + "epoch": 0.7384556672179112, + "ewc_loss": 0.04784267395734787, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019156152848154306, + "grad_norm": 5.666899681091309, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8668860793113708, + "num_tokens": 221531466.0, + "step": 5805 + }, + { + "epoch": 0.7385828774965018, + "ewc_loss": 0.047881126403808594, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019194604828953743, + "grad_norm": 5.737725734710693, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8396142721176147, + "num_tokens": 221567003.0, + "step": 5806 + }, + { + "epoch": 0.7387100877750922, + "ewc_loss": 0.04785912483930588, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019172602333128452, + "grad_norm": 5.6473388671875, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8712209463119507, + "num_tokens": 221607396.0, + "step": 5807 + }, + { + "epoch": 0.7388372980536827, + "ewc_loss": 0.047522954642772675, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019202643306925893, + "grad_norm": 5.77211856842041, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.851038932800293, + "num_tokens": 221644776.0, + "step": 5808 + }, + { + "epoch": 0.7389645083322732, + "ewc_loss": 0.04780253767967224, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019238083041273057, + "grad_norm": 5.761518478393555, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8715596199035645, + "num_tokens": 221685643.0, + "step": 5809 + }, + { + "epoch": 0.7390917186108638, + "ewc_loss": 0.047740280628204346, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019175825582351536, + "grad_norm": 5.687266826629639, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8516634106636047, + "num_tokens": 221724673.0, + "step": 5810 + }, + { + "epoch": 0.7392189288894543, + "ewc_loss": 0.047741636633872986, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019177186186425388, + "grad_norm": 5.800502300262451, + "learning_rate": 1e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8343577980995178, + "num_tokens": 221766287.0, + "step": 5811 + }, + { + "epoch": 0.7393461391680448, + "ewc_loss": 0.04767439514398575, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019109940330963582, + "grad_norm": 5.687044143676758, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8566457629203796, + "num_tokens": 221805357.0, + "step": 5812 + }, + { + "epoch": 0.7394733494466353, + "ewc_loss": 0.04772121459245682, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019156764028593898, + "grad_norm": 6.082259178161621, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8570214509963989, + "num_tokens": 221838513.0, + "step": 5813 + }, + { + "epoch": 0.7396005597252258, + "ewc_loss": 0.04770974814891815, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.000191452942090109, + "grad_norm": 5.621401786804199, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8469901084899902, + "num_tokens": 221882808.0, + "step": 5814 + }, + { + "epoch": 0.7397277700038163, + "ewc_loss": 0.047738611698150635, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019174159388057888, + "grad_norm": 5.935407638549805, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.871563196182251, + "num_tokens": 221918505.0, + "step": 5815 + }, + { + "epoch": 0.7398549802824068, + "ewc_loss": 0.04775736853480339, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001919291535159573, + "grad_norm": 5.6955180168151855, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8590776920318604, + "num_tokens": 221955983.0, + "step": 5816 + }, + { + "epoch": 0.7399821905609973, + "ewc_loss": 0.04767955094575882, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019115096074528992, + "grad_norm": 5.814741611480713, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.857843279838562, + "num_tokens": 221997363.0, + "step": 5817 + }, + { + "epoch": 0.7401094008395879, + "ewc_loss": 0.0476871058344841, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019122649973724037, + "grad_norm": 5.655688762664795, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8434675335884094, + "num_tokens": 222039302.0, + "step": 5818 + }, + { + "epoch": 0.7402366111181784, + "ewc_loss": 0.04774912819266319, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019184674602001905, + "grad_norm": 5.720420837402344, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8573713302612305, + "num_tokens": 222075779.0, + "step": 5819 + }, + { + "epoch": 0.7403638213967688, + "ewc_loss": 0.047778595238924026, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019214142230339348, + "grad_norm": 5.691029071807861, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8766094446182251, + "num_tokens": 222115120.0, + "step": 5820 + }, + { + "epoch": 0.7404910316753593, + "ewc_loss": 0.0477907657623291, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001922631054185331, + "grad_norm": 5.7004852294921875, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8499473333358765, + "num_tokens": 222156570.0, + "step": 5821 + }, + { + "epoch": 0.7406182419539499, + "ewc_loss": 0.04779917746782303, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019234725914429873, + "grad_norm": 5.772760391235352, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8580495715141296, + "num_tokens": 222189487.0, + "step": 5822 + }, + { + "epoch": 0.7407454522325404, + "ewc_loss": 0.04775969311594963, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019195240747649223, + "grad_norm": 5.688124656677246, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8560865521430969, + "num_tokens": 222223163.0, + "step": 5823 + }, + { + "epoch": 0.7408726625111309, + "ewc_loss": 0.04785143584012985, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019286984752397984, + "grad_norm": 5.759641647338867, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8706744313240051, + "num_tokens": 222259042.0, + "step": 5824 + }, + { + "epoch": 0.7409998727897215, + "ewc_loss": 0.04779685288667679, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019232401973567903, + "grad_norm": 5.902190208435059, + "learning_rate": 1e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8392819166183472, + "num_tokens": 222300497.0, + "step": 5825 + }, + { + "epoch": 0.7411270830683119, + "ewc_loss": 0.04771821200847626, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019153756147716194, + "grad_norm": 5.655339241027832, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8544321656227112, + "num_tokens": 222333630.0, + "step": 5826 + }, + { + "epoch": 0.7412542933469024, + "ewc_loss": 0.047804176807403564, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019239724497310817, + "grad_norm": 5.716368675231934, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8534886240959167, + "num_tokens": 222371715.0, + "step": 5827 + }, + { + "epoch": 0.7413815036254929, + "ewc_loss": 0.04779168218374252, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019227230222895741, + "grad_norm": 5.629703521728516, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.859377384185791, + "num_tokens": 222412044.0, + "step": 5828 + }, + { + "epoch": 0.7415087139040835, + "ewc_loss": 0.048084162175655365, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019275565864518285, + "grad_norm": 5.699195861816406, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8607760071754456, + "num_tokens": 222452668.0, + "step": 5829 + }, + { + "epoch": 0.741635924182674, + "ewc_loss": 0.048008136451244354, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019321612489875406, + "grad_norm": 5.703313827514648, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8606798052787781, + "num_tokens": 222485631.0, + "step": 5830 + }, + { + "epoch": 0.7417631344612645, + "ewc_loss": 0.04800038039684296, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001931385777425021, + "grad_norm": 5.654483795166016, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8726935982704163, + "num_tokens": 222522967.0, + "step": 5831 + }, + { + "epoch": 0.7418903447398549, + "ewc_loss": 0.047916874289512634, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019352421804796904, + "grad_norm": 5.687801361083984, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8483983278274536, + "num_tokens": 222563258.0, + "step": 5832 + }, + { + "epoch": 0.7420175550184455, + "ewc_loss": 0.04802530258893967, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019338780839461833, + "grad_norm": 5.678137302398682, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8518552780151367, + "num_tokens": 222599248.0, + "step": 5833 + }, + { + "epoch": 0.742144765297036, + "ewc_loss": 0.04806673526763916, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001938021305250004, + "grad_norm": 5.670136451721191, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.865321159362793, + "num_tokens": 222642575.0, + "step": 5834 + }, + { + "epoch": 0.7422719755756265, + "ewc_loss": 0.048085473477840424, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019398948643356562, + "grad_norm": 5.639596939086914, + "learning_rate": 1e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8350341320037842, + "num_tokens": 222687330.0, + "step": 5835 + }, + { + "epoch": 0.742399185854217, + "ewc_loss": 0.04809046536684036, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019403942860662937, + "grad_norm": 5.766669273376465, + "learning_rate": 1e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8343134522438049, + "num_tokens": 222720020.0, + "step": 5836 + }, + { + "epoch": 0.7425263961328076, + "ewc_loss": 0.04809046909213066, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019403945771045983, + "grad_norm": 5.66442346572876, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.87117999792099, + "num_tokens": 222758634.0, + "step": 5837 + }, + { + "epoch": 0.742653606411398, + "ewc_loss": 0.04811234772205353, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019425824575591832, + "grad_norm": 5.755281925201416, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8655068278312683, + "num_tokens": 222794823.0, + "step": 5838 + }, + { + "epoch": 0.7427808166899885, + "ewc_loss": 0.04810357838869095, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019417057046666741, + "grad_norm": 5.661910533905029, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8575267791748047, + "num_tokens": 222833836.0, + "step": 5839 + }, + { + "epoch": 0.742908026968579, + "ewc_loss": 0.04805896058678627, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019372436509002, + "grad_norm": 5.635602951049805, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8600096106529236, + "num_tokens": 222875712.0, + "step": 5840 + }, + { + "epoch": 0.7430352372471696, + "ewc_loss": 0.04808449745178223, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019397976575419307, + "grad_norm": 5.705381870269775, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8473789691925049, + "num_tokens": 222915494.0, + "step": 5841 + }, + { + "epoch": 0.7431624475257601, + "ewc_loss": 0.04811081662774086, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019424293714109808, + "grad_norm": 5.732005596160889, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8753424882888794, + "num_tokens": 222951675.0, + "step": 5842 + }, + { + "epoch": 0.7432896578043506, + "ewc_loss": 0.04803992062807083, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019353398238308728, + "grad_norm": 5.660515308380127, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8538543581962585, + "num_tokens": 222993245.0, + "step": 5843 + }, + { + "epoch": 0.743416868082941, + "ewc_loss": 0.048060670495033264, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001937414926942438, + "grad_norm": 5.665330410003662, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8648203611373901, + "num_tokens": 223029876.0, + "step": 5844 + }, + { + "epoch": 0.7435440783615316, + "ewc_loss": 0.04803459346294403, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019348067871760577, + "grad_norm": 5.6450958251953125, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8636255860328674, + "num_tokens": 223072550.0, + "step": 5845 + }, + { + "epoch": 0.7436712886401221, + "ewc_loss": 0.04790125787258148, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019336803234182298, + "grad_norm": 5.706195831298828, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8564581871032715, + "num_tokens": 223112892.0, + "step": 5846 + }, + { + "epoch": 0.7437984989187126, + "ewc_loss": 0.047946229577064514, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019381775928195566, + "grad_norm": 5.668695449829102, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8638310432434082, + "num_tokens": 223157093.0, + "step": 5847 + }, + { + "epoch": 0.7439257091973032, + "ewc_loss": 0.04791741073131561, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019352955860085785, + "grad_norm": 5.718978404998779, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8489151000976562, + "num_tokens": 223190756.0, + "step": 5848 + }, + { + "epoch": 0.7440529194758937, + "ewc_loss": 0.04790432006120682, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001933986641233787, + "grad_norm": 5.7276611328125, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8485187888145447, + "num_tokens": 223226013.0, + "step": 5849 + }, + { + "epoch": 0.7441801297544841, + "ewc_loss": 0.04786038398742676, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019295931269880384, + "grad_norm": 5.771414279937744, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8699758052825928, + "num_tokens": 223262858.0, + "step": 5850 + }, + { + "epoch": 0.7443073400330746, + "ewc_loss": 0.047842949628829956, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019278496620245278, + "grad_norm": 5.660323619842529, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8625687956809998, + "num_tokens": 223310528.0, + "step": 5851 + }, + { + "epoch": 0.7444345503116652, + "ewc_loss": 0.04785918444395065, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019294730736874044, + "grad_norm": 5.673396587371826, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8605214357376099, + "num_tokens": 223345021.0, + "step": 5852 + }, + { + "epoch": 0.7445617605902557, + "ewc_loss": 0.04793080314993858, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019366350898053497, + "grad_norm": 5.7345499992370605, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.868598997592926, + "num_tokens": 223382055.0, + "step": 5853 + }, + { + "epoch": 0.7446889708688462, + "ewc_loss": 0.047907449305057526, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019342997984495014, + "grad_norm": 5.756223678588867, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8480768203735352, + "num_tokens": 223418441.0, + "step": 5854 + }, + { + "epoch": 0.7448161811474368, + "ewc_loss": 0.04786689952015877, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019302446162328124, + "grad_norm": 5.685177326202393, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8564926385879517, + "num_tokens": 223458150.0, + "step": 5855 + }, + { + "epoch": 0.7449433914260272, + "ewc_loss": 0.04790479317307472, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001934033934958279, + "grad_norm": 5.690577983856201, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8551061153411865, + "num_tokens": 223496916.0, + "step": 5856 + }, + { + "epoch": 0.7450706017046177, + "ewc_loss": 0.04789324849843979, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019328795315232128, + "grad_norm": 5.674203395843506, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8643709421157837, + "num_tokens": 223534808.0, + "step": 5857 + }, + { + "epoch": 0.7451978119832082, + "ewc_loss": 0.04802495241165161, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019338431593496352, + "grad_norm": 5.704507827758789, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8674951791763306, + "num_tokens": 223571255.0, + "step": 5858 + }, + { + "epoch": 0.7453250222617988, + "ewc_loss": 0.04793570190668106, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001937124616233632, + "grad_norm": 5.666534423828125, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8619264364242554, + "num_tokens": 223616608.0, + "step": 5859 + }, + { + "epoch": 0.7454522325403893, + "ewc_loss": 0.047910161316394806, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019345710461493582, + "grad_norm": 5.670276641845703, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8588456511497498, + "num_tokens": 223656188.0, + "step": 5860 + }, + { + "epoch": 0.7455794428189798, + "ewc_loss": 0.04798106104135513, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019416608847677708, + "grad_norm": 5.723303318023682, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8536685109138489, + "num_tokens": 223693787.0, + "step": 5861 + }, + { + "epoch": 0.7457066530975703, + "ewc_loss": 0.04791512340307236, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001935066975420341, + "grad_norm": 5.667285919189453, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8708471655845642, + "num_tokens": 223729350.0, + "step": 5862 + }, + { + "epoch": 0.7458338633761608, + "ewc_loss": 0.04802648350596428, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019462031195871532, + "grad_norm": 5.706331253051758, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8473184704780579, + "num_tokens": 223770846.0, + "step": 5863 + }, + { + "epoch": 0.7459610736547513, + "ewc_loss": 0.04798085615038872, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019416403665672988, + "grad_norm": 5.637046813964844, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8583940863609314, + "num_tokens": 223814194.0, + "step": 5864 + }, + { + "epoch": 0.7460882839333418, + "ewc_loss": 0.04799250513315201, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019428052473813295, + "grad_norm": 5.749925136566162, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8525888919830322, + "num_tokens": 223849141.0, + "step": 5865 + }, + { + "epoch": 0.7462154942119323, + "ewc_loss": 0.04797469079494476, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019410235108807683, + "grad_norm": 5.660540580749512, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8806309700012207, + "num_tokens": 223888654.0, + "step": 5866 + }, + { + "epoch": 0.7463427044905229, + "ewc_loss": 0.04797349125146866, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019409037486184388, + "grad_norm": 5.729836940765381, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8621280193328857, + "num_tokens": 223924509.0, + "step": 5867 + }, + { + "epoch": 0.7464699147691134, + "ewc_loss": 0.04773003235459328, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.000194097199710086, + "grad_norm": 5.68017053604126, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.844606339931488, + "num_tokens": 223966191.0, + "step": 5868 + }, + { + "epoch": 0.7465971250477038, + "ewc_loss": 0.047710783779621124, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019390469242352992, + "grad_norm": 5.668210983276367, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.861676812171936, + "num_tokens": 224001415.0, + "step": 5869 + }, + { + "epoch": 0.7467243353262943, + "ewc_loss": 0.04768340289592743, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019363091269042343, + "grad_norm": 5.6829609870910645, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8648897409439087, + "num_tokens": 224042820.0, + "step": 5870 + }, + { + "epoch": 0.7468515456048849, + "ewc_loss": 0.04798929765820503, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019424845231696963, + "grad_norm": 5.709856033325195, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8643155097961426, + "num_tokens": 224078411.0, + "step": 5871 + }, + { + "epoch": 0.7469787558834754, + "ewc_loss": 0.04782004654407501, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001949973520822823, + "grad_norm": 5.704207897186279, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8651386499404907, + "num_tokens": 224117136.0, + "step": 5872 + }, + { + "epoch": 0.7471059661620659, + "ewc_loss": 0.04772941768169403, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019409102969802916, + "grad_norm": 5.708874225616455, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8516111373901367, + "num_tokens": 224155879.0, + "step": 5873 + }, + { + "epoch": 0.7472331764406565, + "ewc_loss": 0.04803204908967018, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001946759584825486, + "grad_norm": 5.65916633605957, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8698654770851135, + "num_tokens": 224195186.0, + "step": 5874 + }, + { + "epoch": 0.7473603867192469, + "ewc_loss": 0.0480198860168457, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001945543335750699, + "grad_norm": 5.660102844238281, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8552473783493042, + "num_tokens": 224235722.0, + "step": 5875 + }, + { + "epoch": 0.7474875969978374, + "ewc_loss": 0.04780365154147148, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.0001948333956534043, + "grad_norm": 5.751811981201172, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8666700720787048, + "num_tokens": 224268461.0, + "step": 5876 + }, + { + "epoch": 0.7476148072764279, + "ewc_loss": 0.04777545481920242, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019455143774393946, + "grad_norm": 5.768117427825928, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8719590306282043, + "num_tokens": 224302921.0, + "step": 5877 + }, + { + "epoch": 0.7477420175550185, + "ewc_loss": 0.04776499420404434, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019444682402536273, + "grad_norm": 5.6462721824646, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8599380254745483, + "num_tokens": 224344081.0, + "step": 5878 + }, + { + "epoch": 0.747869227833609, + "ewc_loss": 0.047898177057504654, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019455794245004654, + "grad_norm": 5.773932933807373, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8463553786277771, + "num_tokens": 224384464.0, + "step": 5879 + }, + { + "epoch": 0.7479964381121995, + "ewc_loss": 0.04788957163691521, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019447188242338598, + "grad_norm": 5.641759395599365, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8672175407409668, + "num_tokens": 224425542.0, + "step": 5880 + }, + { + "epoch": 0.7481236483907899, + "ewc_loss": 0.04806375503540039, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019499301561154425, + "grad_norm": 5.74017333984375, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8754798173904419, + "num_tokens": 224465342.0, + "step": 5881 + }, + { + "epoch": 0.7482508586693805, + "ewc_loss": 0.04807824641466141, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.000193917250726372, + "grad_norm": 5.668381690979004, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8629199266433716, + "num_tokens": 224504787.0, + "step": 5882 + }, + { + "epoch": 0.748378068947971, + "ewc_loss": 0.048041462898254395, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019477010937407613, + "grad_norm": 5.694324016571045, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8607202768325806, + "num_tokens": 224548534.0, + "step": 5883 + }, + { + "epoch": 0.7485052792265615, + "ewc_loss": 0.04791360720992088, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.0001947122364072129, + "grad_norm": 5.713432312011719, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8514689207077026, + "num_tokens": 224590251.0, + "step": 5884 + }, + { + "epoch": 0.748632489505152, + "ewc_loss": 0.047766271978616714, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.000194459586055018, + "grad_norm": 5.671234130859375, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8687651753425598, + "num_tokens": 224633745.0, + "step": 5885 + }, + { + "epoch": 0.7487596997837426, + "ewc_loss": 0.047832272946834564, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019511960272211581, + "grad_norm": 5.7659831047058105, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8429905772209167, + "num_tokens": 224671319.0, + "step": 5886 + }, + { + "epoch": 0.748886910062333, + "ewc_loss": 0.047785449773073196, + "ewc_loss_diag": 2.8371810913085938e-05, + "ewc_loss_parallel": 0.00019465136574581265, + "grad_norm": 5.62863302230835, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8590750098228455, + "num_tokens": 224712670.0, + "step": 5887 + }, + { + "epoch": 0.7490141203409235, + "ewc_loss": 0.04795375466346741, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019511373830027878, + "grad_norm": 5.7429399490356445, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.85577392578125, + "num_tokens": 224747840.0, + "step": 5888 + }, + { + "epoch": 0.749141330619514, + "ewc_loss": 0.04794897139072418, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019506586249917746, + "grad_norm": 5.646008491516113, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8705947995185852, + "num_tokens": 224783274.0, + "step": 5889 + }, + { + "epoch": 0.7492685408981046, + "ewc_loss": 0.04815186560153961, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001958741049747914, + "grad_norm": 5.733854293823242, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8551592826843262, + "num_tokens": 224820456.0, + "step": 5890 + }, + { + "epoch": 0.7493957511766951, + "ewc_loss": 0.04804086685180664, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019476415764074773, + "grad_norm": 5.65757942199707, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8634752631187439, + "num_tokens": 224858569.0, + "step": 5891 + }, + { + "epoch": 0.7495229614552856, + "ewc_loss": 0.048040539026260376, + "ewc_loss_diag": 2.849102020263672e-05, + "ewc_loss_parallel": 0.00019598158542066813, + "grad_norm": 5.717557430267334, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8574906587600708, + "num_tokens": 224901033.0, + "step": 5892 + }, + { + "epoch": 0.749650171733876, + "ewc_loss": 0.0481586791574955, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019594226614572108, + "grad_norm": 5.712769985198975, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8726734519004822, + "num_tokens": 224934345.0, + "step": 5893 + }, + { + "epoch": 0.7497773820124666, + "ewc_loss": 0.048134855926036835, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019570402218960226, + "grad_norm": 5.68019962310791, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8506304025650024, + "num_tokens": 224972479.0, + "step": 5894 + }, + { + "epoch": 0.7499045922910571, + "ewc_loss": 0.04819771647453308, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019633262127172202, + "grad_norm": 5.718626022338867, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8745900392532349, + "num_tokens": 225014213.0, + "step": 5895 + }, + { + "epoch": 0.7500318025696476, + "ewc_loss": 0.04825282096862793, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019566297123674303, + "grad_norm": 5.698086738586426, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8662115335464478, + "num_tokens": 225049929.0, + "step": 5896 + }, + { + "epoch": 0.7501590128482382, + "ewc_loss": 0.048139672726392746, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019575220358092338, + "grad_norm": 5.733706951141357, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8444802761077881, + "num_tokens": 225086409.0, + "step": 5897 + }, + { + "epoch": 0.7502862231268287, + "ewc_loss": 0.0481000579893589, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019535605679266155, + "grad_norm": 5.699435234069824, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8422994613647461, + "num_tokens": 225124443.0, + "step": 5898 + }, + { + "epoch": 0.7504134334054191, + "ewc_loss": 0.04815854877233505, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019594095647335052, + "grad_norm": 5.728498458862305, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8481433987617493, + "num_tokens": 225166665.0, + "step": 5899 + }, + { + "epoch": 0.7505406436840096, + "ewc_loss": 0.04810710996389389, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019542658992577344, + "grad_norm": 5.743863105773926, + "learning_rate": 1e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8353589773178101, + "num_tokens": 225203841.0, + "step": 5900 + }, + { + "epoch": 0.7506678539626002, + "ewc_loss": 0.04815692454576492, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001959247310878709, + "grad_norm": 5.795862674713135, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8418745398521423, + "num_tokens": 225234844.0, + "step": 5901 + }, + { + "epoch": 0.7507950642411907, + "ewc_loss": 0.04812619090080261, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019561736553441733, + "grad_norm": 5.7133331298828125, + "learning_rate": 1e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8407618999481201, + "num_tokens": 225269828.0, + "step": 5902 + }, + { + "epoch": 0.7509222745197812, + "ewc_loss": 0.048173267394304276, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019608814909588546, + "grad_norm": 5.6920013427734375, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8626784682273865, + "num_tokens": 225312361.0, + "step": 5903 + }, + { + "epoch": 0.7510494847983717, + "ewc_loss": 0.04813254252076149, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019568089919630438, + "grad_norm": 5.664271831512451, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8478424549102783, + "num_tokens": 225353708.0, + "step": 5904 + }, + { + "epoch": 0.7511766950769622, + "ewc_loss": 0.04814630374312401, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019581850210670382, + "grad_norm": 5.6866536140441895, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8774797320365906, + "num_tokens": 225399730.0, + "step": 5905 + }, + { + "epoch": 0.7513039053555527, + "ewc_loss": 0.04831574857234955, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019629222515504807, + "grad_norm": 5.706448078155518, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8534091711044312, + "num_tokens": 225441836.0, + "step": 5906 + }, + { + "epoch": 0.7514311156341432, + "ewc_loss": 0.04844948276877403, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001964088878594339, + "grad_norm": 5.7941575050354, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8660995960235596, + "num_tokens": 225471004.0, + "step": 5907 + }, + { + "epoch": 0.7515583259127337, + "ewc_loss": 0.048335812985897064, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019649288151413202, + "grad_norm": 5.816433906555176, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8408956527709961, + "num_tokens": 225505789.0, + "step": 5908 + }, + { + "epoch": 0.7516855361913243, + "ewc_loss": 0.048419177532196045, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001961058151209727, + "grad_norm": 5.733166694641113, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8611441850662231, + "num_tokens": 225541138.0, + "step": 5909 + }, + { + "epoch": 0.7518127464699148, + "ewc_loss": 0.04838690161705017, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019578308274503797, + "grad_norm": 5.719600200653076, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8619258999824524, + "num_tokens": 225581695.0, + "step": 5910 + }, + { + "epoch": 0.7519399567485053, + "ewc_loss": 0.04835670441389084, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019670180336106569, + "grad_norm": 5.743384838104248, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8648717403411865, + "num_tokens": 225619532.0, + "step": 5911 + }, + { + "epoch": 0.7520671670270958, + "ewc_loss": 0.048509109765291214, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019578446517698467, + "grad_norm": 5.808113098144531, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8508104085922241, + "num_tokens": 225657403.0, + "step": 5912 + }, + { + "epoch": 0.7521943773056863, + "ewc_loss": 0.04843880236148834, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001950813748408109, + "grad_norm": 5.71795129776001, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8540007472038269, + "num_tokens": 225693402.0, + "step": 5913 + }, + { + "epoch": 0.7523215875842768, + "ewc_loss": 0.04848155006766319, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001955088519025594, + "grad_norm": 5.784256458282471, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8575119376182556, + "num_tokens": 225727208.0, + "step": 5914 + }, + { + "epoch": 0.7524487978628673, + "ewc_loss": 0.04845372587442398, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019523061928339303, + "grad_norm": 5.721748352050781, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8634626865386963, + "num_tokens": 225762117.0, + "step": 5915 + }, + { + "epoch": 0.7525760081414579, + "ewc_loss": 0.04846885800361633, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019538191554602236, + "grad_norm": 5.846144676208496, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8431923389434814, + "num_tokens": 225794737.0, + "step": 5916 + }, + { + "epoch": 0.7527032184200484, + "ewc_loss": 0.048492107540369034, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019561444059945643, + "grad_norm": 5.706273078918457, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8579378128051758, + "num_tokens": 225829074.0, + "step": 5917 + }, + { + "epoch": 0.7528304286986388, + "ewc_loss": 0.04843785613775253, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001950719451997429, + "grad_norm": 5.695587635040283, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.871745228767395, + "num_tokens": 225871344.0, + "step": 5918 + }, + { + "epoch": 0.7529576389772293, + "ewc_loss": 0.04849053919315338, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019559876818675548, + "grad_norm": 5.726165771484375, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8542242050170898, + "num_tokens": 225911321.0, + "step": 5919 + }, + { + "epoch": 0.7530848492558199, + "ewc_loss": 0.04855041950941086, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019619757949840277, + "grad_norm": 5.782662868499756, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8665780425071716, + "num_tokens": 225944613.0, + "step": 5920 + }, + { + "epoch": 0.7532120595344104, + "ewc_loss": 0.04831632971763611, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019629804592113942, + "grad_norm": 5.7575788497924805, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8634952306747437, + "num_tokens": 225989501.0, + "step": 5921 + }, + { + "epoch": 0.7533392698130009, + "ewc_loss": 0.048497073352336884, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019566409173421562, + "grad_norm": 5.721827507019043, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8612090349197388, + "num_tokens": 226024984.0, + "step": 5922 + }, + { + "epoch": 0.7534664800915915, + "ewc_loss": 0.048541925847530365, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001961125963134691, + "grad_norm": 5.734927654266357, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8709656000137329, + "num_tokens": 226063255.0, + "step": 5923 + }, + { + "epoch": 0.7535936903701819, + "ewc_loss": 0.048570118844509125, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001963945251191035, + "grad_norm": 5.889192581176758, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8463814854621887, + "num_tokens": 226105538.0, + "step": 5924 + }, + { + "epoch": 0.7537209006487724, + "ewc_loss": 0.04852166026830673, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001959099608939141, + "grad_norm": 5.767068386077881, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8622177839279175, + "num_tokens": 226137449.0, + "step": 5925 + }, + { + "epoch": 0.7538481109273629, + "ewc_loss": 0.0483182892203331, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019509695994202048, + "grad_norm": 5.7634053230285645, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8527210354804993, + "num_tokens": 226171078.0, + "step": 5926 + }, + { + "epoch": 0.7539753212059535, + "ewc_loss": 0.048391010612249374, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019582416280172765, + "grad_norm": 5.783665657043457, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8609196543693542, + "num_tokens": 226208060.0, + "step": 5927 + }, + { + "epoch": 0.754102531484544, + "ewc_loss": 0.04847139120101929, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019540725043043494, + "grad_norm": 5.710203170776367, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8682752847671509, + "num_tokens": 226245330.0, + "step": 5928 + }, + { + "epoch": 0.7542297417631345, + "ewc_loss": 0.048444099724292755, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001951343729160726, + "grad_norm": 5.718273162841797, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8787915706634521, + "num_tokens": 226281974.0, + "step": 5929 + }, + { + "epoch": 0.7543569520417249, + "ewc_loss": 0.04832744225859642, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019518847693689167, + "grad_norm": 5.722827911376953, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8487566709518433, + "num_tokens": 226323302.0, + "step": 5930 + }, + { + "epoch": 0.7544841623203155, + "ewc_loss": 0.04852956905961037, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019598905055318028, + "grad_norm": 5.781993389129639, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8451462984085083, + "num_tokens": 226361346.0, + "step": 5931 + }, + { + "epoch": 0.754611372598906, + "ewc_loss": 0.048422642052173615, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001949197903741151, + "grad_norm": 5.781154632568359, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.85402911901474, + "num_tokens": 226397744.0, + "step": 5932 + }, + { + "epoch": 0.7547385828774965, + "ewc_loss": 0.048361264169216156, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019552669255062938, + "grad_norm": 5.732566833496094, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8661919832229614, + "num_tokens": 226436745.0, + "step": 5933 + }, + { + "epoch": 0.754865793156087, + "ewc_loss": 0.04842546582221985, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019494799198582768, + "grad_norm": 5.71858024597168, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8665614128112793, + "num_tokens": 226470302.0, + "step": 5934 + }, + { + "epoch": 0.7549930034346776, + "ewc_loss": 0.048483170568943024, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001955250627361238, + "grad_norm": 5.829345703125, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.859982967376709, + "num_tokens": 226500090.0, + "step": 5935 + }, + { + "epoch": 0.755120213713268, + "ewc_loss": 0.04847155511379242, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001954089239006862, + "grad_norm": 5.868124008178711, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.859794557094574, + "num_tokens": 226533359.0, + "step": 5936 + }, + { + "epoch": 0.7552474239918585, + "ewc_loss": 0.0486215278506279, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019446721125859767, + "grad_norm": 5.713805198669434, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8500596880912781, + "num_tokens": 226567060.0, + "step": 5937 + }, + { + "epoch": 0.755374634270449, + "ewc_loss": 0.048558443784713745, + "ewc_loss_diag": 2.9087066650390625e-05, + "ewc_loss_parallel": 0.0001950571167981252, + "grad_norm": 5.702055931091309, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8570672869682312, + "num_tokens": 226604270.0, + "step": 5938 + }, + { + "epoch": 0.7555018445490396, + "ewc_loss": 0.04829006269574165, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019481468189042062, + "grad_norm": 5.719937324523926, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8561322689056396, + "num_tokens": 226641310.0, + "step": 5939 + }, + { + "epoch": 0.7556290548276301, + "ewc_loss": 0.04835023730993271, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001954164617927745, + "grad_norm": 5.759089469909668, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8471890687942505, + "num_tokens": 226678305.0, + "step": 5940 + }, + { + "epoch": 0.7557562651062206, + "ewc_loss": 0.04859966039657593, + "ewc_loss_diag": 2.9087066650390625e-05, + "ewc_loss_parallel": 0.00019546924158930779, + "grad_norm": 5.7069010734558105, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8664306402206421, + "num_tokens": 226715628.0, + "step": 5941 + }, + { + "epoch": 0.755883475384811, + "ewc_loss": 0.04838182032108307, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001957322529051453, + "grad_norm": 5.723585605621338, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8702796697616577, + "num_tokens": 226755346.0, + "step": 5942 + }, + { + "epoch": 0.7560106856634016, + "ewc_loss": 0.0484699085354805, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019539242202881724, + "grad_norm": 5.7846808433532715, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8484817743301392, + "num_tokens": 226792243.0, + "step": 5943 + }, + { + "epoch": 0.7561378959419921, + "ewc_loss": 0.04870382323861122, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019529018027242273, + "grad_norm": 5.692826747894287, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8731293082237244, + "num_tokens": 226829033.0, + "step": 5944 + }, + { + "epoch": 0.7562651062205826, + "ewc_loss": 0.048551589250564575, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019620925013441592, + "grad_norm": 5.73169469833374, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8738176822662354, + "num_tokens": 226870645.0, + "step": 5945 + }, + { + "epoch": 0.7563923164991732, + "ewc_loss": 0.048479609191417694, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019548945419956, + "grad_norm": 5.776523590087891, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8635386228561401, + "num_tokens": 226902395.0, + "step": 5946 + }, + { + "epoch": 0.7565195267777637, + "ewc_loss": 0.04856346547603607, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019632800831459463, + "grad_norm": 5.766813278198242, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8654181957244873, + "num_tokens": 226942620.0, + "step": 5947 + }, + { + "epoch": 0.7566467370563541, + "ewc_loss": 0.04873602092266083, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019561214139685035, + "grad_norm": 5.761503219604492, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8665432929992676, + "num_tokens": 226976472.0, + "step": 5948 + }, + { + "epoch": 0.7567739473349446, + "ewc_loss": 0.04847094416618347, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019540282664820552, + "grad_norm": 5.715554714202881, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8493980169296265, + "num_tokens": 227013626.0, + "step": 5949 + }, + { + "epoch": 0.7569011576135352, + "ewc_loss": 0.048684779554605484, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019509975390974432, + "grad_norm": 5.762484550476074, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8464004993438721, + "num_tokens": 227049336.0, + "step": 5950 + }, + { + "epoch": 0.7570283678921257, + "ewc_loss": 0.04869991913437843, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019525113748386502, + "grad_norm": 5.705662250518799, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.88161700963974, + "num_tokens": 227087371.0, + "step": 5951 + }, + { + "epoch": 0.7571555781707162, + "ewc_loss": 0.048648588359355927, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019473781867418438, + "grad_norm": 5.717240333557129, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8609704375267029, + "num_tokens": 227129188.0, + "step": 5952 + }, + { + "epoch": 0.7572827884493067, + "ewc_loss": 0.04863681644201279, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019462013733573258, + "grad_norm": 5.7100138664245605, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8528431057929993, + "num_tokens": 227161807.0, + "step": 5953 + }, + { + "epoch": 0.7574099987278972, + "ewc_loss": 0.0486486479640007, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.0001947384444065392, + "grad_norm": 5.759490966796875, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8504771590232849, + "num_tokens": 227198431.0, + "step": 5954 + }, + { + "epoch": 0.7575372090064877, + "ewc_loss": 0.048671454191207886, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.0001949664729181677, + "grad_norm": 5.704230308532715, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8782588243484497, + "num_tokens": 227236046.0, + "step": 5955 + }, + { + "epoch": 0.7576644192850782, + "ewc_loss": 0.048797592520713806, + "ewc_loss_diag": 2.9325485229492188e-05, + "ewc_loss_parallel": 0.00019500716007314622, + "grad_norm": 5.665928363800049, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8751070499420166, + "num_tokens": 227272925.0, + "step": 5956 + }, + { + "epoch": 0.7577916295636687, + "ewc_loss": 0.0490393340587616, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019498320762068033, + "grad_norm": 14.383574485778809, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8638477325439453, + "num_tokens": 227312311.0, + "step": 5957 + }, + { + "epoch": 0.7579188398422593, + "ewc_loss": 0.05945909768342972, + "ewc_loss_diag": 2.9325485229492188e-05, + "ewc_loss_parallel": 0.00030162223265506327, + "grad_norm": 7.343239784240723, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8788343667984009, + "num_tokens": 227349632.0, + "step": 5958 + }, + { + "epoch": 0.7580460501208498, + "ewc_loss": 0.04721440002322197, + "ewc_loss_diag": 2.9325485229492188e-05, + "ewc_loss_parallel": 0.000179175243829377, + "grad_norm": 4.958686351776123, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.858508825302124, + "num_tokens": 227392982.0, + "step": 5959 + }, + { + "epoch": 0.7581732603994403, + "ewc_loss": 0.05215228721499443, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00022733341029379517, + "grad_norm": 6.663650989532471, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8555614948272705, + "num_tokens": 227431878.0, + "step": 5960 + }, + { + "epoch": 0.7583004706780307, + "ewc_loss": 0.0518488809466362, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.0002267407689942047, + "grad_norm": 5.868136882781982, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8495733141899109, + "num_tokens": 227470401.0, + "step": 5961 + }, + { + "epoch": 0.7584276809566213, + "ewc_loss": 0.04971955716609955, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0002078889519907534, + "grad_norm": 6.021884918212891, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8645353317260742, + "num_tokens": 227507189.0, + "step": 5962 + }, + { + "epoch": 0.7585548912352118, + "ewc_loss": 0.0506996288895607, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.0002152482484234497, + "grad_norm": 5.962874889373779, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8631666898727417, + "num_tokens": 227548070.0, + "step": 5963 + }, + { + "epoch": 0.7586821015138023, + "ewc_loss": 0.04987166076898575, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00020696857245638967, + "grad_norm": 6.0367207527160645, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8650625944137573, + "num_tokens": 227584360.0, + "step": 5964 + }, + { + "epoch": 0.7588093117923929, + "ewc_loss": 0.04940663278102875, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00020720111206173897, + "grad_norm": 5.9272260665893555, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8655754923820496, + "num_tokens": 227618961.0, + "step": 5965 + }, + { + "epoch": 0.7589365220709834, + "ewc_loss": 0.048947304487228394, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00020260778546798974, + "grad_norm": 5.833368301391602, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8607057929039001, + "num_tokens": 227658304.0, + "step": 5966 + }, + { + "epoch": 0.7590637323495738, + "ewc_loss": 0.04895107075572014, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0002026454749284312, + "grad_norm": 5.814998149871826, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8579568862915039, + "num_tokens": 227701526.0, + "step": 5967 + }, + { + "epoch": 0.7591909426281643, + "ewc_loss": 0.048788927495479584, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00020102404232602566, + "grad_norm": 5.787914752960205, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8645532131195068, + "num_tokens": 227744930.0, + "step": 5968 + }, + { + "epoch": 0.7593181529067549, + "ewc_loss": 0.048700787127017975, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00020014261826872826, + "grad_norm": 5.838253974914551, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8708007335662842, + "num_tokens": 227780607.0, + "step": 5969 + }, + { + "epoch": 0.7594453631853454, + "ewc_loss": 0.04856954514980316, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019883022469002753, + "grad_norm": 5.721366882324219, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8570926189422607, + "num_tokens": 227825742.0, + "step": 5970 + }, + { + "epoch": 0.7595725734639359, + "ewc_loss": 0.04855397716164589, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019867453374899924, + "grad_norm": 5.811789035797119, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8401798605918884, + "num_tokens": 227865278.0, + "step": 5971 + }, + { + "epoch": 0.7596997837425264, + "ewc_loss": 0.0484081469476223, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001984369446290657, + "grad_norm": 5.827140808105469, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8565990924835205, + "num_tokens": 227906912.0, + "step": 5972 + }, + { + "epoch": 0.7598269940211169, + "ewc_loss": 0.04831250011920929, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019748049089685082, + "grad_norm": 5.723024368286133, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8576377034187317, + "num_tokens": 227947067.0, + "step": 5973 + }, + { + "epoch": 0.7599542042997074, + "ewc_loss": 0.04832060635089874, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001975615305127576, + "grad_norm": 5.800917148590088, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8418203592300415, + "num_tokens": 227988563.0, + "step": 5974 + }, + { + "epoch": 0.7600814145782979, + "ewc_loss": 0.048426419496536255, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001973989710677415, + "grad_norm": 5.782634258270264, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8532791137695312, + "num_tokens": 228028556.0, + "step": 5975 + }, + { + "epoch": 0.7602086248568884, + "ewc_loss": 0.04834691435098648, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001966039271792397, + "grad_norm": 5.713836193084717, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8743276596069336, + "num_tokens": 228066265.0, + "step": 5976 + }, + { + "epoch": 0.760335835135479, + "ewc_loss": 0.04836955666542053, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019683034042827785, + "grad_norm": 5.7125773429870605, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8575139045715332, + "num_tokens": 228103732.0, + "step": 5977 + }, + { + "epoch": 0.7604630454140695, + "ewc_loss": 0.0482339933514595, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019669542962219566, + "grad_norm": 5.707237243652344, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8567049503326416, + "num_tokens": 228149609.0, + "step": 5978 + }, + { + "epoch": 0.7605902556926599, + "ewc_loss": 0.04848823696374893, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001967964053619653, + "grad_norm": 5.791441917419434, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.862989068031311, + "num_tokens": 228188462.0, + "step": 5979 + }, + { + "epoch": 0.7607174659712505, + "ewc_loss": 0.048245690762996674, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.0001968123542610556, + "grad_norm": 5.7033915519714355, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8738819360733032, + "num_tokens": 228229252.0, + "step": 5980 + }, + { + "epoch": 0.760844676249841, + "ewc_loss": 0.0484842024743557, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001967560820048675, + "grad_norm": 5.798221111297607, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8563319444656372, + "num_tokens": 228267397.0, + "step": 5981 + }, + { + "epoch": 0.7609718865284315, + "ewc_loss": 0.04837309941649437, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001968657597899437, + "grad_norm": 5.659443378448486, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8671476244926453, + "num_tokens": 228307561.0, + "step": 5982 + }, + { + "epoch": 0.761099096807022, + "ewc_loss": 0.048524849116802216, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.000197162531549111, + "grad_norm": 5.834355354309082, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8612057566642761, + "num_tokens": 228347667.0, + "step": 5983 + }, + { + "epoch": 0.7612263070856126, + "ewc_loss": 0.048262521624565125, + "ewc_loss_diag": 2.86102294921875e-05, + "ewc_loss_parallel": 0.00019698070536833256, + "grad_norm": 5.712822914123535, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8570796251296997, + "num_tokens": 228386820.0, + "step": 5984 + }, + { + "epoch": 0.761353517364203, + "ewc_loss": 0.04853484779596329, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019726253231056035, + "grad_norm": 5.736824035644531, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8689883351325989, + "num_tokens": 228427304.0, + "step": 5985 + }, + { + "epoch": 0.7614807276427935, + "ewc_loss": 0.048555269837379456, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001974667829927057, + "grad_norm": 5.7447919845581055, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8493359684944153, + "num_tokens": 228470779.0, + "step": 5986 + }, + { + "epoch": 0.761607937921384, + "ewc_loss": 0.048573046922683716, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019764450553338975, + "grad_norm": 5.768558979034424, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8756489753723145, + "num_tokens": 228500454.0, + "step": 5987 + }, + { + "epoch": 0.7617351481999746, + "ewc_loss": 0.04857369512319565, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019765101023949683, + "grad_norm": 5.741118907928467, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8679059743881226, + "num_tokens": 228537374.0, + "step": 5988 + }, + { + "epoch": 0.7618623584785651, + "ewc_loss": 0.04854338616132736, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.00019734793750103563, + "grad_norm": 5.740847587585449, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8604134917259216, + "num_tokens": 228577215.0, + "step": 5989 + }, + { + "epoch": 0.7619895687571556, + "ewc_loss": 0.04851936176419258, + "ewc_loss_diag": 2.8848648071289062e-05, + "ewc_loss_parallel": 0.0001971076853806153, + "grad_norm": 5.740967273712158, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8644921779632568, + "num_tokens": 228611693.0, + "step": 5990 + }, + { + "epoch": 0.762116779035746, + "ewc_loss": 0.0487142950296402, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019783632887993008, + "grad_norm": 5.7985639572143555, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.859024167060852, + "num_tokens": 228644231.0, + "step": 5991 + }, + { + "epoch": 0.7622439893143366, + "ewc_loss": 0.04870273917913437, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019772074301727116, + "grad_norm": 5.7213873863220215, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8706257343292236, + "num_tokens": 228684776.0, + "step": 5992 + }, + { + "epoch": 0.7623711995929271, + "ewc_loss": 0.04867086187005043, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019740196876227856, + "grad_norm": 5.757044315338135, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8659440279006958, + "num_tokens": 228724214.0, + "step": 5993 + }, + { + "epoch": 0.7624984098715176, + "ewc_loss": 0.048729557543992996, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019798893481492996, + "grad_norm": 5.772300720214844, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8595016002655029, + "num_tokens": 228765098.0, + "step": 5994 + }, + { + "epoch": 0.7626256201501082, + "ewc_loss": 0.04865926504135132, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019728601910173893, + "grad_norm": 5.75115966796875, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8615119457244873, + "num_tokens": 228802423.0, + "step": 5995 + }, + { + "epoch": 0.7627528304286987, + "ewc_loss": 0.04865769296884537, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001972703030332923, + "grad_norm": 5.78452730178833, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8505263328552246, + "num_tokens": 228840218.0, + "step": 5996 + }, + { + "epoch": 0.7628800407072891, + "ewc_loss": 0.04871132969856262, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019780667207669467, + "grad_norm": 5.747585296630859, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8652745485305786, + "num_tokens": 228882810.0, + "step": 5997 + }, + { + "epoch": 0.7630072509858796, + "ewc_loss": 0.04868200421333313, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019751339277718216, + "grad_norm": 5.810128688812256, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8661825656890869, + "num_tokens": 228917664.0, + "step": 5998 + }, + { + "epoch": 0.7631344612644702, + "ewc_loss": 0.04873194172978401, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019801278540398926, + "grad_norm": 5.816966533660889, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8675490617752075, + "num_tokens": 228951017.0, + "step": 5999 + }, + { + "epoch": 0.7632616715430607, + "ewc_loss": 0.04862532019615173, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019694658112712204, + "grad_norm": 5.719345569610596, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.862174391746521, + "num_tokens": 228990949.0, + "step": 6000 + }, + { + "epoch": 0.7633888818216512, + "ewc_loss": 0.04869993403553963, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001976927014766261, + "grad_norm": 5.786059856414795, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8652979135513306, + "num_tokens": 229026604.0, + "step": 6001 + }, + { + "epoch": 0.7635160921002417, + "ewc_loss": 0.04840291291475296, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001971639139810577, + "grad_norm": 5.749255657196045, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8546977043151855, + "num_tokens": 229061810.0, + "step": 6002 + }, + { + "epoch": 0.7636433023788322, + "ewc_loss": 0.048417508602142334, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019730982603505254, + "grad_norm": 5.721907138824463, + "learning_rate": 1e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8374353647232056, + "num_tokens": 229107765.0, + "step": 6003 + }, + { + "epoch": 0.7637705126574227, + "ewc_loss": 0.04889392852783203, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.0001971912570297718, + "grad_norm": 5.788161754608154, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8480616211891174, + "num_tokens": 229147822.0, + "step": 6004 + }, + { + "epoch": 0.7638977229360132, + "ewc_loss": 0.048610251396894455, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019679588149301708, + "grad_norm": 5.843513011932373, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8699133396148682, + "num_tokens": 229184421.0, + "step": 6005 + }, + { + "epoch": 0.7640249332146037, + "ewc_loss": 0.048389680683612823, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019703159341588616, + "grad_norm": 5.7358717918396, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8602034449577332, + "num_tokens": 229221737.0, + "step": 6006 + }, + { + "epoch": 0.7641521434931943, + "ewc_loss": 0.0486367866396904, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019706123566720635, + "grad_norm": 5.787471771240234, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8543218970298767, + "num_tokens": 229254727.0, + "step": 6007 + }, + { + "epoch": 0.7642793537717848, + "ewc_loss": 0.048426155000925064, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019739630806725472, + "grad_norm": 5.756568431854248, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8702490329742432, + "num_tokens": 229292648.0, + "step": 6008 + }, + { + "epoch": 0.7644065640503753, + "ewc_loss": 0.04842837154865265, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019741848518606275, + "grad_norm": 5.740278244018555, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8650697469711304, + "num_tokens": 229335708.0, + "step": 6009 + }, + { + "epoch": 0.7645337743289657, + "ewc_loss": 0.04867509752511978, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001974443148355931, + "grad_norm": 5.70281982421875, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8684825897216797, + "num_tokens": 229373791.0, + "step": 6010 + }, + { + "epoch": 0.7646609846075563, + "ewc_loss": 0.048712071031332016, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001978140790015459, + "grad_norm": 5.788173675537109, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8699341416358948, + "num_tokens": 229409891.0, + "step": 6011 + }, + { + "epoch": 0.7647881948861468, + "ewc_loss": 0.04845418781042099, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019767662161029875, + "grad_norm": 5.73007869720459, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.862116277217865, + "num_tokens": 229449859.0, + "step": 6012 + }, + { + "epoch": 0.7649154051647373, + "ewc_loss": 0.04867004603147507, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001973938342416659, + "grad_norm": 5.7425971031188965, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8680622577667236, + "num_tokens": 229484224.0, + "step": 6013 + }, + { + "epoch": 0.7650426154433279, + "ewc_loss": 0.048723362386226654, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019792697275988758, + "grad_norm": 5.807680606842041, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8550024628639221, + "num_tokens": 229521104.0, + "step": 6014 + }, + { + "epoch": 0.7651698257219184, + "ewc_loss": 0.04867725446820259, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019746589532587677, + "grad_norm": 5.70806884765625, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8445448875427246, + "num_tokens": 229558918.0, + "step": 6015 + }, + { + "epoch": 0.7652970360005088, + "ewc_loss": 0.04884415864944458, + "ewc_loss_diag": 2.9087066650390625e-05, + "ewc_loss_parallel": 0.00019791423983406276, + "grad_norm": 5.775394916534424, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8469700813293457, + "num_tokens": 229596931.0, + "step": 6016 + }, + { + "epoch": 0.7654242462790993, + "ewc_loss": 0.04878911375999451, + "ewc_loss_diag": 2.9087066650390625e-05, + "ewc_loss_parallel": 0.0001973637699848041, + "grad_norm": 5.7053303718566895, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8445826768875122, + "num_tokens": 229633903.0, + "step": 6017 + }, + { + "epoch": 0.7655514565576899, + "ewc_loss": 0.048514414578676224, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019827891082968563, + "grad_norm": 5.750467300415039, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8571571111679077, + "num_tokens": 229674224.0, + "step": 6018 + }, + { + "epoch": 0.7656786668362804, + "ewc_loss": 0.048487961292266846, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001980144006665796, + "grad_norm": 5.710472106933594, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.858156681060791, + "num_tokens": 229710547.0, + "step": 6019 + }, + { + "epoch": 0.7658058771148709, + "ewc_loss": 0.048609476536512375, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019922952924389392, + "grad_norm": 5.759857177734375, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8757133483886719, + "num_tokens": 229752020.0, + "step": 6020 + }, + { + "epoch": 0.7659330873934614, + "ewc_loss": 0.04849992319941521, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019813398830592632, + "grad_norm": 5.755948066711426, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8584182262420654, + "num_tokens": 229785710.0, + "step": 6021 + }, + { + "epoch": 0.7660602976720519, + "ewc_loss": 0.04852376505732536, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.00019837242143694311, + "grad_norm": 5.7522759437561035, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8672393560409546, + "num_tokens": 229824118.0, + "step": 6022 + }, + { + "epoch": 0.7661875079506424, + "ewc_loss": 0.048731960356235504, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.00019801298913080245, + "grad_norm": 5.758548736572266, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8731410503387451, + "num_tokens": 229854231.0, + "step": 6023 + }, + { + "epoch": 0.7663147182292329, + "ewc_loss": 0.048753708600997925, + "ewc_loss_diag": 2.8967857360839844e-05, + "ewc_loss_parallel": 0.0001982304675038904, + "grad_norm": 5.731037139892578, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8521029949188232, + "num_tokens": 229888102.0, + "step": 6024 + }, + { + "epoch": 0.7664419285078234, + "ewc_loss": 0.04847349971532822, + "ewc_loss_diag": 2.872943878173828e-05, + "ewc_loss_parallel": 0.0001978697400772944, + "grad_norm": 5.733343124389648, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.864161491394043, + "num_tokens": 229927634.0, + "step": 6025 + }, + { + "epoch": 0.766569138786414, + "ewc_loss": 0.04901053011417389, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.0001983572292374447, + "grad_norm": 5.716583728790283, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8686242699623108, + "num_tokens": 229966526.0, + "step": 6026 + }, + { + "epoch": 0.7666963490650045, + "ewc_loss": 0.048982687294483185, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019807885109912604, + "grad_norm": 5.770829677581787, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8699978590011597, + "num_tokens": 230002761.0, + "step": 6027 + }, + { + "epoch": 0.7668235593435949, + "ewc_loss": 0.04897207021713257, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019797265122178942, + "grad_norm": 5.7106614112854, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.866105318069458, + "num_tokens": 230041303.0, + "step": 6028 + }, + { + "epoch": 0.7669507696221854, + "ewc_loss": 0.049098312854766846, + "ewc_loss_diag": 2.9325485229492188e-05, + "ewc_loss_parallel": 0.0001980144006665796, + "grad_norm": 5.749156951904297, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8651722073554993, + "num_tokens": 230081117.0, + "step": 6029 + }, + { + "epoch": 0.767077979900776, + "ewc_loss": 0.04899334907531738, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019818545843008906, + "grad_norm": 5.7518486976623535, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8553041815757751, + "num_tokens": 230119411.0, + "step": 6030 + }, + { + "epoch": 0.7672051901793665, + "ewc_loss": 0.04908234626054764, + "ewc_loss_diag": 2.9325485229492188e-05, + "ewc_loss_parallel": 0.0001978547079488635, + "grad_norm": 5.674310684204102, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8669564127922058, + "num_tokens": 230161111.0, + "step": 6031 + }, + { + "epoch": 0.767332400457957, + "ewc_loss": 0.04905080795288086, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019876005535479635, + "grad_norm": 5.8581318855285645, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8559359312057495, + "num_tokens": 230196999.0, + "step": 6032 + }, + { + "epoch": 0.7674596107365476, + "ewc_loss": 0.04898177087306976, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019806966884061694, + "grad_norm": 5.754560470581055, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8616390228271484, + "num_tokens": 230231495.0, + "step": 6033 + }, + { + "epoch": 0.767586821015138, + "ewc_loss": 0.0489022359251976, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019727429025806487, + "grad_norm": 5.785147190093994, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8532580137252808, + "num_tokens": 230267732.0, + "step": 6034 + }, + { + "epoch": 0.7677140312937285, + "ewc_loss": 0.048984769731760025, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.00019809964578598738, + "grad_norm": 5.810576438903809, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8604655861854553, + "num_tokens": 230297559.0, + "step": 6035 + }, + { + "epoch": 0.767841241572319, + "ewc_loss": 0.04891651123762131, + "ewc_loss_diag": 2.9206275939941406e-05, + "ewc_loss_parallel": 0.0001974170736502856, + "grad_norm": 5.718212604522705, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8482940196990967, + "num_tokens": 230336396.0, + "step": 6036 + }, + { + "epoch": 0.7679684518509096, + "ewc_loss": 0.04921053349971771, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.0001979158550966531, + "grad_norm": 5.750172138214111, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8607818484306335, + "num_tokens": 230368152.0, + "step": 6037 + }, + { + "epoch": 0.7680956621295001, + "ewc_loss": 0.04911230504512787, + "ewc_loss_diag": 2.9325485229492188e-05, + "ewc_loss_parallel": 0.00019815431733150035, + "grad_norm": 5.707828521728516, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8667577505111694, + "num_tokens": 230408081.0, + "step": 6038 + }, + { + "epoch": 0.7682228724080906, + "ewc_loss": 0.049245595932006836, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019826649804599583, + "grad_norm": 5.790038108825684, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8529207110404968, + "num_tokens": 230441691.0, + "step": 6039 + }, + { + "epoch": 0.768350082686681, + "ewc_loss": 0.049219537526369095, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.0001980059314519167, + "grad_norm": 5.709406852722168, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8580331206321716, + "num_tokens": 230480089.0, + "step": 6040 + }, + { + "epoch": 0.7684772929652716, + "ewc_loss": 0.04928136244416237, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019862416957039386, + "grad_norm": 5.753717422485352, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8551149964332581, + "num_tokens": 230518655.0, + "step": 6041 + }, + { + "epoch": 0.7686045032438621, + "ewc_loss": 0.04924726486206055, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019828317454084754, + "grad_norm": 5.753386497497559, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8640461564064026, + "num_tokens": 230555717.0, + "step": 6042 + }, + { + "epoch": 0.7687317135224526, + "ewc_loss": 0.04928121715784073, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019862272893078625, + "grad_norm": 5.720555305480957, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8687719106674194, + "num_tokens": 230596699.0, + "step": 6043 + }, + { + "epoch": 0.7688589238010431, + "ewc_loss": 0.04953581839799881, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019872734264936298, + "grad_norm": 5.783785820007324, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.869005024433136, + "num_tokens": 230633284.0, + "step": 6044 + }, + { + "epoch": 0.7689861340796337, + "ewc_loss": 0.04926517978310585, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.0001984623377211392, + "grad_norm": 5.72078275680542, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8635714054107666, + "num_tokens": 230673133.0, + "step": 6045 + }, + { + "epoch": 0.7691133443582241, + "ewc_loss": 0.04924364015460014, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019824695482384413, + "grad_norm": 5.723403453826904, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8619301319122314, + "num_tokens": 230719003.0, + "step": 6046 + }, + { + "epoch": 0.7692405546368146, + "ewc_loss": 0.04922424256801605, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019805297779385, + "grad_norm": 5.758659839630127, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8704516887664795, + "num_tokens": 230754805.0, + "step": 6047 + }, + { + "epoch": 0.7693677649154052, + "ewc_loss": 0.049232445657253265, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019813502149190754, + "grad_norm": 5.7867560386657715, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8472477197647095, + "num_tokens": 230793249.0, + "step": 6048 + }, + { + "epoch": 0.7694949751939957, + "ewc_loss": 0.04906298220157623, + "ewc_loss_diag": 2.9325485229492188e-05, + "ewc_loss_parallel": 0.0001976610510610044, + "grad_norm": 5.73728084564209, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8771107196807861, + "num_tokens": 230826025.0, + "step": 6049 + }, + { + "epoch": 0.7696221854725862, + "ewc_loss": 0.049189068377017975, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019770124345086515, + "grad_norm": 5.699733257293701, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8718768358230591, + "num_tokens": 230866615.0, + "step": 6050 + }, + { + "epoch": 0.7697493957511767, + "ewc_loss": 0.04919750988483429, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019778564455918968, + "grad_norm": 5.873684883117676, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8451588749885559, + "num_tokens": 230901587.0, + "step": 6051 + }, + { + "epoch": 0.7698766060297672, + "ewc_loss": 0.04917103797197342, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019752093066927046, + "grad_norm": 5.641973495483398, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8664543628692627, + "num_tokens": 230944267.0, + "step": 6052 + }, + { + "epoch": 0.7700038163083577, + "ewc_loss": 0.04922270029783249, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019803752365987748, + "grad_norm": 5.847389221191406, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8527283668518066, + "num_tokens": 230980022.0, + "step": 6053 + }, + { + "epoch": 0.7701310265869482, + "ewc_loss": 0.04919575899839401, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019776812405325472, + "grad_norm": 5.678046226501465, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8675839900970459, + "num_tokens": 231018110.0, + "step": 6054 + }, + { + "epoch": 0.7702582368655387, + "ewc_loss": 0.04924885928630829, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.0001982991525437683, + "grad_norm": 5.746004581451416, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8654358386993408, + "num_tokens": 231056531.0, + "step": 6055 + }, + { + "epoch": 0.7703854471441293, + "ewc_loss": 0.04917879402637482, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019759847782552242, + "grad_norm": 5.692092418670654, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8662040829658508, + "num_tokens": 231096270.0, + "step": 6056 + }, + { + "epoch": 0.7705126574227198, + "ewc_loss": 0.04923710599541664, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019818160217255354, + "grad_norm": 5.740546703338623, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8536866903305054, + "num_tokens": 231136300.0, + "step": 6057 + }, + { + "epoch": 0.7706398677013102, + "ewc_loss": 0.04926106333732605, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019842115580104291, + "grad_norm": 5.759403705596924, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8485114574432373, + "num_tokens": 231169005.0, + "step": 6058 + }, + { + "epoch": 0.7707670779799007, + "ewc_loss": 0.04926253855228424, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019843594054691494, + "grad_norm": 5.7848005294799805, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8674726486206055, + "num_tokens": 231205274.0, + "step": 6059 + }, + { + "epoch": 0.7708942882584913, + "ewc_loss": 0.04918952286243439, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019770579820033163, + "grad_norm": 5.738844394683838, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8664395213127136, + "num_tokens": 231239022.0, + "step": 6060 + }, + { + "epoch": 0.7710214985370818, + "ewc_loss": 0.04927043616771698, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019851489923894405, + "grad_norm": 5.759010314941406, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8426337838172913, + "num_tokens": 231282952.0, + "step": 6061 + }, + { + "epoch": 0.7711487088156723, + "ewc_loss": 0.04938029125332832, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019839275046251714, + "grad_norm": 5.7285990715026855, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8584479093551636, + "num_tokens": 231322980.0, + "step": 6062 + }, + { + "epoch": 0.7712759190942629, + "ewc_loss": 0.049348972737789154, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019807957869488746, + "grad_norm": 5.79133415222168, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8529674410820007, + "num_tokens": 231358382.0, + "step": 6063 + }, + { + "epoch": 0.7714031293728534, + "ewc_loss": 0.04941088706254959, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019869868992827833, + "grad_norm": 5.77526330947876, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8692786693572998, + "num_tokens": 231401915.0, + "step": 6064 + }, + { + "epoch": 0.7715303396514438, + "ewc_loss": 0.049301281571388245, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001976026687771082, + "grad_norm": 5.813100337982178, + "learning_rate": 1e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8378452658653259, + "num_tokens": 231433241.0, + "step": 6065 + }, + { + "epoch": 0.7716575499300343, + "ewc_loss": 0.049342505633831024, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001980149099836126, + "grad_norm": 5.708467483520508, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8563852310180664, + "num_tokens": 231478318.0, + "step": 6066 + }, + { + "epoch": 0.7717847602086249, + "ewc_loss": 0.04930677264928818, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001976575586013496, + "grad_norm": 5.730264663696289, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8527601957321167, + "num_tokens": 231518584.0, + "step": 6067 + }, + { + "epoch": 0.7719119704872154, + "ewc_loss": 0.04934648051857948, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019805465126410127, + "grad_norm": 5.707643985748291, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8611177802085876, + "num_tokens": 231556689.0, + "step": 6068 + }, + { + "epoch": 0.7720391807658059, + "ewc_loss": 0.04932036250829697, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001977934589376673, + "grad_norm": 5.79539680480957, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8593546152114868, + "num_tokens": 231593936.0, + "step": 6069 + }, + { + "epoch": 0.7721663910443964, + "ewc_loss": 0.04939190670847893, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019850890384986997, + "grad_norm": 5.70024299621582, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8800749778747559, + "num_tokens": 231634785.0, + "step": 6070 + }, + { + "epoch": 0.7722936013229869, + "ewc_loss": 0.04937766492366791, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001983664697036147, + "grad_norm": 5.77145528793335, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8565868139266968, + "num_tokens": 231675798.0, + "step": 6071 + }, + { + "epoch": 0.7724208116015774, + "ewc_loss": 0.0494077205657959, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001986670249607414, + "grad_norm": 5.806922912597656, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8581019639968872, + "num_tokens": 231711611.0, + "step": 6072 + }, + { + "epoch": 0.7725480218801679, + "ewc_loss": 0.049339957535266876, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019798940047621727, + "grad_norm": 5.750394344329834, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8507370352745056, + "num_tokens": 231746997.0, + "step": 6073 + }, + { + "epoch": 0.7726752321587584, + "ewc_loss": 0.04937349259853363, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019832479301840067, + "grad_norm": 5.751362323760986, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8404909372329712, + "num_tokens": 231787696.0, + "step": 6074 + }, + { + "epoch": 0.772802442437349, + "ewc_loss": 0.049429312348365784, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001988829899346456, + "grad_norm": 5.795015811920166, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8520479202270508, + "num_tokens": 231826127.0, + "step": 6075 + }, + { + "epoch": 0.7729296527159395, + "ewc_loss": 0.04939064383506775, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019849628733936697, + "grad_norm": 5.765031337738037, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8543822765350342, + "num_tokens": 231857538.0, + "step": 6076 + }, + { + "epoch": 0.7730568629945299, + "ewc_loss": 0.04943300783634186, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001989199226954952, + "grad_norm": 5.74454927444458, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8590060472488403, + "num_tokens": 231893913.0, + "step": 6077 + }, + { + "epoch": 0.7731840732731204, + "ewc_loss": 0.04942312836647034, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019882110063917935, + "grad_norm": 5.73247766494751, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8626154661178589, + "num_tokens": 231934274.0, + "step": 6078 + }, + { + "epoch": 0.773311283551711, + "ewc_loss": 0.04945862293243408, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019917608005926013, + "grad_norm": 5.72324800491333, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8538538217544556, + "num_tokens": 231983197.0, + "step": 6079 + }, + { + "epoch": 0.7734384938303015, + "ewc_loss": 0.0494704507291317, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001992943580262363, + "grad_norm": 5.776266574859619, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8556597828865051, + "num_tokens": 232021265.0, + "step": 6080 + }, + { + "epoch": 0.773565704108892, + "ewc_loss": 0.04947464168071747, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019933623843826354, + "grad_norm": 5.725919246673584, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8580388426780701, + "num_tokens": 232060992.0, + "step": 6081 + }, + { + "epoch": 0.7736929143874826, + "ewc_loss": 0.049449384212493896, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019908370450139046, + "grad_norm": 5.8068671226501465, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8567663431167603, + "num_tokens": 232096758.0, + "step": 6082 + }, + { + "epoch": 0.773820124666073, + "ewc_loss": 0.04935865476727486, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019939709454774857, + "grad_norm": 5.782403469085693, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8572604656219482, + "num_tokens": 232133773.0, + "step": 6083 + }, + { + "epoch": 0.7739473349446635, + "ewc_loss": 0.04957970231771469, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019916615565307438, + "grad_norm": 5.718831539154053, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8584302663803101, + "num_tokens": 232173182.0, + "step": 6084 + }, + { + "epoch": 0.774074545223254, + "ewc_loss": 0.04945816099643707, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001991714525502175, + "grad_norm": 5.823055267333984, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8676959276199341, + "num_tokens": 232206478.0, + "step": 6085 + }, + { + "epoch": 0.7742017555018446, + "ewc_loss": 0.04930679500102997, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019887852249667048, + "grad_norm": 5.8173441886901855, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8536955714225769, + "num_tokens": 232242231.0, + "step": 6086 + }, + { + "epoch": 0.7743289657804351, + "ewc_loss": 0.049348022788763046, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019807007629424334, + "grad_norm": 5.778904438018799, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8631255626678467, + "num_tokens": 232270636.0, + "step": 6087 + }, + { + "epoch": 0.7744561760590256, + "ewc_loss": 0.049416109919548035, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019875096040777862, + "grad_norm": 5.769468307495117, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8650202751159668, + "num_tokens": 232309974.0, + "step": 6088 + }, + { + "epoch": 0.774583386337616, + "ewc_loss": 0.04935178905725479, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019810775120276958, + "grad_norm": 5.748225688934326, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8689815998077393, + "num_tokens": 232344118.0, + "step": 6089 + }, + { + "epoch": 0.7747105966162066, + "ewc_loss": 0.04936062544584274, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.000198196095880121, + "grad_norm": 5.701426029205322, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8500520586967468, + "num_tokens": 232385183.0, + "step": 6090 + }, + { + "epoch": 0.7748378068947971, + "ewc_loss": 0.0494653582572937, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.000199243426322937, + "grad_norm": 5.776501178741455, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8606505990028381, + "num_tokens": 232421670.0, + "step": 6091 + }, + { + "epoch": 0.7749650171733876, + "ewc_loss": 0.04935288429260254, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019811866513919085, + "grad_norm": 5.743775844573975, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8493245840072632, + "num_tokens": 232459563.0, + "step": 6092 + }, + { + "epoch": 0.7750922274519781, + "ewc_loss": 0.04939045011997223, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019849432283081114, + "grad_norm": 5.779514789581299, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.859861433506012, + "num_tokens": 232496357.0, + "step": 6093 + }, + { + "epoch": 0.7752194377305687, + "ewc_loss": 0.04942086338996887, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001987985015148297, + "grad_norm": 5.7909016609191895, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8599214553833008, + "num_tokens": 232530436.0, + "step": 6094 + }, + { + "epoch": 0.7753466480091591, + "ewc_loss": 0.04939042776823044, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019849411910399795, + "grad_norm": 5.739744186401367, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8543148040771484, + "num_tokens": 232568585.0, + "step": 6095 + }, + { + "epoch": 0.7754738582877496, + "ewc_loss": 0.04942959547042847, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019888579845428467, + "grad_norm": 5.765961647033691, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8662034273147583, + "num_tokens": 232602506.0, + "step": 6096 + }, + { + "epoch": 0.7756010685663401, + "ewc_loss": 0.04938046634197235, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.000198394525796175, + "grad_norm": 5.703269958496094, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8677461743354797, + "num_tokens": 232642819.0, + "step": 6097 + }, + { + "epoch": 0.7757282788449307, + "ewc_loss": 0.049438074231147766, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019897057791240513, + "grad_norm": 5.815900802612305, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8469975590705872, + "num_tokens": 232679505.0, + "step": 6098 + }, + { + "epoch": 0.7758554891235212, + "ewc_loss": 0.049660101532936096, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019997016352135688, + "grad_norm": 5.828433513641357, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.864766538143158, + "num_tokens": 232715701.0, + "step": 6099 + }, + { + "epoch": 0.7759826994021117, + "ewc_loss": 0.049492716789245605, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019829632947221398, + "grad_norm": 5.838054180145264, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8618454933166504, + "num_tokens": 232750786.0, + "step": 6100 + }, + { + "epoch": 0.7761099096807021, + "ewc_loss": 0.049419865012168884, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019878851890098304, + "grad_norm": 5.718584060668945, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8501026630401611, + "num_tokens": 232791370.0, + "step": 6101 + }, + { + "epoch": 0.7762371199592927, + "ewc_loss": 0.049317825585603714, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019898881146218628, + "grad_norm": 5.764339447021484, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8614629507064819, + "num_tokens": 232830895.0, + "step": 6102 + }, + { + "epoch": 0.7763643302378832, + "ewc_loss": 0.049394555389881134, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019853541743941605, + "grad_norm": 5.721362113952637, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.859078586101532, + "num_tokens": 232869940.0, + "step": 6103 + }, + { + "epoch": 0.7764915405164737, + "ewc_loss": 0.0493793748319149, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019960429926868528, + "grad_norm": 5.765819072723389, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8540905714035034, + "num_tokens": 232910743.0, + "step": 6104 + }, + { + "epoch": 0.7766187507950643, + "ewc_loss": 0.04943319782614708, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019892181444447488, + "grad_norm": 5.748299598693848, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.859000027179718, + "num_tokens": 232948428.0, + "step": 6105 + }, + { + "epoch": 0.7767459610736548, + "ewc_loss": 0.04933246225118637, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019913516007363796, + "grad_norm": 5.774012565612793, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8662028312683105, + "num_tokens": 232985142.0, + "step": 6106 + }, + { + "epoch": 0.7768731713522452, + "ewc_loss": 0.04946688935160637, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019925873493775725, + "grad_norm": 5.789148330688477, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.869629442691803, + "num_tokens": 233020810.0, + "step": 6107 + }, + { + "epoch": 0.7770003816308357, + "ewc_loss": 0.04942217841744423, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019881162734236568, + "grad_norm": 5.792534351348877, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8515223264694214, + "num_tokens": 233055674.0, + "step": 6108 + }, + { + "epoch": 0.7771275919094263, + "ewc_loss": 0.04938933998346329, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019970392168033868, + "grad_norm": 5.852530479431152, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8551770448684692, + "num_tokens": 233084506.0, + "step": 6109 + }, + { + "epoch": 0.7772548021880168, + "ewc_loss": 0.04927990585565567, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019860963220708072, + "grad_norm": 5.7435197830200195, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8774071335792542, + "num_tokens": 233120172.0, + "step": 6110 + }, + { + "epoch": 0.7773820124666073, + "ewc_loss": 0.049319375306367874, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019900429469998926, + "grad_norm": 5.79106330871582, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8497845530509949, + "num_tokens": 233160075.0, + "step": 6111 + }, + { + "epoch": 0.7775092227451978, + "ewc_loss": 0.04930972680449486, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.0001989078155020252, + "grad_norm": 5.745293617248535, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8519599437713623, + "num_tokens": 233201714.0, + "step": 6112 + }, + { + "epoch": 0.7776364330237884, + "ewc_loss": 0.04945729300379753, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019916276505682617, + "grad_norm": 5.768608093261719, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8657531142234802, + "num_tokens": 233237642.0, + "step": 6113 + }, + { + "epoch": 0.7777636433023788, + "ewc_loss": 0.04945443570613861, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019913421419914812, + "grad_norm": 5.7456746101379395, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8654794692993164, + "num_tokens": 233279712.0, + "step": 6114 + }, + { + "epoch": 0.7778908535809693, + "ewc_loss": 0.04941197857260704, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019870963296853006, + "grad_norm": 5.748671531677246, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8653771281242371, + "num_tokens": 233322441.0, + "step": 6115 + }, + { + "epoch": 0.7780180638595598, + "ewc_loss": 0.04946792870759964, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019926913955714554, + "grad_norm": 5.79965877532959, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8539029359817505, + "num_tokens": 233371053.0, + "step": 6116 + }, + { + "epoch": 0.7781452741381504, + "ewc_loss": 0.049525488168001175, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019862402405124158, + "grad_norm": 5.777641296386719, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8610272407531738, + "num_tokens": 233405372.0, + "step": 6117 + }, + { + "epoch": 0.7782724844167409, + "ewc_loss": 0.049390293657779694, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001984927657758817, + "grad_norm": 5.8426289558410645, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8662284016609192, + "num_tokens": 233447209.0, + "step": 6118 + }, + { + "epoch": 0.7783996946953314, + "ewc_loss": 0.04954797029495239, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019884883658960462, + "grad_norm": 5.765011310577393, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8644790649414062, + "num_tokens": 233484621.0, + "step": 6119 + }, + { + "epoch": 0.7785269049739219, + "ewc_loss": 0.049374401569366455, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019833385886158794, + "grad_norm": 5.829294204711914, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8436967730522156, + "num_tokens": 233520756.0, + "step": 6120 + }, + { + "epoch": 0.7786541152525124, + "ewc_loss": 0.04951788857579231, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019854803394991904, + "grad_norm": 5.830665588378906, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8528259992599487, + "num_tokens": 233559120.0, + "step": 6121 + }, + { + "epoch": 0.7787813255311029, + "ewc_loss": 0.04932978004217148, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019788762438111007, + "grad_norm": 5.848202228546143, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8704565167427063, + "num_tokens": 233590513.0, + "step": 6122 + }, + { + "epoch": 0.7789085358096934, + "ewc_loss": 0.04935868829488754, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019817674183286726, + "grad_norm": 5.7525529861450195, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8762324452400208, + "num_tokens": 233625986.0, + "step": 6123 + }, + { + "epoch": 0.779035746088284, + "ewc_loss": 0.04931539297103882, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019774377869907767, + "grad_norm": 5.821476459503174, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.868600606918335, + "num_tokens": 233655097.0, + "step": 6124 + }, + { + "epoch": 0.7791629563668745, + "ewc_loss": 0.04933041334152222, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019789398356806487, + "grad_norm": 5.797431468963623, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8499725461006165, + "num_tokens": 233687815.0, + "step": 6125 + }, + { + "epoch": 0.7792901666454649, + "ewc_loss": 0.04933036118745804, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019789345969911665, + "grad_norm": 5.778768062591553, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8735532760620117, + "num_tokens": 233722611.0, + "step": 6126 + }, + { + "epoch": 0.7794173769240554, + "ewc_loss": 0.049367062747478485, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019826045900117606, + "grad_norm": 5.7544636726379395, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8475803136825562, + "num_tokens": 233765345.0, + "step": 6127 + }, + { + "epoch": 0.779544587202646, + "ewc_loss": 0.04936520755290985, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001982419053092599, + "grad_norm": 5.739104270935059, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.861603856086731, + "num_tokens": 233804358.0, + "step": 6128 + }, + { + "epoch": 0.7796717974812365, + "ewc_loss": 0.04940880089998245, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019867786613758653, + "grad_norm": 5.774422645568848, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8487114310264587, + "num_tokens": 233845117.0, + "step": 6129 + }, + { + "epoch": 0.779799007759827, + "ewc_loss": 0.049447208642959595, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019906190573237836, + "grad_norm": 5.769612789154053, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8524495363235474, + "num_tokens": 233884091.0, + "step": 6130 + }, + { + "epoch": 0.7799262180384176, + "ewc_loss": 0.049408718943595886, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019867703667841852, + "grad_norm": 5.7886481285095215, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8578249216079712, + "num_tokens": 233922590.0, + "step": 6131 + }, + { + "epoch": 0.780053428317008, + "ewc_loss": 0.049402058124542236, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001986104471143335, + "grad_norm": 5.768764019012451, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8622698783874512, + "num_tokens": 233954138.0, + "step": 6132 + }, + { + "epoch": 0.7801806385955985, + "ewc_loss": 0.04959164559841156, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019928559777326882, + "grad_norm": 5.802132606506348, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.869609534740448, + "num_tokens": 233988035.0, + "step": 6133 + }, + { + "epoch": 0.780307848874189, + "ewc_loss": 0.0493951216340065, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019854106358252466, + "grad_norm": 5.82064151763916, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8495515584945679, + "num_tokens": 234023747.0, + "step": 6134 + }, + { + "epoch": 0.7804350591527796, + "ewc_loss": 0.049416184425354004, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019875168800354004, + "grad_norm": 5.74389123916626, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8687953948974609, + "num_tokens": 234064182.0, + "step": 6135 + }, + { + "epoch": 0.7805622694313701, + "ewc_loss": 0.04939742758870125, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019856412836816162, + "grad_norm": 5.835330963134766, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8431157469749451, + "num_tokens": 234106533.0, + "step": 6136 + }, + { + "epoch": 0.7806894797099606, + "ewc_loss": 0.049400102347135544, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019859087478835136, + "grad_norm": 5.706843852996826, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8730857372283936, + "num_tokens": 234145492.0, + "step": 6137 + }, + { + "epoch": 0.780816689988551, + "ewc_loss": 0.049418069422245026, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.000198770547285676, + "grad_norm": 5.815219402313232, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.862516462802887, + "num_tokens": 234181709.0, + "step": 6138 + }, + { + "epoch": 0.7809439002671416, + "ewc_loss": 0.049427542835474014, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019886528025381267, + "grad_norm": 5.745954513549805, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.860146164894104, + "num_tokens": 234217854.0, + "step": 6139 + }, + { + "epoch": 0.7810711105457321, + "ewc_loss": 0.04936673492193222, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001982571993721649, + "grad_norm": 5.736785411834717, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8607604503631592, + "num_tokens": 234255769.0, + "step": 6140 + }, + { + "epoch": 0.7811983208243226, + "ewc_loss": 0.04947974905371666, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019938733021263033, + "grad_norm": 5.8239312171936035, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8457329273223877, + "num_tokens": 234290828.0, + "step": 6141 + }, + { + "epoch": 0.7813255311029131, + "ewc_loss": 0.04940192401409149, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019860909378621727, + "grad_norm": 5.751611709594727, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8596483469009399, + "num_tokens": 234326336.0, + "step": 6142 + }, + { + "epoch": 0.7814527413815037, + "ewc_loss": 0.049541160464286804, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019878073362633586, + "grad_norm": 5.8381524085998535, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8558170199394226, + "num_tokens": 234366210.0, + "step": 6143 + }, + { + "epoch": 0.7815799516600941, + "ewc_loss": 0.04941253364086151, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019871517724823207, + "grad_norm": 5.747929096221924, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8514063954353333, + "num_tokens": 234398975.0, + "step": 6144 + }, + { + "epoch": 0.7817071619386846, + "ewc_loss": 0.049325793981552124, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.00019906846864614636, + "grad_norm": 5.77736234664917, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8537462949752808, + "num_tokens": 234437746.0, + "step": 6145 + }, + { + "epoch": 0.7818343722172751, + "ewc_loss": 0.04931378364562988, + "ewc_loss_diag": 2.944469451904297e-05, + "ewc_loss_parallel": 0.0001989484007935971, + "grad_norm": 5.757742404937744, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8637542724609375, + "num_tokens": 234478402.0, + "step": 6146 + }, + { + "epoch": 0.7819615824958657, + "ewc_loss": 0.049453333020210266, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001991231692954898, + "grad_norm": 5.846784591674805, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8437526226043701, + "num_tokens": 234509699.0, + "step": 6147 + }, + { + "epoch": 0.7820887927744562, + "ewc_loss": 0.04958319664001465, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001992010948015377, + "grad_norm": 5.72546911239624, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8545750379562378, + "num_tokens": 234551515.0, + "step": 6148 + }, + { + "epoch": 0.7822160030530467, + "ewc_loss": 0.04944472014904022, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019903706561308354, + "grad_norm": 5.7893877029418945, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8669247627258301, + "num_tokens": 234582811.0, + "step": 6149 + }, + { + "epoch": 0.7823432133316371, + "ewc_loss": 0.04956931620836258, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019906232773791999, + "grad_norm": 5.7181501388549805, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8528648614883423, + "num_tokens": 234625299.0, + "step": 6150 + }, + { + "epoch": 0.7824704236102277, + "ewc_loss": 0.04958014190196991, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001991705794353038, + "grad_norm": 5.797979354858398, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8654258847236633, + "num_tokens": 234665641.0, + "step": 6151 + }, + { + "epoch": 0.7825976338888182, + "ewc_loss": 0.04957983270287514, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019916746532544494, + "grad_norm": 5.770951747894287, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8585508465766907, + "num_tokens": 234704668.0, + "step": 6152 + }, + { + "epoch": 0.7827248441674087, + "ewc_loss": 0.049588222056627274, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019925135711673647, + "grad_norm": 5.850970268249512, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.863868236541748, + "num_tokens": 234738080.0, + "step": 6153 + }, + { + "epoch": 0.7828520544459993, + "ewc_loss": 0.049385055899620056, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019844039343297482, + "grad_norm": 5.709423542022705, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.857573390007019, + "num_tokens": 234781077.0, + "step": 6154 + }, + { + "epoch": 0.7829792647245898, + "ewc_loss": 0.04942353069782257, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001988251315196976, + "grad_norm": 5.885097980499268, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8448137640953064, + "num_tokens": 234809374.0, + "step": 6155 + }, + { + "epoch": 0.7831064750031802, + "ewc_loss": 0.04945366457104683, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019912648713216186, + "grad_norm": 5.782740116119385, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8476691842079163, + "num_tokens": 234845585.0, + "step": 6156 + }, + { + "epoch": 0.7832336852817707, + "ewc_loss": 0.049391474574804306, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019850459648296237, + "grad_norm": 5.850323677062988, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8584331274032593, + "num_tokens": 234883891.0, + "step": 6157 + }, + { + "epoch": 0.7833608955603613, + "ewc_loss": 0.04940204322338104, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.000198610287043266, + "grad_norm": 5.719803333282471, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8592391014099121, + "num_tokens": 234919445.0, + "step": 6158 + }, + { + "epoch": 0.7834881058389518, + "ewc_loss": 0.0493680015206337, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019826984498649836, + "grad_norm": 5.761045932769775, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8621205687522888, + "num_tokens": 234951465.0, + "step": 6159 + }, + { + "epoch": 0.7836153161175423, + "ewc_loss": 0.04941662773489952, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019875611178576946, + "grad_norm": 5.791134834289551, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.850586473941803, + "num_tokens": 234989970.0, + "step": 6160 + }, + { + "epoch": 0.7837425263961328, + "ewc_loss": 0.04942924529314041, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019888230599462986, + "grad_norm": 5.757784843444824, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8598842620849609, + "num_tokens": 235029673.0, + "step": 6161 + }, + { + "epoch": 0.7838697366747234, + "ewc_loss": 0.049598328769207, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019935244927182794, + "grad_norm": 5.794801712036133, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.858018159866333, + "num_tokens": 235065389.0, + "step": 6162 + }, + { + "epoch": 0.7839969469533138, + "ewc_loss": 0.04955558106303215, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019892494310624897, + "grad_norm": 5.791952133178711, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8715450763702393, + "num_tokens": 235099795.0, + "step": 6163 + }, + { + "epoch": 0.7841241572319043, + "ewc_loss": 0.049615390598773956, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019952302682213485, + "grad_norm": 5.748002052307129, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8607140183448792, + "num_tokens": 235137475.0, + "step": 6164 + }, + { + "epoch": 0.7842513675104948, + "ewc_loss": 0.049597255885601044, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001993417099583894, + "grad_norm": 5.787231922149658, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8553314208984375, + "num_tokens": 235178454.0, + "step": 6165 + }, + { + "epoch": 0.7843785777890854, + "ewc_loss": 0.04959122836589813, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001992814359255135, + "grad_norm": 5.760319709777832, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8771919012069702, + "num_tokens": 235216139.0, + "step": 6166 + }, + { + "epoch": 0.7845057880676759, + "ewc_loss": 0.04959911108016968, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001993602782022208, + "grad_norm": 5.7705206871032715, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8743159770965576, + "num_tokens": 235253571.0, + "step": 6167 + }, + { + "epoch": 0.7846329983462664, + "ewc_loss": 0.049579884856939316, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019916798919439316, + "grad_norm": 5.750082969665527, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.844182550907135, + "num_tokens": 235293950.0, + "step": 6168 + }, + { + "epoch": 0.7847602086248568, + "ewc_loss": 0.04955541342496872, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019892325508408248, + "grad_norm": 5.6955742835998535, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8646810054779053, + "num_tokens": 235336419.0, + "step": 6169 + }, + { + "epoch": 0.7848874189034474, + "ewc_loss": 0.04959987476468086, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019936788885388523, + "grad_norm": 5.792867183685303, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8635022044181824, + "num_tokens": 235376039.0, + "step": 6170 + }, + { + "epoch": 0.7850146291820379, + "ewc_loss": 0.04958143085241318, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019918342877645046, + "grad_norm": 5.782337188720703, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8473808765411377, + "num_tokens": 235412736.0, + "step": 6171 + }, + { + "epoch": 0.7851418394606284, + "ewc_loss": 0.0495564229786396, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001989333686651662, + "grad_norm": 5.745736122131348, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8601073026657104, + "num_tokens": 235457318.0, + "step": 6172 + }, + { + "epoch": 0.785269049739219, + "ewc_loss": 0.04959584027528763, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019932756549678743, + "grad_norm": 5.825617790222168, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8491710424423218, + "num_tokens": 235491471.0, + "step": 6173 + }, + { + "epoch": 0.7853962600178095, + "ewc_loss": 0.049570538103580475, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019907450769096613, + "grad_norm": 5.763688564300537, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8705804347991943, + "num_tokens": 235531324.0, + "step": 6174 + }, + { + "epoch": 0.7855234702963999, + "ewc_loss": 0.049610767513513565, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019947680993936956, + "grad_norm": 5.794341087341309, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8483016490936279, + "num_tokens": 235573233.0, + "step": 6175 + }, + { + "epoch": 0.7856506805749904, + "ewc_loss": 0.04953991621732712, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001987682917388156, + "grad_norm": 5.734247207641602, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8613104820251465, + "num_tokens": 235614400.0, + "step": 6176 + }, + { + "epoch": 0.785777890853581, + "ewc_loss": 0.04965873062610626, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019995645561721176, + "grad_norm": 5.817732334136963, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8670696020126343, + "num_tokens": 235652247.0, + "step": 6177 + }, + { + "epoch": 0.7859051011321715, + "ewc_loss": 0.04960213601589203, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019939047342631966, + "grad_norm": 5.86673641204834, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8548216223716736, + "num_tokens": 235681810.0, + "step": 6178 + }, + { + "epoch": 0.786032311410762, + "ewc_loss": 0.04961400106549263, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019950915884692222, + "grad_norm": 5.8249406814575195, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.868475079536438, + "num_tokens": 235718422.0, + "step": 6179 + }, + { + "epoch": 0.7861595216893525, + "ewc_loss": 0.049610089510679245, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019947002874687314, + "grad_norm": 5.802591800689697, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8549665212631226, + "num_tokens": 235758886.0, + "step": 6180 + }, + { + "epoch": 0.786286731967943, + "ewc_loss": 0.04955201968550682, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019888933456968516, + "grad_norm": 6.062536239624023, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8631612062454224, + "num_tokens": 235792270.0, + "step": 6181 + }, + { + "epoch": 0.7864139422465335, + "ewc_loss": 0.04958386719226837, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019920778868254274, + "grad_norm": 5.7911529541015625, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8494083881378174, + "num_tokens": 235826678.0, + "step": 6182 + }, + { + "epoch": 0.786541152525124, + "ewc_loss": 0.049513332545757294, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019850244279950857, + "grad_norm": 5.867056846618652, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8714027404785156, + "num_tokens": 235860437.0, + "step": 6183 + }, + { + "epoch": 0.7866683628037145, + "ewc_loss": 0.04951213300228119, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019849046657327563, + "grad_norm": 5.718930721282959, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8651006817817688, + "num_tokens": 235900865.0, + "step": 6184 + }, + { + "epoch": 0.7867955730823051, + "ewc_loss": 0.049622394144535065, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019959309429395944, + "grad_norm": 5.8309125900268555, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8458865880966187, + "num_tokens": 235937419.0, + "step": 6185 + }, + { + "epoch": 0.7869227833608956, + "ewc_loss": 0.049604929983615875, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019941841310355812, + "grad_norm": 5.809840202331543, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8501977920532227, + "num_tokens": 235976425.0, + "step": 6186 + }, + { + "epoch": 0.787049993639486, + "ewc_loss": 0.049577001482248306, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019913916185032576, + "grad_norm": 5.826977252960205, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.854576587677002, + "num_tokens": 236012968.0, + "step": 6187 + }, + { + "epoch": 0.7871772039180766, + "ewc_loss": 0.04959698021411896, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019933893054258078, + "grad_norm": 5.778179168701172, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8607060313224792, + "num_tokens": 236052462.0, + "step": 6188 + }, + { + "epoch": 0.7873044141966671, + "ewc_loss": 0.04957643896341324, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019913354481104761, + "grad_norm": 5.836394309997559, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8744691014289856, + "num_tokens": 236087604.0, + "step": 6189 + }, + { + "epoch": 0.7874316244752576, + "ewc_loss": 0.049576111137866974, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019913024152629077, + "grad_norm": 5.747687339782715, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8591877818107605, + "num_tokens": 236126344.0, + "step": 6190 + }, + { + "epoch": 0.7875588347538481, + "ewc_loss": 0.04958726838231087, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001992418256122619, + "grad_norm": 5.78491735458374, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8677133917808533, + "num_tokens": 236161913.0, + "step": 6191 + }, + { + "epoch": 0.7876860450324387, + "ewc_loss": 0.04958633705973625, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019923251238651574, + "grad_norm": 5.812811851501465, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8626134395599365, + "num_tokens": 236200612.0, + "step": 6192 + }, + { + "epoch": 0.7878132553110291, + "ewc_loss": 0.049629513174295425, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019966426771134138, + "grad_norm": 5.800525188446045, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8657363057136536, + "num_tokens": 236238136.0, + "step": 6193 + }, + { + "epoch": 0.7879404655896196, + "ewc_loss": 0.04959501326084137, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001993192854570225, + "grad_norm": 5.779532432556152, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8570864796638489, + "num_tokens": 236281303.0, + "step": 6194 + }, + { + "epoch": 0.7880676758682101, + "ewc_loss": 0.04961124807596207, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019948161207139492, + "grad_norm": 5.819977283477783, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.856361985206604, + "num_tokens": 236322355.0, + "step": 6195 + }, + { + "epoch": 0.7881948861468007, + "ewc_loss": 0.049655646085739136, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019992562010884285, + "grad_norm": 6.065125942230225, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8583061695098877, + "num_tokens": 236354263.0, + "step": 6196 + }, + { + "epoch": 0.7883220964253912, + "ewc_loss": 0.04955030605196953, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019887220696546137, + "grad_norm": 5.7155866622924805, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8620782494544983, + "num_tokens": 236391581.0, + "step": 6197 + }, + { + "epoch": 0.7884493067039817, + "ewc_loss": 0.04954999312758446, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019886906375177205, + "grad_norm": 5.837117671966553, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8681542873382568, + "num_tokens": 236426003.0, + "step": 6198 + }, + { + "epoch": 0.7885765169825721, + "ewc_loss": 0.049552690237760544, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019889604300260544, + "grad_norm": 5.75696325302124, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.87027907371521, + "num_tokens": 236464874.0, + "step": 6199 + }, + { + "epoch": 0.7887037272611627, + "ewc_loss": 0.04960604012012482, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019942955987062305, + "grad_norm": 5.809906959533691, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8718897700309753, + "num_tokens": 236499826.0, + "step": 6200 + }, + { + "epoch": 0.7888309375397532, + "ewc_loss": 0.04960183426737785, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019938749028369784, + "grad_norm": 5.754176139831543, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8717832565307617, + "num_tokens": 236538615.0, + "step": 6201 + }, + { + "epoch": 0.7889581478183437, + "ewc_loss": 0.04960324615240097, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001994015765376389, + "grad_norm": 5.811027526855469, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8579553365707397, + "num_tokens": 236577860.0, + "step": 6202 + }, + { + "epoch": 0.7890853580969343, + "ewc_loss": 0.04965946078300476, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0001999637606786564, + "grad_norm": 5.7570929527282715, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8744301795959473, + "num_tokens": 236619041.0, + "step": 6203 + }, + { + "epoch": 0.7892125683755248, + "ewc_loss": 0.04963211715221405, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019969030108768493, + "grad_norm": 5.786820888519287, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8635623455047607, + "num_tokens": 236655788.0, + "step": 6204 + }, + { + "epoch": 0.7893397786541152, + "ewc_loss": 0.0496056042611599, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00019942517974413931, + "grad_norm": 5.791193962097168, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8721587657928467, + "num_tokens": 236691532.0, + "step": 6205 + }, + { + "epoch": 0.7894669889327057, + "ewc_loss": 0.04966341704130173, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002000033127842471, + "grad_norm": 5.773567199707031, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8669406175613403, + "num_tokens": 236733792.0, + "step": 6206 + }, + { + "epoch": 0.7895941992112963, + "ewc_loss": 0.04950687289237976, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001996585779124871, + "grad_norm": 5.751601219177246, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8726264834403992, + "num_tokens": 236775454.0, + "step": 6207 + }, + { + "epoch": 0.7897214094898868, + "ewc_loss": 0.04950131103396416, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019960294594056904, + "grad_norm": 5.789665222167969, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8702118396759033, + "num_tokens": 236811572.0, + "step": 6208 + }, + { + "epoch": 0.7898486197684773, + "ewc_loss": 0.0495123416185379, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019971323490608484, + "grad_norm": 5.77323055267334, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8453288078308105, + "num_tokens": 236851018.0, + "step": 6209 + }, + { + "epoch": 0.7899758300470678, + "ewc_loss": 0.04950542002916336, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001996440696530044, + "grad_norm": 5.788950443267822, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8645182847976685, + "num_tokens": 236891871.0, + "step": 6210 + }, + { + "epoch": 0.7901030403256584, + "ewc_loss": 0.04945986345410347, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001991884782910347, + "grad_norm": 5.767225742340088, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.863914966583252, + "num_tokens": 236926771.0, + "step": 6211 + }, + { + "epoch": 0.7902302506042488, + "ewc_loss": 0.04949178919196129, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019950773275922984, + "grad_norm": 5.783337593078613, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8484707474708557, + "num_tokens": 236964768.0, + "step": 6212 + }, + { + "epoch": 0.7903574608828393, + "ewc_loss": 0.049529075622558594, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001998806110350415, + "grad_norm": 5.765732288360596, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.848473072052002, + "num_tokens": 237005075.0, + "step": 6213 + }, + { + "epoch": 0.7904846711614298, + "ewc_loss": 0.049593180418014526, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020052162290085107, + "grad_norm": 5.823975563049316, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8622632026672363, + "num_tokens": 237035517.0, + "step": 6214 + }, + { + "epoch": 0.7906118814400204, + "ewc_loss": 0.04952993616461754, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00019988921121694148, + "grad_norm": 5.73301887512207, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8777147531509399, + "num_tokens": 237076319.0, + "step": 6215 + }, + { + "epoch": 0.7907390917186109, + "ewc_loss": 0.049618758261203766, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020077743101865053, + "grad_norm": 5.7821550369262695, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8515811562538147, + "num_tokens": 237117935.0, + "step": 6216 + }, + { + "epoch": 0.7908663019972014, + "ewc_loss": 0.04976142197847366, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002009833842748776, + "grad_norm": 5.809333801269531, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8694248795509338, + "num_tokens": 237155789.0, + "step": 6217 + }, + { + "epoch": 0.7909935122757918, + "ewc_loss": 0.049574561417102814, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020033544569741935, + "grad_norm": 5.719035625457764, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8666148781776428, + "num_tokens": 237198870.0, + "step": 6218 + }, + { + "epoch": 0.7911207225543824, + "ewc_loss": 0.04964572563767433, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020104710711166263, + "grad_norm": 5.8071417808532715, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8659976720809937, + "num_tokens": 237243084.0, + "step": 6219 + }, + { + "epoch": 0.7912479328329729, + "ewc_loss": 0.04962857812643051, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020087561279069632, + "grad_norm": 5.875252723693848, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.851524829864502, + "num_tokens": 237277245.0, + "step": 6220 + }, + { + "epoch": 0.7913751431115634, + "ewc_loss": 0.04951421171426773, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0001997319341171533, + "grad_norm": 5.725411415100098, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8649678826332092, + "num_tokens": 237317841.0, + "step": 6221 + }, + { + "epoch": 0.791502353390154, + "ewc_loss": 0.049741655588150024, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020078571105841547, + "grad_norm": 6.170654773712158, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8500803112983704, + "num_tokens": 237349327.0, + "step": 6222 + }, + { + "epoch": 0.7916295636687445, + "ewc_loss": 0.04959699511528015, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020055979257449508, + "grad_norm": 5.697540283203125, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8464322090148926, + "num_tokens": 237396179.0, + "step": 6223 + }, + { + "epoch": 0.7917567739473349, + "ewc_loss": 0.04962959140539169, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0002008857554756105, + "grad_norm": 5.807254314422607, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8569936156272888, + "num_tokens": 237437766.0, + "step": 6224 + }, + { + "epoch": 0.7918839842259254, + "ewc_loss": 0.049560707062482834, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0002001969114644453, + "grad_norm": 5.723509311676025, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8646631240844727, + "num_tokens": 237479239.0, + "step": 6225 + }, + { + "epoch": 0.792011194504516, + "ewc_loss": 0.04969914257526398, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020158124971203506, + "grad_norm": 5.878012657165527, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8748612403869629, + "num_tokens": 237514641.0, + "step": 6226 + }, + { + "epoch": 0.7921384047831065, + "ewc_loss": 0.049657274037599564, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020116259111091495, + "grad_norm": 5.788466453552246, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8714483380317688, + "num_tokens": 237555319.0, + "step": 6227 + }, + { + "epoch": 0.792265615061697, + "ewc_loss": 0.049649350345134735, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020108332682866603, + "grad_norm": 5.791869163513184, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8634403944015503, + "num_tokens": 237597786.0, + "step": 6228 + }, + { + "epoch": 0.7923928253402875, + "ewc_loss": 0.04964779317378998, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020106779993511736, + "grad_norm": 5.755417823791504, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8724820613861084, + "num_tokens": 237636584.0, + "step": 6229 + }, + { + "epoch": 0.792520035618878, + "ewc_loss": 0.049593567848205566, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020052552281413227, + "grad_norm": 5.812741279602051, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8526976108551025, + "num_tokens": 237676000.0, + "step": 6230 + }, + { + "epoch": 0.7926472458974685, + "ewc_loss": 0.049654632806777954, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020113617938477546, + "grad_norm": 5.765337944030762, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8870482444763184, + "num_tokens": 237713003.0, + "step": 6231 + }, + { + "epoch": 0.792774456176059, + "ewc_loss": 0.049595460295677185, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.00020054445485584438, + "grad_norm": 5.856625556945801, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8622924089431763, + "num_tokens": 237742054.0, + "step": 6232 + }, + { + "epoch": 0.7929016664546495, + "ewc_loss": 0.0497715100646019, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020108425815124065, + "grad_norm": 5.79070520401001, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.852942168712616, + "num_tokens": 237783147.0, + "step": 6233 + }, + { + "epoch": 0.7930288767332401, + "ewc_loss": 0.04974720627069473, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020084121206309646, + "grad_norm": 5.898509502410889, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8558267951011658, + "num_tokens": 237814322.0, + "step": 6234 + }, + { + "epoch": 0.7931560870118306, + "ewc_loss": 0.04972141981124878, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020058332302141935, + "grad_norm": 5.781369686126709, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8654838800430298, + "num_tokens": 237849278.0, + "step": 6235 + }, + { + "epoch": 0.793283297290421, + "ewc_loss": 0.049840040504932404, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020176956604700536, + "grad_norm": 5.82832145690918, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8644924163818359, + "num_tokens": 237893434.0, + "step": 6236 + }, + { + "epoch": 0.7934105075690115, + "ewc_loss": 0.04973653703927994, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020073450286872685, + "grad_norm": 5.760589122772217, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8680083751678467, + "num_tokens": 237932135.0, + "step": 6237 + }, + { + "epoch": 0.7935377178476021, + "ewc_loss": 0.04982335492968559, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020160268468316644, + "grad_norm": 5.807704925537109, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8563649654388428, + "num_tokens": 237966320.0, + "step": 6238 + }, + { + "epoch": 0.7936649281261926, + "ewc_loss": 0.04980845749378204, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020145373127888888, + "grad_norm": 5.806365489959717, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.865842342376709, + "num_tokens": 238006365.0, + "step": 6239 + }, + { + "epoch": 0.7937921384047831, + "ewc_loss": 0.04979810118675232, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020135015074629337, + "grad_norm": 5.826094150543213, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.857597827911377, + "num_tokens": 238039307.0, + "step": 6240 + }, + { + "epoch": 0.7939193486833737, + "ewc_loss": 0.04974278062582016, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020079694513697177, + "grad_norm": 5.812027454376221, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8467599153518677, + "num_tokens": 238078015.0, + "step": 6241 + }, + { + "epoch": 0.7940465589619641, + "ewc_loss": 0.049818601459264755, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002015551581280306, + "grad_norm": 5.816290378570557, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.858282208442688, + "num_tokens": 238116790.0, + "step": 6242 + }, + { + "epoch": 0.7941737692405546, + "ewc_loss": 0.049806904047727585, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020143818983342499, + "grad_norm": 5.792752265930176, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8668737411499023, + "num_tokens": 238155330.0, + "step": 6243 + }, + { + "epoch": 0.7943009795191451, + "ewc_loss": 0.04978935047984123, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002012626500800252, + "grad_norm": 5.7708821296691895, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8437125086784363, + "num_tokens": 238194492.0, + "step": 6244 + }, + { + "epoch": 0.7944281897977357, + "ewc_loss": 0.049786198884248734, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020123113063164055, + "grad_norm": 5.826375961303711, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8598480224609375, + "num_tokens": 238230451.0, + "step": 6245 + }, + { + "epoch": 0.7945554000763262, + "ewc_loss": 0.04977070540189743, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020107621094211936, + "grad_norm": 5.75172233581543, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8684451580047607, + "num_tokens": 238266331.0, + "step": 6246 + }, + { + "epoch": 0.7946826103549167, + "ewc_loss": 0.04982328414916992, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020160198619123548, + "grad_norm": 5.789897441864014, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8498673439025879, + "num_tokens": 238307753.0, + "step": 6247 + }, + { + "epoch": 0.7948098206335071, + "ewc_loss": 0.049778297543525696, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020115212828386575, + "grad_norm": 5.752130508422852, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8620607852935791, + "num_tokens": 238349644.0, + "step": 6248 + }, + { + "epoch": 0.7949370309120977, + "ewc_loss": 0.04981471598148346, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002015163190662861, + "grad_norm": 5.844882965087891, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8457530736923218, + "num_tokens": 238386308.0, + "step": 6249 + }, + { + "epoch": 0.7950642411906882, + "ewc_loss": 0.04979371279478073, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020130624761804938, + "grad_norm": 5.781267166137695, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8454570770263672, + "num_tokens": 238427935.0, + "step": 6250 + }, + { + "epoch": 0.7951914514692787, + "ewc_loss": 0.049847982823848724, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020184897584840655, + "grad_norm": 5.816259860992432, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8672751188278198, + "num_tokens": 238463639.0, + "step": 6251 + }, + { + "epoch": 0.7953186617478692, + "ewc_loss": 0.049846746027469635, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020183657761663198, + "grad_norm": 5.780463218688965, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8742983341217041, + "num_tokens": 238498295.0, + "step": 6252 + }, + { + "epoch": 0.7954458720264598, + "ewc_loss": 0.04991985857486725, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020256770949345082, + "grad_norm": 5.779026031494141, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8608635663986206, + "num_tokens": 238538109.0, + "step": 6253 + }, + { + "epoch": 0.7955730823050502, + "ewc_loss": 0.049848660826683044, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020185575704090297, + "grad_norm": 5.798985481262207, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8452237844467163, + "num_tokens": 238574415.0, + "step": 6254 + }, + { + "epoch": 0.7957002925836407, + "ewc_loss": 0.04990824684500694, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002024516143137589, + "grad_norm": 5.8135294914245605, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.863136351108551, + "num_tokens": 238616368.0, + "step": 6255 + }, + { + "epoch": 0.7958275028622313, + "ewc_loss": 0.049842167645692825, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.000201790826395154, + "grad_norm": 5.7909064292907715, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8646687865257263, + "num_tokens": 238650634.0, + "step": 6256 + }, + { + "epoch": 0.7959547131408218, + "ewc_loss": 0.049911752343177795, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020248665532562882, + "grad_norm": 5.835020542144775, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8552971482276917, + "num_tokens": 238684383.0, + "step": 6257 + }, + { + "epoch": 0.7960819234194123, + "ewc_loss": 0.04989928752183914, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020236200361978263, + "grad_norm": 5.834820747375488, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8573272228240967, + "num_tokens": 238722863.0, + "step": 6258 + }, + { + "epoch": 0.7962091336980028, + "ewc_loss": 0.049882061779499054, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020218973804730922, + "grad_norm": 5.779078006744385, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.861747145652771, + "num_tokens": 238765027.0, + "step": 6259 + }, + { + "epoch": 0.7963363439765934, + "ewc_loss": 0.049908943474292755, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020245857012923807, + "grad_norm": 5.859370708465576, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8657413125038147, + "num_tokens": 238801950.0, + "step": 6260 + }, + { + "epoch": 0.7964635542551838, + "ewc_loss": 0.049862854182720184, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020199768187012523, + "grad_norm": 5.764671325683594, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8519315719604492, + "num_tokens": 238840963.0, + "step": 6261 + }, + { + "epoch": 0.7965907645337743, + "ewc_loss": 0.04991951584815979, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002025643188972026, + "grad_norm": 5.871626377105713, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8464707732200623, + "num_tokens": 238876167.0, + "step": 6262 + }, + { + "epoch": 0.7967179748123648, + "ewc_loss": 0.04986016824841499, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020197081903461367, + "grad_norm": 5.812478542327881, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8658957481384277, + "num_tokens": 238910293.0, + "step": 6263 + }, + { + "epoch": 0.7968451850909554, + "ewc_loss": 0.04990815371274948, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020245066843926907, + "grad_norm": 5.815133571624756, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8591987490653992, + "num_tokens": 238947279.0, + "step": 6264 + }, + { + "epoch": 0.7969723953695459, + "ewc_loss": 0.049879346042871475, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020216259872540832, + "grad_norm": 5.8160200119018555, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8501335382461548, + "num_tokens": 238984148.0, + "step": 6265 + }, + { + "epoch": 0.7970996056481364, + "ewc_loss": 0.04995183274149895, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020288747327867895, + "grad_norm": 5.855471134185791, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8480136394500732, + "num_tokens": 239022370.0, + "step": 6266 + }, + { + "epoch": 0.7972268159267268, + "ewc_loss": 0.04986250773072243, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020199421851430088, + "grad_norm": 5.729781627655029, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8577485680580139, + "num_tokens": 239064542.0, + "step": 6267 + }, + { + "epoch": 0.7973540262053174, + "ewc_loss": 0.04993752762675285, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020274441340006888, + "grad_norm": 5.866304397583008, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8590946197509766, + "num_tokens": 239103193.0, + "step": 6268 + }, + { + "epoch": 0.7974812364839079, + "ewc_loss": 0.04991597682237625, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.000202528914087452, + "grad_norm": 5.8218302726745605, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8683934211730957, + "num_tokens": 239137952.0, + "step": 6269 + }, + { + "epoch": 0.7976084467624984, + "ewc_loss": 0.0498967282474041, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020233642135281116, + "grad_norm": 5.847850799560547, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8500688076019287, + "num_tokens": 239174600.0, + "step": 6270 + }, + { + "epoch": 0.797735657041089, + "ewc_loss": 0.04989529401063919, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020232210226822644, + "grad_norm": 5.825005054473877, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.86567223072052, + "num_tokens": 239206157.0, + "step": 6271 + }, + { + "epoch": 0.7978628673196795, + "ewc_loss": 0.04987776279449463, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020214678079355508, + "grad_norm": 5.832632541656494, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8649724721908569, + "num_tokens": 239243094.0, + "step": 6272 + }, + { + "epoch": 0.7979900775982699, + "ewc_loss": 0.04988361895084381, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020220535225234926, + "grad_norm": 5.77332067489624, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8612955808639526, + "num_tokens": 239281208.0, + "step": 6273 + }, + { + "epoch": 0.7981172878768604, + "ewc_loss": 0.049898870289325714, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020235782722011209, + "grad_norm": 5.8166937828063965, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8559794425964355, + "num_tokens": 239320159.0, + "step": 6274 + }, + { + "epoch": 0.798244498155451, + "ewc_loss": 0.04987074062228203, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020207655325066298, + "grad_norm": 5.812841415405273, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8477022647857666, + "num_tokens": 239360160.0, + "step": 6275 + }, + { + "epoch": 0.7983717084340415, + "ewc_loss": 0.04990755394101143, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002024446876021102, + "grad_norm": 5.798397064208984, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8631564378738403, + "num_tokens": 239398101.0, + "step": 6276 + }, + { + "epoch": 0.798498918712632, + "ewc_loss": 0.04989007115364075, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020226983178872615, + "grad_norm": 5.783544063568115, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8758065700531006, + "num_tokens": 239438560.0, + "step": 6277 + }, + { + "epoch": 0.7986261289912225, + "ewc_loss": 0.04993262141942978, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002026953297900036, + "grad_norm": 5.841394424438477, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8555725812911987, + "num_tokens": 239475851.0, + "step": 6278 + }, + { + "epoch": 0.798753339269813, + "ewc_loss": 0.04989796131849289, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020234873227309436, + "grad_norm": 5.758493423461914, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.865220308303833, + "num_tokens": 239519165.0, + "step": 6279 + }, + { + "epoch": 0.7988805495484035, + "ewc_loss": 0.04996073991060257, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020297656010370702, + "grad_norm": 5.857822895050049, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.848020076751709, + "num_tokens": 239560602.0, + "step": 6280 + }, + { + "epoch": 0.799007759826994, + "ewc_loss": 0.049939390271902084, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002027630398515612, + "grad_norm": 5.945505142211914, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8615726232528687, + "num_tokens": 239591933.0, + "step": 6281 + }, + { + "epoch": 0.7991349701055845, + "ewc_loss": 0.0498613603413105, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020198275160510093, + "grad_norm": 5.866646766662598, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8554778099060059, + "num_tokens": 239625495.0, + "step": 6282 + }, + { + "epoch": 0.7992621803841751, + "ewc_loss": 0.049876753240823746, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020213666721247137, + "grad_norm": 5.8111491203308105, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8762513399124146, + "num_tokens": 239658249.0, + "step": 6283 + }, + { + "epoch": 0.7993893906627656, + "ewc_loss": 0.04988684505224228, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020223759929649532, + "grad_norm": 5.882401466369629, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8522824048995972, + "num_tokens": 239690194.0, + "step": 6284 + }, + { + "epoch": 0.799516600941356, + "ewc_loss": 0.04979774355888367, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002013465709751472, + "grad_norm": 5.770538806915283, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.859825611114502, + "num_tokens": 239728315.0, + "step": 6285 + }, + { + "epoch": 0.7996438112199465, + "ewc_loss": 0.049851421266794205, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020188334747217596, + "grad_norm": 5.881950855255127, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8646821975708008, + "num_tokens": 239764505.0, + "step": 6286 + }, + { + "epoch": 0.7997710214985371, + "ewc_loss": 0.049862172454595566, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020199087157379836, + "grad_norm": 5.786133766174316, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8447994589805603, + "num_tokens": 239809461.0, + "step": 6287 + }, + { + "epoch": 0.7998982317771276, + "ewc_loss": 0.04983227327466011, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002016918733716011, + "grad_norm": 5.860171318054199, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8546726703643799, + "num_tokens": 239845232.0, + "step": 6288 + }, + { + "epoch": 0.8000254420557181, + "ewc_loss": 0.049895741045475006, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020232655515428632, + "grad_norm": 5.843822956085205, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8662117719650269, + "num_tokens": 239885489.0, + "step": 6289 + }, + { + "epoch": 0.8001526523343087, + "ewc_loss": 0.04983721673488617, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020174129167571664, + "grad_norm": 5.90762186050415, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8469494581222534, + "num_tokens": 239921159.0, + "step": 6290 + }, + { + "epoch": 0.8002798626128991, + "ewc_loss": 0.04984474927186966, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020181664149276912, + "grad_norm": 5.773940563201904, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8649579286575317, + "num_tokens": 239960879.0, + "step": 6291 + }, + { + "epoch": 0.8004070728914896, + "ewc_loss": 0.04983843117952347, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002017534279730171, + "grad_norm": 5.91312313079834, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8483741283416748, + "num_tokens": 239994302.0, + "step": 6292 + }, + { + "epoch": 0.8005342831700801, + "ewc_loss": 0.0497855469584465, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020122459682170302, + "grad_norm": 5.779489517211914, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8669810891151428, + "num_tokens": 240027759.0, + "step": 6293 + }, + { + "epoch": 0.8006614934486707, + "ewc_loss": 0.04982929304242134, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002016620710492134, + "grad_norm": 5.828035354614258, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8545091152191162, + "num_tokens": 240065049.0, + "step": 6294 + }, + { + "epoch": 0.8007887037272612, + "ewc_loss": 0.04983006417751312, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020166976901236922, + "grad_norm": 5.820285797119141, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8455033302307129, + "num_tokens": 240101453.0, + "step": 6295 + }, + { + "epoch": 0.8009159140058517, + "ewc_loss": 0.04984481632709503, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002018172963289544, + "grad_norm": 5.864853382110596, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.838432788848877, + "num_tokens": 240136831.0, + "step": 6296 + }, + { + "epoch": 0.8010431242844421, + "ewc_loss": 0.04981160908937454, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020148522162344307, + "grad_norm": 5.808244705200195, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8585726022720337, + "num_tokens": 240173364.0, + "step": 6297 + }, + { + "epoch": 0.8011703345630327, + "ewc_loss": 0.049864381551742554, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020201297593303025, + "grad_norm": 5.8975653648376465, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8375483751296997, + "num_tokens": 240206718.0, + "step": 6298 + }, + { + "epoch": 0.8012975448416232, + "ewc_loss": 0.049867548048496246, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020204462634865195, + "grad_norm": 5.7916035652160645, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8476815819740295, + "num_tokens": 240245633.0, + "step": 6299 + }, + { + "epoch": 0.8014247551202137, + "ewc_loss": 0.04987093061208725, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020207844499964267, + "grad_norm": 5.825313091278076, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8734797835350037, + "num_tokens": 240283887.0, + "step": 6300 + }, + { + "epoch": 0.8015519653988042, + "ewc_loss": 0.04990676790475845, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020243680046405643, + "grad_norm": 5.817006587982178, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8520216941833496, + "num_tokens": 240322208.0, + "step": 6301 + }, + { + "epoch": 0.8016791756773948, + "ewc_loss": 0.04986497759819031, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002020188985625282, + "grad_norm": 5.857888221740723, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8635424971580505, + "num_tokens": 240356154.0, + "step": 6302 + }, + { + "epoch": 0.8018063859559852, + "ewc_loss": 0.04988420009613037, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002022111148107797, + "grad_norm": 5.822906494140625, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8590555191040039, + "num_tokens": 240389103.0, + "step": 6303 + }, + { + "epoch": 0.8019335962345757, + "ewc_loss": 0.049899034202098846, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020235950069036335, + "grad_norm": 5.820005893707275, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.876687228679657, + "num_tokens": 240429545.0, + "step": 6304 + }, + { + "epoch": 0.8020608065131662, + "ewc_loss": 0.0499180406332016, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020254954870324582, + "grad_norm": 5.839121341705322, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8773406147956848, + "num_tokens": 240468556.0, + "step": 6305 + }, + { + "epoch": 0.8021880167917568, + "ewc_loss": 0.04992561414837837, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020262527687009424, + "grad_norm": 5.795605182647705, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8616043329238892, + "num_tokens": 240508534.0, + "step": 6306 + }, + { + "epoch": 0.8023152270703473, + "ewc_loss": 0.049886554479599, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002022346598096192, + "grad_norm": 5.856472492218018, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8599734902381897, + "num_tokens": 240540833.0, + "step": 6307 + }, + { + "epoch": 0.8024424373489378, + "ewc_loss": 0.04987644776701927, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020213362586218864, + "grad_norm": 5.821732044219971, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8511760234832764, + "num_tokens": 240577768.0, + "step": 6308 + }, + { + "epoch": 0.8025696476275284, + "ewc_loss": 0.0498887300491333, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002022564149228856, + "grad_norm": 5.855495929718018, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8554539084434509, + "num_tokens": 240610607.0, + "step": 6309 + }, + { + "epoch": 0.8026968579061188, + "ewc_loss": 0.049902528524398804, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020239442528691143, + "grad_norm": 5.838639736175537, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8569551110267639, + "num_tokens": 240646056.0, + "step": 6310 + }, + { + "epoch": 0.8028240681847093, + "ewc_loss": 0.049914032220840454, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020250948728062212, + "grad_norm": 5.790564060211182, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8668966293334961, + "num_tokens": 240682513.0, + "step": 6311 + }, + { + "epoch": 0.8029512784632998, + "ewc_loss": 0.04994864761829376, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020285560458432883, + "grad_norm": 5.869858264923096, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8603465557098389, + "num_tokens": 240716311.0, + "step": 6312 + }, + { + "epoch": 0.8030784887418904, + "ewc_loss": 0.04994766041636467, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.000202845738385804, + "grad_norm": 5.816349506378174, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8509673476219177, + "num_tokens": 240758873.0, + "step": 6313 + }, + { + "epoch": 0.8032056990204809, + "ewc_loss": 0.04988299310207367, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020219905127305537, + "grad_norm": 5.80321741104126, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8654334545135498, + "num_tokens": 240797163.0, + "step": 6314 + }, + { + "epoch": 0.8033329092990714, + "ewc_loss": 0.04995303973555565, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020289953681640327, + "grad_norm": 5.8115410804748535, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8700653314590454, + "num_tokens": 240835719.0, + "step": 6315 + }, + { + "epoch": 0.8034601195776618, + "ewc_loss": 0.04987026005983353, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020207175111863762, + "grad_norm": 5.824827671051025, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.852729320526123, + "num_tokens": 240874359.0, + "step": 6316 + }, + { + "epoch": 0.8035873298562524, + "ewc_loss": 0.049957506358623505, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020294421119615436, + "grad_norm": 5.838682174682617, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8733204007148743, + "num_tokens": 240909125.0, + "step": 6317 + }, + { + "epoch": 0.8037145401348429, + "ewc_loss": 0.049963004887104034, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002029991737799719, + "grad_norm": 5.787878036499023, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.863396167755127, + "num_tokens": 240949391.0, + "step": 6318 + }, + { + "epoch": 0.8038417504134334, + "ewc_loss": 0.04996538162231445, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020302295160945505, + "grad_norm": 5.834848880767822, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8501405119895935, + "num_tokens": 240987390.0, + "step": 6319 + }, + { + "epoch": 0.803968960692024, + "ewc_loss": 0.049949534237384796, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020286448125261813, + "grad_norm": 5.7669243812561035, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8528010845184326, + "num_tokens": 241025359.0, + "step": 6320 + }, + { + "epoch": 0.8040961709706145, + "ewc_loss": 0.049963951110839844, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020300867618061602, + "grad_norm": 5.874484062194824, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8549526333808899, + "num_tokens": 241060532.0, + "step": 6321 + }, + { + "epoch": 0.8042233812492049, + "ewc_loss": 0.04997248947620392, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020309403771534562, + "grad_norm": 5.762643814086914, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8660179376602173, + "num_tokens": 241098547.0, + "step": 6322 + }, + { + "epoch": 0.8043505915277954, + "ewc_loss": 0.049977272748947144, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002031418407568708, + "grad_norm": 5.867900371551514, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8593288660049438, + "num_tokens": 241131091.0, + "step": 6323 + }, + { + "epoch": 0.804477801806386, + "ewc_loss": 0.049956854432821274, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020293767738621682, + "grad_norm": 5.897330284118652, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.849001944065094, + "num_tokens": 241164495.0, + "step": 6324 + }, + { + "epoch": 0.8046050120849765, + "ewc_loss": 0.0499117411673069, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020248655346222222, + "grad_norm": 5.810727596282959, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8539772629737854, + "num_tokens": 241203777.0, + "step": 6325 + }, + { + "epoch": 0.804732222363567, + "ewc_loss": 0.04989723488688469, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020234148541931063, + "grad_norm": 5.789341926574707, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8533048033714294, + "num_tokens": 241243219.0, + "step": 6326 + }, + { + "epoch": 0.8048594326421575, + "ewc_loss": 0.04994753375649452, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020284448692109436, + "grad_norm": 5.827038764953613, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8438629508018494, + "num_tokens": 241284547.0, + "step": 6327 + }, + { + "epoch": 0.804986642920748, + "ewc_loss": 0.04990581423044205, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002024272980634123, + "grad_norm": 5.758523464202881, + "learning_rate": 1e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8422621488571167, + "num_tokens": 241331110.0, + "step": 6328 + }, + { + "epoch": 0.8051138531993385, + "ewc_loss": 0.04992084950208664, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020257761934772134, + "grad_norm": 5.816195964813232, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8599671125411987, + "num_tokens": 241372720.0, + "step": 6329 + }, + { + "epoch": 0.805241063477929, + "ewc_loss": 0.04996831715106964, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020305228827055544, + "grad_norm": 5.799166202545166, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8612821102142334, + "num_tokens": 241408173.0, + "step": 6330 + }, + { + "epoch": 0.8053682737565195, + "ewc_loss": 0.049956247210502625, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020293159468565136, + "grad_norm": 5.835515975952148, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8659014105796814, + "num_tokens": 241445021.0, + "step": 6331 + }, + { + "epoch": 0.8054954840351101, + "ewc_loss": 0.04996415972709656, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020301074255257845, + "grad_norm": 5.8396782875061035, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8646755814552307, + "num_tokens": 241481874.0, + "step": 6332 + }, + { + "epoch": 0.8056226943137006, + "ewc_loss": 0.0499534085392952, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020290320389904082, + "grad_norm": 5.806443214416504, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.843704342842102, + "num_tokens": 241523140.0, + "step": 6333 + }, + { + "epoch": 0.805749904592291, + "ewc_loss": 0.049993351101875305, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020330262486822903, + "grad_norm": 5.861817359924316, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8403528332710266, + "num_tokens": 241563424.0, + "step": 6334 + }, + { + "epoch": 0.8058771148708815, + "ewc_loss": 0.049916282296180725, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020253196998964995, + "grad_norm": 5.811242580413818, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8723485469818115, + "num_tokens": 241600004.0, + "step": 6335 + }, + { + "epoch": 0.8060043251494721, + "ewc_loss": 0.049945563077926636, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020282478362787515, + "grad_norm": 5.801023960113525, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8574625849723816, + "num_tokens": 241642704.0, + "step": 6336 + }, + { + "epoch": 0.8061315354280626, + "ewc_loss": 0.04990135133266449, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020238265278749168, + "grad_norm": 5.80905818939209, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8638046383857727, + "num_tokens": 241679176.0, + "step": 6337 + }, + { + "epoch": 0.8062587457066531, + "ewc_loss": 0.049994420260190964, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020331334962975234, + "grad_norm": 5.780926704406738, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8630918264389038, + "num_tokens": 241721168.0, + "step": 6338 + }, + { + "epoch": 0.8063859559852437, + "ewc_loss": 0.04998687654733658, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002032379270531237, + "grad_norm": 5.820900917053223, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8736850023269653, + "num_tokens": 241759735.0, + "step": 6339 + }, + { + "epoch": 0.8065131662638341, + "ewc_loss": 0.0499783530831337, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020315265282988548, + "grad_norm": 5.818289756774902, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.860092282295227, + "num_tokens": 241799445.0, + "step": 6340 + }, + { + "epoch": 0.8066403765424246, + "ewc_loss": 0.04997435212135315, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020311263506300747, + "grad_norm": 5.802979469299316, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8641433715820312, + "num_tokens": 241839333.0, + "step": 6341 + }, + { + "epoch": 0.8067675868210151, + "ewc_loss": 0.04997958242893219, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020316496375016868, + "grad_norm": 5.799655437469482, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8480905294418335, + "num_tokens": 241880248.0, + "step": 6342 + }, + { + "epoch": 0.8068947970996057, + "ewc_loss": 0.05001949891448021, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020356412278488278, + "grad_norm": 5.766317844390869, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8726632595062256, + "num_tokens": 241926717.0, + "step": 6343 + }, + { + "epoch": 0.8070220073781962, + "ewc_loss": 0.04998455569148064, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020321470219641924, + "grad_norm": 5.856834888458252, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8503934741020203, + "num_tokens": 241966080.0, + "step": 6344 + }, + { + "epoch": 0.8071492176567867, + "ewc_loss": 0.05007093399763107, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002040784602286294, + "grad_norm": 5.845482349395752, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8560031652450562, + "num_tokens": 242011975.0, + "step": 6345 + }, + { + "epoch": 0.8072764279353771, + "ewc_loss": 0.050015609711408615, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002035252400673926, + "grad_norm": 5.82163143157959, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8724285960197449, + "num_tokens": 242046320.0, + "step": 6346 + }, + { + "epoch": 0.8074036382139677, + "ewc_loss": 0.05003015697002411, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020367071556393057, + "grad_norm": 5.799747467041016, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8615807294845581, + "num_tokens": 242085157.0, + "step": 6347 + }, + { + "epoch": 0.8075308484925582, + "ewc_loss": 0.050010815262794495, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020347732061054558, + "grad_norm": 5.922307968139648, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8562248945236206, + "num_tokens": 242116548.0, + "step": 6348 + }, + { + "epoch": 0.8076580587711487, + "ewc_loss": 0.05003349483013153, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020370408310554922, + "grad_norm": 5.807366371154785, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8654263615608215, + "num_tokens": 242151883.0, + "step": 6349 + }, + { + "epoch": 0.8077852690497392, + "ewc_loss": 0.04997311532497406, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020310029503889382, + "grad_norm": 5.835303783416748, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.856023371219635, + "num_tokens": 242190486.0, + "step": 6350 + }, + { + "epoch": 0.8079124793283298, + "ewc_loss": 0.05000096932053566, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020337883324828, + "grad_norm": 5.7857584953308105, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8633051514625549, + "num_tokens": 242232720.0, + "step": 6351 + }, + { + "epoch": 0.8080396896069202, + "ewc_loss": 0.050120435655117035, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020457350183278322, + "grad_norm": 5.811532020568848, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8610653877258301, + "num_tokens": 242273926.0, + "step": 6352 + }, + { + "epoch": 0.8081668998855107, + "ewc_loss": 0.05003248527646065, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020369398407638073, + "grad_norm": 5.778215408325195, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8499565124511719, + "num_tokens": 242313484.0, + "step": 6353 + }, + { + "epoch": 0.8082941101641012, + "ewc_loss": 0.05010373145341873, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020440647494979203, + "grad_norm": 5.827158451080322, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8539828658103943, + "num_tokens": 242352101.0, + "step": 6354 + }, + { + "epoch": 0.8084213204426918, + "ewc_loss": 0.050064150243997574, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020401063375175, + "grad_norm": 5.7842888832092285, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8754031658172607, + "num_tokens": 242392958.0, + "step": 6355 + }, + { + "epoch": 0.8085485307212823, + "ewc_loss": 0.05008542165160179, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020422335364855826, + "grad_norm": 5.80947732925415, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8605926036834717, + "num_tokens": 242432327.0, + "step": 6356 + }, + { + "epoch": 0.8086757409998728, + "ewc_loss": 0.050105683505535126, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020442598906811327, + "grad_norm": 5.839242935180664, + "learning_rate": 1e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8388268947601318, + "num_tokens": 242473714.0, + "step": 6357 + }, + { + "epoch": 0.8088029512784632, + "ewc_loss": 0.05009806156158447, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020434973703231663, + "grad_norm": 5.852303504943848, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8668587803840637, + "num_tokens": 242513179.0, + "step": 6358 + }, + { + "epoch": 0.8089301615570538, + "ewc_loss": 0.050113238394260406, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020450149895623326, + "grad_norm": 5.856434345245361, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8591183423995972, + "num_tokens": 242549398.0, + "step": 6359 + }, + { + "epoch": 0.8090573718356443, + "ewc_loss": 0.050046905875205994, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002038382226601243, + "grad_norm": 5.824342727661133, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8548925518989563, + "num_tokens": 242583705.0, + "step": 6360 + }, + { + "epoch": 0.8091845821142348, + "ewc_loss": 0.050062745809555054, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020399660570546985, + "grad_norm": 5.803397178649902, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8643523454666138, + "num_tokens": 242625900.0, + "step": 6361 + }, + { + "epoch": 0.8093117923928254, + "ewc_loss": 0.05005700886249542, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020393924205563962, + "grad_norm": 5.823454856872559, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8489178419113159, + "num_tokens": 242665766.0, + "step": 6362 + }, + { + "epoch": 0.8094390026714159, + "ewc_loss": 0.05006715655326843, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002040406980086118, + "grad_norm": 5.811117649078369, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8624966740608215, + "num_tokens": 242707634.0, + "step": 6363 + }, + { + "epoch": 0.8095662129500064, + "ewc_loss": 0.05008038133382797, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020417297491803765, + "grad_norm": 5.801334381103516, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.856231153011322, + "num_tokens": 242746791.0, + "step": 6364 + }, + { + "epoch": 0.8096934232285968, + "ewc_loss": 0.05005403608083725, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020390949794091284, + "grad_norm": 5.779849052429199, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8715808987617493, + "num_tokens": 242784861.0, + "step": 6365 + }, + { + "epoch": 0.8098206335071874, + "ewc_loss": 0.05010830983519554, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020445224072318524, + "grad_norm": 5.8475823402404785, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8736865520477295, + "num_tokens": 242817612.0, + "step": 6366 + }, + { + "epoch": 0.8099478437857779, + "ewc_loss": 0.0500841923058033, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002042110572801903, + "grad_norm": 5.811196327209473, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8666480779647827, + "num_tokens": 242854951.0, + "step": 6367 + }, + { + "epoch": 0.8100750540643684, + "ewc_loss": 0.05010132119059563, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002043823478743434, + "grad_norm": 5.807505130767822, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8745532631874084, + "num_tokens": 242894166.0, + "step": 6368 + }, + { + "epoch": 0.8102022643429589, + "ewc_loss": 0.05014347285032272, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020480388775467873, + "grad_norm": 5.777793884277344, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8656255602836609, + "num_tokens": 242936230.0, + "step": 6369 + }, + { + "epoch": 0.8103294746215495, + "ewc_loss": 0.050157055258750916, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020493967167567462, + "grad_norm": 5.867280960083008, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8709017634391785, + "num_tokens": 242973148.0, + "step": 6370 + }, + { + "epoch": 0.8104566849001399, + "ewc_loss": 0.05016320198774338, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020500115351751447, + "grad_norm": 5.828632354736328, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8607438802719116, + "num_tokens": 243013029.0, + "step": 6371 + }, + { + "epoch": 0.8105838951787304, + "ewc_loss": 0.050124406814575195, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020461321400944144, + "grad_norm": 5.835123062133789, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8593690395355225, + "num_tokens": 243054847.0, + "step": 6372 + }, + { + "epoch": 0.810711105457321, + "ewc_loss": 0.05013471841812134, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020471634343266487, + "grad_norm": 5.802859783172607, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8532652258872986, + "num_tokens": 243096524.0, + "step": 6373 + }, + { + "epoch": 0.8108383157359115, + "ewc_loss": 0.05015316605567932, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020490080351009965, + "grad_norm": 5.8712077140808105, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.851456880569458, + "num_tokens": 243132397.0, + "step": 6374 + }, + { + "epoch": 0.810965526014502, + "ewc_loss": 0.05015658587217331, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020493498595897108, + "grad_norm": 5.8238525390625, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.847625732421875, + "num_tokens": 243171790.0, + "step": 6375 + }, + { + "epoch": 0.8110927362930925, + "ewc_loss": 0.050178635865449905, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020515549113042653, + "grad_norm": 5.8569512367248535, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8583767414093018, + "num_tokens": 243208217.0, + "step": 6376 + }, + { + "epoch": 0.811219946571683, + "ewc_loss": 0.050155725330114365, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020492638577707112, + "grad_norm": 5.834856033325195, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8651124238967896, + "num_tokens": 243245260.0, + "step": 6377 + }, + { + "epoch": 0.8113471568502735, + "ewc_loss": 0.05018175393342972, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020518667588476092, + "grad_norm": 5.826956748962402, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8681971430778503, + "num_tokens": 243283924.0, + "step": 6378 + }, + { + "epoch": 0.811474367128864, + "ewc_loss": 0.0501539446413517, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020490858878474683, + "grad_norm": 5.911384105682373, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8677971363067627, + "num_tokens": 243321977.0, + "step": 6379 + }, + { + "epoch": 0.8116015774074545, + "ewc_loss": 0.05009690672159195, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002043381828116253, + "grad_norm": 5.817835330963135, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8568156957626343, + "num_tokens": 243359349.0, + "step": 6380 + }, + { + "epoch": 0.8117287876860451, + "ewc_loss": 0.05018801614642143, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002052493073279038, + "grad_norm": 5.8595404624938965, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8626934289932251, + "num_tokens": 243395722.0, + "step": 6381 + }, + { + "epoch": 0.8118559979646356, + "ewc_loss": 0.050144825130701065, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020481739193201065, + "grad_norm": 5.8869428634643555, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8485960960388184, + "num_tokens": 243429022.0, + "step": 6382 + }, + { + "epoch": 0.811983208243226, + "ewc_loss": 0.05016952008008957, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002050643233815208, + "grad_norm": 5.861352443695068, + "learning_rate": 1e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8390102386474609, + "num_tokens": 243468474.0, + "step": 6383 + }, + { + "epoch": 0.8121104185218165, + "ewc_loss": 0.05014606565237045, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020482980471570045, + "grad_norm": 5.809144496917725, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8754764199256897, + "num_tokens": 243503045.0, + "step": 6384 + }, + { + "epoch": 0.8122376288004071, + "ewc_loss": 0.05024437606334686, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.000205812873900868, + "grad_norm": 5.843465805053711, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.849812388420105, + "num_tokens": 243543865.0, + "step": 6385 + }, + { + "epoch": 0.8123648390789976, + "ewc_loss": 0.05017866939306259, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.000205155840376392, + "grad_norm": 5.833319187164307, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8525083065032959, + "num_tokens": 243581258.0, + "step": 6386 + }, + { + "epoch": 0.8124920493575881, + "ewc_loss": 0.05024338141083717, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020580296404659748, + "grad_norm": 5.817591190338135, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8529542088508606, + "num_tokens": 243622777.0, + "step": 6387 + }, + { + "epoch": 0.8126192596361786, + "ewc_loss": 0.05020323395729065, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002054015058092773, + "grad_norm": 5.85614538192749, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8470491170883179, + "num_tokens": 243664549.0, + "step": 6388 + }, + { + "epoch": 0.8127464699147691, + "ewc_loss": 0.050185780972242355, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020522695558611304, + "grad_norm": 5.768757343292236, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8539963364601135, + "num_tokens": 243710408.0, + "step": 6389 + }, + { + "epoch": 0.8128736801933596, + "ewc_loss": 0.0502246730029583, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020561587007250637, + "grad_norm": 5.818673610687256, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8606741428375244, + "num_tokens": 243752598.0, + "step": 6390 + }, + { + "epoch": 0.8130008904719501, + "ewc_loss": 0.05027839541435242, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020615306857507676, + "grad_norm": 5.849460124969482, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8569092750549316, + "num_tokens": 243789805.0, + "step": 6391 + }, + { + "epoch": 0.8131281007505406, + "ewc_loss": 0.05026458948850632, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020601501455530524, + "grad_norm": 5.913637638092041, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8695993423461914, + "num_tokens": 243824124.0, + "step": 6392 + }, + { + "epoch": 0.8132553110291312, + "ewc_loss": 0.05018497630953789, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020521890837699175, + "grad_norm": 5.8366289138793945, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8589775562286377, + "num_tokens": 243863448.0, + "step": 6393 + }, + { + "epoch": 0.8133825213077217, + "ewc_loss": 0.05015923082828522, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020496144134085625, + "grad_norm": 5.820528507232666, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.863051176071167, + "num_tokens": 243902085.0, + "step": 6394 + }, + { + "epoch": 0.8135097315863121, + "ewc_loss": 0.050168149173259735, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020505065913312137, + "grad_norm": 5.790113925933838, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8544904589653015, + "num_tokens": 243945349.0, + "step": 6395 + }, + { + "epoch": 0.8136369418649027, + "ewc_loss": 0.05018005520105362, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002051696937996894, + "grad_norm": 5.941400051116943, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8568636178970337, + "num_tokens": 243982221.0, + "step": 6396 + }, + { + "epoch": 0.8137641521434932, + "ewc_loss": 0.05012627691030502, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002046319132205099, + "grad_norm": 5.866145610809326, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8481361865997314, + "num_tokens": 244012576.0, + "step": 6397 + }, + { + "epoch": 0.8138913624220837, + "ewc_loss": 0.05013479292392731, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002047170710284263, + "grad_norm": 5.827733993530273, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8671257495880127, + "num_tokens": 244052089.0, + "step": 6398 + }, + { + "epoch": 0.8140185727006742, + "ewc_loss": 0.05011744052171707, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020454353943932801, + "grad_norm": 5.9793901443481445, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8615735769271851, + "num_tokens": 244088712.0, + "step": 6399 + }, + { + "epoch": 0.8141457829792648, + "ewc_loss": 0.05012470483779907, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002046161680482328, + "grad_norm": 5.780511379241943, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8619537353515625, + "num_tokens": 244127655.0, + "step": 6400 + }, + { + "epoch": 0.8142729932578552, + "ewc_loss": 0.05015762150287628, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020494534692261368, + "grad_norm": 5.89194917678833, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8653546571731567, + "num_tokens": 244158588.0, + "step": 6401 + }, + { + "epoch": 0.8144002035364457, + "ewc_loss": 0.050165578722953796, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002050249167950824, + "grad_norm": 5.79600715637207, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8786658048629761, + "num_tokens": 244193872.0, + "step": 6402 + }, + { + "epoch": 0.8145274138150362, + "ewc_loss": 0.050212662667036057, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020549575856421143, + "grad_norm": 5.873210906982422, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8438171148300171, + "num_tokens": 244234055.0, + "step": 6403 + }, + { + "epoch": 0.8146546240936268, + "ewc_loss": 0.05022234842181206, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020559263066388667, + "grad_norm": 5.9556169509887695, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8593964576721191, + "num_tokens": 244275163.0, + "step": 6404 + }, + { + "epoch": 0.8147818343722173, + "ewc_loss": 0.05012126639485359, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020458181097637862, + "grad_norm": 5.8118720054626465, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.856266438961029, + "num_tokens": 244315241.0, + "step": 6405 + }, + { + "epoch": 0.8149090446508078, + "ewc_loss": 0.05022396519780159, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020560878328979015, + "grad_norm": 5.851497173309326, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8742972016334534, + "num_tokens": 244351947.0, + "step": 6406 + }, + { + "epoch": 0.8150362549293982, + "ewc_loss": 0.050160132348537445, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020497046352829784, + "grad_norm": 5.852920055389404, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8571865558624268, + "num_tokens": 244387706.0, + "step": 6407 + }, + { + "epoch": 0.8151634652079888, + "ewc_loss": 0.05018450692296028, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020521420810837299, + "grad_norm": 5.838935852050781, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8668708801269531, + "num_tokens": 244428447.0, + "step": 6408 + }, + { + "epoch": 0.8152906754865793, + "ewc_loss": 0.05023956298828125, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020576477982103825, + "grad_norm": 5.848079681396484, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8448684215545654, + "num_tokens": 244471191.0, + "step": 6409 + }, + { + "epoch": 0.8154178857651698, + "ewc_loss": 0.050250180065631866, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020587096514645964, + "grad_norm": 5.8814921379089355, + "learning_rate": 1e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8420050144195557, + "num_tokens": 244509198.0, + "step": 6410 + }, + { + "epoch": 0.8155450960437604, + "ewc_loss": 0.05023376643657684, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020570680499076843, + "grad_norm": 5.843181610107422, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8502269983291626, + "num_tokens": 244549677.0, + "step": 6411 + }, + { + "epoch": 0.8156723063223509, + "ewc_loss": 0.050260983407497406, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020597896946128458, + "grad_norm": 6.007164001464844, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8469792604446411, + "num_tokens": 244583586.0, + "step": 6412 + }, + { + "epoch": 0.8157995166009414, + "ewc_loss": 0.05029632896184921, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002063324354821816, + "grad_norm": 5.852748394012451, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8681415319442749, + "num_tokens": 244621220.0, + "step": 6413 + }, + { + "epoch": 0.8159267268795318, + "ewc_loss": 0.050175461918115616, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020512375340331346, + "grad_norm": 5.865638732910156, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8673374652862549, + "num_tokens": 244655941.0, + "step": 6414 + }, + { + "epoch": 0.8160539371581224, + "ewc_loss": 0.05019508674740791, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002053200005320832, + "grad_norm": 5.845112323760986, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8764618039131165, + "num_tokens": 244685807.0, + "step": 6415 + }, + { + "epoch": 0.8161811474367129, + "ewc_loss": 0.05017879977822304, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020515713549684733, + "grad_norm": 5.836159706115723, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8582189083099365, + "num_tokens": 244726866.0, + "step": 6416 + }, + { + "epoch": 0.8163083577153034, + "ewc_loss": 0.05014926940202713, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002048618480330333, + "grad_norm": 5.808706760406494, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8513823747634888, + "num_tokens": 244767273.0, + "step": 6417 + }, + { + "epoch": 0.8164355679938939, + "ewc_loss": 0.050239771604537964, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020576683164108545, + "grad_norm": 5.818287372589111, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8560009598731995, + "num_tokens": 244806111.0, + "step": 6418 + }, + { + "epoch": 0.8165627782724845, + "ewc_loss": 0.050197940319776535, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020534855138976127, + "grad_norm": 5.868757247924805, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8700538873672485, + "num_tokens": 244841706.0, + "step": 6419 + }, + { + "epoch": 0.8166899885510749, + "ewc_loss": 0.05022595077753067, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020562863210216165, + "grad_norm": 5.778217315673828, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8625375628471375, + "num_tokens": 244883908.0, + "step": 6420 + }, + { + "epoch": 0.8168171988296654, + "ewc_loss": 0.05025123804807663, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020588150073308498, + "grad_norm": 5.903275489807129, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8512306809425354, + "num_tokens": 244919090.0, + "step": 6421 + }, + { + "epoch": 0.8169444091082559, + "ewc_loss": 0.05020332708954811, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020540240802802145, + "grad_norm": 5.749425888061523, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8738681674003601, + "num_tokens": 244961141.0, + "step": 6422 + }, + { + "epoch": 0.8170716193868465, + "ewc_loss": 0.05028088390827179, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002061779669020325, + "grad_norm": 5.858943939208984, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8439398407936096, + "num_tokens": 245004503.0, + "step": 6423 + }, + { + "epoch": 0.817198829665437, + "ewc_loss": 0.05028501898050308, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020621930889319628, + "grad_norm": 5.855978965759277, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8578366637229919, + "num_tokens": 245045042.0, + "step": 6424 + }, + { + "epoch": 0.8173260399440275, + "ewc_loss": 0.05031033605337143, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002064724831143394, + "grad_norm": 5.918828010559082, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8581751585006714, + "num_tokens": 245085416.0, + "step": 6425 + }, + { + "epoch": 0.8174532502226179, + "ewc_loss": 0.050220146775245667, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020557062816806138, + "grad_norm": 5.880074501037598, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8485139608383179, + "num_tokens": 245116137.0, + "step": 6426 + }, + { + "epoch": 0.8175804605012085, + "ewc_loss": 0.05025862902402878, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020595540991052985, + "grad_norm": 5.8259124755859375, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8714488744735718, + "num_tokens": 245159181.0, + "step": 6427 + }, + { + "epoch": 0.817707670779799, + "ewc_loss": 0.05026698112487793, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020603893790394068, + "grad_norm": 5.842917442321777, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8524660468101501, + "num_tokens": 245205199.0, + "step": 6428 + }, + { + "epoch": 0.8178348810583895, + "ewc_loss": 0.05024110525846481, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020578020485118032, + "grad_norm": 5.832019329071045, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8636558651924133, + "num_tokens": 245242455.0, + "step": 6429 + }, + { + "epoch": 0.8179620913369801, + "ewc_loss": 0.05024005472660065, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020576969836838543, + "grad_norm": 5.817442417144775, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8796383142471313, + "num_tokens": 245280675.0, + "step": 6430 + }, + { + "epoch": 0.8180893016155706, + "ewc_loss": 0.050293296575546265, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020630212384276092, + "grad_norm": 5.855441570281982, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.874559760093689, + "num_tokens": 245316947.0, + "step": 6431 + }, + { + "epoch": 0.818216511894161, + "ewc_loss": 0.05027637630701065, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020613288506865501, + "grad_norm": 5.858453750610352, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8633557558059692, + "num_tokens": 245355145.0, + "step": 6432 + }, + { + "epoch": 0.8183437221727515, + "ewc_loss": 0.050337426364421844, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002067433815682307, + "grad_norm": 5.90402889251709, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8712673187255859, + "num_tokens": 245390282.0, + "step": 6433 + }, + { + "epoch": 0.8184709324513421, + "ewc_loss": 0.05023102089762688, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002056793455267325, + "grad_norm": 5.856194496154785, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8686968684196472, + "num_tokens": 245430262.0, + "step": 6434 + }, + { + "epoch": 0.8185981427299326, + "ewc_loss": 0.050269462168216705, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020606374891940504, + "grad_norm": 5.8538384437561035, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8568048477172852, + "num_tokens": 245470874.0, + "step": 6435 + }, + { + "epoch": 0.8187253530085231, + "ewc_loss": 0.05025339126586914, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020590306667145342, + "grad_norm": 5.890473365783691, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8653901815414429, + "num_tokens": 245506626.0, + "step": 6436 + }, + { + "epoch": 0.8188525632871136, + "ewc_loss": 0.05023973807692528, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020576652605086565, + "grad_norm": 5.892891883850098, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8432767391204834, + "num_tokens": 245546794.0, + "step": 6437 + }, + { + "epoch": 0.8189797735657041, + "ewc_loss": 0.05027836933732033, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002061528357444331, + "grad_norm": 5.840488910675049, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8629835247993469, + "num_tokens": 245589544.0, + "step": 6438 + }, + { + "epoch": 0.8191069838442946, + "ewc_loss": 0.05035039782524109, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020687314099632204, + "grad_norm": 5.89882230758667, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8752257823944092, + "num_tokens": 245628889.0, + "step": 6439 + }, + { + "epoch": 0.8192341941228851, + "ewc_loss": 0.05027414858341217, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020611060608644038, + "grad_norm": 5.861750602722168, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.856239914894104, + "num_tokens": 245674615.0, + "step": 6440 + }, + { + "epoch": 0.8193614044014756, + "ewc_loss": 0.050259582698345184, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002059649705188349, + "grad_norm": 5.8676862716674805, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8589733242988586, + "num_tokens": 245713966.0, + "step": 6441 + }, + { + "epoch": 0.8194886146800662, + "ewc_loss": 0.05031180381774902, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002064871951006353, + "grad_norm": 5.928990840911865, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8643191456794739, + "num_tokens": 245750035.0, + "step": 6442 + }, + { + "epoch": 0.8196158249586567, + "ewc_loss": 0.05024197697639465, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020578889234457165, + "grad_norm": 5.843730926513672, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8750243782997131, + "num_tokens": 245788506.0, + "step": 6443 + }, + { + "epoch": 0.8197430352372471, + "ewc_loss": 0.05025004595518112, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020586959726642817, + "grad_norm": 5.883574485778809, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8624967336654663, + "num_tokens": 245826547.0, + "step": 6444 + }, + { + "epoch": 0.8198702455158376, + "ewc_loss": 0.050201427191495895, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002053834032267332, + "grad_norm": 5.8165130615234375, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8494365811347961, + "num_tokens": 245874363.0, + "step": 6445 + }, + { + "epoch": 0.8199974557944282, + "ewc_loss": 0.050364963710308075, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020701877656392753, + "grad_norm": 5.915175437927246, + "learning_rate": 1e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8419424295425415, + "num_tokens": 245917813.0, + "step": 6446 + }, + { + "epoch": 0.8201246660730187, + "ewc_loss": 0.05024147406220436, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020578387193381786, + "grad_norm": 5.911834716796875, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8557649850845337, + "num_tokens": 245952177.0, + "step": 6447 + }, + { + "epoch": 0.8202518763516092, + "ewc_loss": 0.05031230300664902, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020649215730372816, + "grad_norm": 5.8874311447143555, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8623991012573242, + "num_tokens": 245992897.0, + "step": 6448 + }, + { + "epoch": 0.8203790866301998, + "ewc_loss": 0.050260256975889206, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020597170805558562, + "grad_norm": 5.878222465515137, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8676234483718872, + "num_tokens": 246029657.0, + "step": 6449 + }, + { + "epoch": 0.8205062969087902, + "ewc_loss": 0.05029606819152832, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020632984524127096, + "grad_norm": 5.8495354652404785, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8506996631622314, + "num_tokens": 246072425.0, + "step": 6450 + }, + { + "epoch": 0.8206335071873807, + "ewc_loss": 0.05028794705867767, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020624861645046622, + "grad_norm": 5.820713996887207, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8773007392883301, + "num_tokens": 246113131.0, + "step": 6451 + }, + { + "epoch": 0.8207607174659712, + "ewc_loss": 0.05039391294121742, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020730827236548066, + "grad_norm": 5.9216203689575195, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8628019094467163, + "num_tokens": 246154548.0, + "step": 6452 + }, + { + "epoch": 0.8208879277445618, + "ewc_loss": 0.0502813346683979, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020618249254766852, + "grad_norm": 5.893850803375244, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8568686246871948, + "num_tokens": 246189518.0, + "step": 6453 + }, + { + "epoch": 0.8210151380231523, + "ewc_loss": 0.05030912905931473, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002064604195766151, + "grad_norm": 5.851147651672363, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8560609817504883, + "num_tokens": 246235248.0, + "step": 6454 + }, + { + "epoch": 0.8211423483017428, + "ewc_loss": 0.050372496247291565, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020709411182906479, + "grad_norm": 5.8669819831848145, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8522859215736389, + "num_tokens": 246277016.0, + "step": 6455 + }, + { + "epoch": 0.8212695585803332, + "ewc_loss": 0.05038625746965408, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020723171473946422, + "grad_norm": 5.8793439865112305, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8565011620521545, + "num_tokens": 246312352.0, + "step": 6456 + }, + { + "epoch": 0.8213967688589238, + "ewc_loss": 0.0503494068980217, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020686323114205152, + "grad_norm": 5.844451904296875, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8461759090423584, + "num_tokens": 246355079.0, + "step": 6457 + }, + { + "epoch": 0.8215239791375143, + "ewc_loss": 0.0504758283495903, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002081274287775159, + "grad_norm": 5.9534430503845215, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8469768762588501, + "num_tokens": 246391073.0, + "step": 6458 + }, + { + "epoch": 0.8216511894161048, + "ewc_loss": 0.05035897344350815, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020695885177701712, + "grad_norm": 5.79594612121582, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8667446970939636, + "num_tokens": 246435407.0, + "step": 6459 + }, + { + "epoch": 0.8217783996946953, + "ewc_loss": 0.05044740438461304, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002078431862173602, + "grad_norm": 5.971620082855225, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8468645811080933, + "num_tokens": 246475303.0, + "step": 6460 + }, + { + "epoch": 0.8219056099732859, + "ewc_loss": 0.05036282539367676, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002069973706966266, + "grad_norm": 5.845054626464844, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8578657507896423, + "num_tokens": 246515237.0, + "step": 6461 + }, + { + "epoch": 0.8220328202518764, + "ewc_loss": 0.05042673274874687, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020763646170962602, + "grad_norm": 5.916537284851074, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8545151352882385, + "num_tokens": 246554958.0, + "step": 6462 + }, + { + "epoch": 0.8221600305304668, + "ewc_loss": 0.05034560710191727, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020682523609139025, + "grad_norm": 5.874513149261475, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.860699474811554, + "num_tokens": 246592327.0, + "step": 6463 + }, + { + "epoch": 0.8222872408090574, + "ewc_loss": 0.05038243532180786, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002071934868581593, + "grad_norm": 5.852773189544678, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8643771409988403, + "num_tokens": 246626965.0, + "step": 6464 + }, + { + "epoch": 0.8224144510876479, + "ewc_loss": 0.05047391727566719, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020810832211282104, + "grad_norm": 5.906827449798584, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8554623126983643, + "num_tokens": 246662049.0, + "step": 6465 + }, + { + "epoch": 0.8225416613662384, + "ewc_loss": 0.05033247917890549, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020669390505645424, + "grad_norm": 5.898148536682129, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8615198135375977, + "num_tokens": 246696989.0, + "step": 6466 + }, + { + "epoch": 0.8226688716448289, + "ewc_loss": 0.050404395908117294, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020741310436278582, + "grad_norm": 7.307496070861816, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8638762831687927, + "num_tokens": 246738418.0, + "step": 6467 + }, + { + "epoch": 0.8227960819234195, + "ewc_loss": 0.05116366967558861, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00021500582806766033, + "grad_norm": 5.881543159484863, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.860237717628479, + "num_tokens": 246776779.0, + "step": 6468 + }, + { + "epoch": 0.8229232922020099, + "ewc_loss": 0.050280988216400146, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020617902919184417, + "grad_norm": 5.961752891540527, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8655891418457031, + "num_tokens": 246814029.0, + "step": 6469 + }, + { + "epoch": 0.8230505024806004, + "ewc_loss": 0.05039257928729057, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020729494281113148, + "grad_norm": 5.838275909423828, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8513192534446716, + "num_tokens": 246856390.0, + "step": 6470 + }, + { + "epoch": 0.8231777127591909, + "ewc_loss": 0.05044329911470413, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020780212071258575, + "grad_norm": 5.977383136749268, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8609744310379028, + "num_tokens": 246895563.0, + "step": 6471 + }, + { + "epoch": 0.8233049230377815, + "ewc_loss": 0.0504167266190052, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020753640274051577, + "grad_norm": 5.8959527015686035, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8564845323562622, + "num_tokens": 246935956.0, + "step": 6472 + }, + { + "epoch": 0.823432133316372, + "ewc_loss": 0.050356365740299225, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020693281840067357, + "grad_norm": 5.902525901794434, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8563669919967651, + "num_tokens": 246984765.0, + "step": 6473 + }, + { + "epoch": 0.8235593435949625, + "ewc_loss": 0.050417475402355194, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002075439115287736, + "grad_norm": 5.955244541168213, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8620703816413879, + "num_tokens": 247017086.0, + "step": 6474 + }, + { + "epoch": 0.8236865538735529, + "ewc_loss": 0.05040774121880531, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020744655921589583, + "grad_norm": 7.371833801269531, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8650415539741516, + "num_tokens": 247056195.0, + "step": 6475 + }, + { + "epoch": 0.8238137641521435, + "ewc_loss": 0.05116706341505051, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00021503979223780334, + "grad_norm": 5.880523681640625, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8730205297470093, + "num_tokens": 247094063.0, + "step": 6476 + }, + { + "epoch": 0.823940974430734, + "ewc_loss": 0.050295956432819366, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020632872474379838, + "grad_norm": 6.053725242614746, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8583511710166931, + "num_tokens": 247131743.0, + "step": 6477 + }, + { + "epoch": 0.8240681847093245, + "ewc_loss": 0.05052434653043747, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020861263328697532, + "grad_norm": 5.919439315795898, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8576784729957581, + "num_tokens": 247169477.0, + "step": 6478 + }, + { + "epoch": 0.824195394987915, + "ewc_loss": 0.05035249888896942, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020689412485808134, + "grad_norm": 5.916780471801758, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8684366941452026, + "num_tokens": 247209159.0, + "step": 6479 + }, + { + "epoch": 0.8243226052665056, + "ewc_loss": 0.05046375095844269, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020800663332920521, + "grad_norm": 5.887270927429199, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8622188568115234, + "num_tokens": 247247603.0, + "step": 6480 + }, + { + "epoch": 0.824449815545096, + "ewc_loss": 0.050410833209753036, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002074774820357561, + "grad_norm": 5.859621524810791, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8700462579727173, + "num_tokens": 247290184.0, + "step": 6481 + }, + { + "epoch": 0.8245770258236865, + "ewc_loss": 0.05043933913111687, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002077625395031646, + "grad_norm": 5.941409587860107, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.842807948589325, + "num_tokens": 247331862.0, + "step": 6482 + }, + { + "epoch": 0.824704236102277, + "ewc_loss": 0.05041123181581497, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020748145470861346, + "grad_norm": 5.881173610687256, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8597599267959595, + "num_tokens": 247368271.0, + "step": 6483 + }, + { + "epoch": 0.8248314463808676, + "ewc_loss": 0.05045528709888458, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020792199939023703, + "grad_norm": 5.945605278015137, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8596292734146118, + "num_tokens": 247408077.0, + "step": 6484 + }, + { + "epoch": 0.8249586566594581, + "ewc_loss": 0.050354089587926865, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020691003010142595, + "grad_norm": 5.86680269241333, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8671562671661377, + "num_tokens": 247447408.0, + "step": 6485 + }, + { + "epoch": 0.8250858669380486, + "ewc_loss": 0.050371259450912476, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020708171359729022, + "grad_norm": 5.9353413581848145, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8706406354904175, + "num_tokens": 247489863.0, + "step": 6486 + }, + { + "epoch": 0.8252130772166391, + "ewc_loss": 0.050517771393060684, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002073261421173811, + "grad_norm": 5.833826065063477, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8552930355072021, + "num_tokens": 247532233.0, + "step": 6487 + }, + { + "epoch": 0.8253402874952296, + "ewc_loss": 0.05057433247566223, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020789177506230772, + "grad_norm": 5.945557117462158, + "learning_rate": 1e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.835745096206665, + "num_tokens": 247570719.0, + "step": 6488 + }, + { + "epoch": 0.8254674977738201, + "ewc_loss": 0.0503569170832634, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002069383190246299, + "grad_norm": 5.939446449279785, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8645483255386353, + "num_tokens": 247603302.0, + "step": 6489 + }, + { + "epoch": 0.8255947080524106, + "ewc_loss": 0.050447285175323486, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002078420075122267, + "grad_norm": 5.889359474182129, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8632696866989136, + "num_tokens": 247642837.0, + "step": 6490 + }, + { + "epoch": 0.8257219183310012, + "ewc_loss": 0.050389956682920456, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020726870570797473, + "grad_norm": 6.0016984939575195, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8578742146492004, + "num_tokens": 247685246.0, + "step": 6491 + }, + { + "epoch": 0.8258491286095917, + "ewc_loss": 0.05036281421780586, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020699728338513523, + "grad_norm": 5.874452590942383, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8504255414009094, + "num_tokens": 247720871.0, + "step": 6492 + }, + { + "epoch": 0.8259763388881821, + "ewc_loss": 0.05043110251426697, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020768019021488726, + "grad_norm": 6.095879077911377, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8610328435897827, + "num_tokens": 247759796.0, + "step": 6493 + }, + { + "epoch": 0.8261035491667726, + "ewc_loss": 0.050265565514564514, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020602480799425393, + "grad_norm": 5.816133499145508, + "learning_rate": 1e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8389607071876526, + "num_tokens": 247806486.0, + "step": 6494 + }, + { + "epoch": 0.8262307594453632, + "ewc_loss": 0.05041101574897766, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020747930102515966, + "grad_norm": 5.91221809387207, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8671672344207764, + "num_tokens": 247842031.0, + "step": 6495 + }, + { + "epoch": 0.8263579697239537, + "ewc_loss": 0.05030863359570503, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020645547192543745, + "grad_norm": 5.788862705230713, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8637876510620117, + "num_tokens": 247883438.0, + "step": 6496 + }, + { + "epoch": 0.8264851800025442, + "ewc_loss": 0.0504583865404129, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020795298041775823, + "grad_norm": 5.928497791290283, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8679325580596924, + "num_tokens": 247921491.0, + "step": 6497 + }, + { + "epoch": 0.8266123902811348, + "ewc_loss": 0.05038011819124222, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020717032020911574, + "grad_norm": 5.893687725067139, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8702512383460999, + "num_tokens": 247957454.0, + "step": 6498 + }, + { + "epoch": 0.8267396005597252, + "ewc_loss": 0.05043177679181099, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020768691319972277, + "grad_norm": 5.921019554138184, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8449961543083191, + "num_tokens": 247997705.0, + "step": 6499 + }, + { + "epoch": 0.8268668108383157, + "ewc_loss": 0.05043929070234299, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020776204473804682, + "grad_norm": 5.918401718139648, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8549920916557312, + "num_tokens": 248034536.0, + "step": 6500 + }, + { + "epoch": 0.8269940211169062, + "ewc_loss": 0.05040682479739189, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020743739150930196, + "grad_norm": 5.843908786773682, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8608736395835876, + "num_tokens": 248080916.0, + "step": 6501 + }, + { + "epoch": 0.8271212313954968, + "ewc_loss": 0.05048716813325882, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020824081730097532, + "grad_norm": 5.913335800170898, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8591943383216858, + "num_tokens": 248119719.0, + "step": 6502 + }, + { + "epoch": 0.8272484416740873, + "ewc_loss": 0.050453584641218185, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020790498820133507, + "grad_norm": 5.916840076446533, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8573487997055054, + "num_tokens": 248156221.0, + "step": 6503 + }, + { + "epoch": 0.8273756519526778, + "ewc_loss": 0.050423238426446915, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002076015225611627, + "grad_norm": 5.881124496459961, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8624996542930603, + "num_tokens": 248191780.0, + "step": 6504 + }, + { + "epoch": 0.8275028622312682, + "ewc_loss": 0.050515059381723404, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020851973386015743, + "grad_norm": 5.9394612312316895, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8547875881195068, + "num_tokens": 248228594.0, + "step": 6505 + }, + { + "epoch": 0.8276300725098588, + "ewc_loss": 0.05047699809074402, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020813914306927472, + "grad_norm": 5.897803783416748, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8651385307312012, + "num_tokens": 248260372.0, + "step": 6506 + }, + { + "epoch": 0.8277572827884493, + "ewc_loss": 0.05068036541342735, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020895208581350744, + "grad_norm": 5.93131685256958, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8519283533096313, + "num_tokens": 248299361.0, + "step": 6507 + }, + { + "epoch": 0.8278844930670398, + "ewc_loss": 0.050571296364068985, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020786140521522611, + "grad_norm": 5.89989709854126, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8511068224906921, + "num_tokens": 248337370.0, + "step": 6508 + }, + { + "epoch": 0.8280117033456303, + "ewc_loss": 0.05065222829580307, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020867072453256696, + "grad_norm": 5.867199420928955, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8624194860458374, + "num_tokens": 248377541.0, + "step": 6509 + }, + { + "epoch": 0.8281389136242209, + "ewc_loss": 0.05065507814288139, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002086992171825841, + "grad_norm": 5.914716720581055, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8421357870101929, + "num_tokens": 248413928.0, + "step": 6510 + }, + { + "epoch": 0.8282661239028114, + "ewc_loss": 0.050641581416130066, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020856426272075623, + "grad_norm": 5.909945487976074, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8595146536827087, + "num_tokens": 248453283.0, + "step": 6511 + }, + { + "epoch": 0.8283933341814018, + "ewc_loss": 0.050630684942007065, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.000208455283427611, + "grad_norm": 5.8748979568481445, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8645576238632202, + "num_tokens": 248490286.0, + "step": 6512 + }, + { + "epoch": 0.8285205444599923, + "ewc_loss": 0.05066274106502533, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002087758475681767, + "grad_norm": 5.921472549438477, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8440772294998169, + "num_tokens": 248526349.0, + "step": 6513 + }, + { + "epoch": 0.8286477547385829, + "ewc_loss": 0.05063086003065109, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020845704420935363, + "grad_norm": 5.895812511444092, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8533251285552979, + "num_tokens": 248561751.0, + "step": 6514 + }, + { + "epoch": 0.8287749650171734, + "ewc_loss": 0.0506286546587944, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020843498350586742, + "grad_norm": 5.857807636260986, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8734264373779297, + "num_tokens": 248601921.0, + "step": 6515 + }, + { + "epoch": 0.8289021752957639, + "ewc_loss": 0.05059944838285446, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020814294111914933, + "grad_norm": 5.860498905181885, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8709734678268433, + "num_tokens": 248641218.0, + "step": 6516 + }, + { + "epoch": 0.8290293855743545, + "ewc_loss": 0.05065580829977989, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020870652224402875, + "grad_norm": 5.980116367340088, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8615965843200684, + "num_tokens": 248672616.0, + "step": 6517 + }, + { + "epoch": 0.8291565958529449, + "ewc_loss": 0.05059308558702469, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020807927649002522, + "grad_norm": 5.859881401062012, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8632580041885376, + "num_tokens": 248708212.0, + "step": 6518 + }, + { + "epoch": 0.8292838061315354, + "ewc_loss": 0.05066660791635513, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020881451200693846, + "grad_norm": 5.97882080078125, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8630505800247192, + "num_tokens": 248746575.0, + "step": 6519 + }, + { + "epoch": 0.8294110164101259, + "ewc_loss": 0.050622835755348206, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020837679039686918, + "grad_norm": 5.861976623535156, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8556551933288574, + "num_tokens": 248789423.0, + "step": 6520 + }, + { + "epoch": 0.8295382266887165, + "ewc_loss": 0.050663061439991, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020877906354144216, + "grad_norm": 5.931797504425049, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8624556660652161, + "num_tokens": 248824044.0, + "step": 6521 + }, + { + "epoch": 0.829665436967307, + "ewc_loss": 0.050577517598867416, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020792361465282738, + "grad_norm": 5.891384124755859, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8573355674743652, + "num_tokens": 248860805.0, + "step": 6522 + }, + { + "epoch": 0.8297926472458975, + "ewc_loss": 0.05063385143876076, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020848694839514792, + "grad_norm": 5.922651290893555, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8660028576850891, + "num_tokens": 248898958.0, + "step": 6523 + }, + { + "epoch": 0.8299198575244879, + "ewc_loss": 0.050583094358444214, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020797940669581294, + "grad_norm": 5.838343620300293, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8588587045669556, + "num_tokens": 248938847.0, + "step": 6524 + }, + { + "epoch": 0.8300470678030785, + "ewc_loss": 0.05063554272055626, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020850385772064328, + "grad_norm": 5.940983772277832, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8569996356964111, + "num_tokens": 248977944.0, + "step": 6525 + }, + { + "epoch": 0.830174278081669, + "ewc_loss": 0.050597019493579865, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020811865397263318, + "grad_norm": 5.8835368156433105, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8528159856796265, + "num_tokens": 249014610.0, + "step": 6526 + }, + { + "epoch": 0.8303014883602595, + "ewc_loss": 0.05062045529484749, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020835298346355557, + "grad_norm": 5.943915843963623, + "learning_rate": 1e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.830117404460907, + "num_tokens": 249053869.0, + "step": 6527 + }, + { + "epoch": 0.83042869863885, + "ewc_loss": 0.05058741942048073, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002080226258840412, + "grad_norm": 5.848330020904541, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8526747822761536, + "num_tokens": 249096874.0, + "step": 6528 + }, + { + "epoch": 0.8305559089174406, + "ewc_loss": 0.050648584961891174, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002086342719849199, + "grad_norm": 5.959199905395508, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8575800657272339, + "num_tokens": 249133141.0, + "step": 6529 + }, + { + "epoch": 0.830683119196031, + "ewc_loss": 0.050628725439310074, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002084356965497136, + "grad_norm": 6.064131259918213, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8653088808059692, + "num_tokens": 249163813.0, + "step": 6530 + }, + { + "epoch": 0.8308103294746215, + "ewc_loss": 0.05052873492240906, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020743576169479638, + "grad_norm": 5.883914947509766, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8526405096054077, + "num_tokens": 249202935.0, + "step": 6531 + }, + { + "epoch": 0.830937539753212, + "ewc_loss": 0.05074726790189743, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.0002084003936033696, + "grad_norm": 5.950747489929199, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8663642406463623, + "num_tokens": 249241061.0, + "step": 6532 + }, + { + "epoch": 0.8310647500318026, + "ewc_loss": 0.0506923571228981, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020785132073797286, + "grad_norm": 5.832926273345947, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8471904397010803, + "num_tokens": 249277863.0, + "step": 6533 + }, + { + "epoch": 0.8311919603103931, + "ewc_loss": 0.050790391862392426, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020883163961116225, + "grad_norm": 5.932950019836426, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8584698438644409, + "num_tokens": 249318047.0, + "step": 6534 + }, + { + "epoch": 0.8313191705889836, + "ewc_loss": 0.05089966207742691, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.000208703670068644, + "grad_norm": 5.917172431945801, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8545199036598206, + "num_tokens": 249362095.0, + "step": 6535 + }, + { + "epoch": 0.831446380867574, + "ewc_loss": 0.05092960596084595, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.00020900309027638286, + "grad_norm": 5.891736030578613, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.872673511505127, + "num_tokens": 249403362.0, + "step": 6536 + }, + { + "epoch": 0.8315735911461646, + "ewc_loss": 0.05085303634405136, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.00020823738304898143, + "grad_norm": 5.893435478210449, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8661185503005981, + "num_tokens": 249438051.0, + "step": 6537 + }, + { + "epoch": 0.8317008014247551, + "ewc_loss": 0.050911761820316315, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.0002088246401399374, + "grad_norm": 5.912785053253174, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8643547296524048, + "num_tokens": 249477185.0, + "step": 6538 + }, + { + "epoch": 0.8318280117033456, + "ewc_loss": 0.05078895390033722, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020881727687083185, + "grad_norm": 5.926949501037598, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8522301912307739, + "num_tokens": 249516880.0, + "step": 6539 + }, + { + "epoch": 0.8319552219819362, + "ewc_loss": 0.05074847489595413, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.0002084124571410939, + "grad_norm": 5.886106967926025, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8699192404747009, + "num_tokens": 249555864.0, + "step": 6540 + }, + { + "epoch": 0.8320824322605267, + "ewc_loss": 0.05077480897307396, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020867583225481212, + "grad_norm": 5.928867340087891, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8590555191040039, + "num_tokens": 249590566.0, + "step": 6541 + }, + { + "epoch": 0.8322096425391171, + "ewc_loss": 0.05075646936893463, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020849244901910424, + "grad_norm": 5.858889102935791, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8721448183059692, + "num_tokens": 249631523.0, + "step": 6542 + }, + { + "epoch": 0.8323368528177076, + "ewc_loss": 0.05066747963428497, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020760251209139824, + "grad_norm": 5.853387355804443, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8679755926132202, + "num_tokens": 249674113.0, + "step": 6543 + }, + { + "epoch": 0.8324640630962982, + "ewc_loss": 0.05060345679521561, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020818301709368825, + "grad_norm": 5.8930983543396, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8711503148078918, + "num_tokens": 249712934.0, + "step": 6544 + }, + { + "epoch": 0.8325912733748887, + "ewc_loss": 0.050581276416778564, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020796121680177748, + "grad_norm": 5.894311428070068, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8654289245605469, + "num_tokens": 249750295.0, + "step": 6545 + }, + { + "epoch": 0.8327184836534792, + "ewc_loss": 0.05062863603234291, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020843479433096945, + "grad_norm": 5.979773998260498, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8636153340339661, + "num_tokens": 249790289.0, + "step": 6546 + }, + { + "epoch": 0.8328456939320698, + "ewc_loss": 0.05058315023779869, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020797993056476116, + "grad_norm": 5.91981315612793, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8720795512199402, + "num_tokens": 249822873.0, + "step": 6547 + }, + { + "epoch": 0.8329729042106602, + "ewc_loss": 0.05055294185876846, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020767786190845072, + "grad_norm": 5.874759197235107, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8616060018539429, + "num_tokens": 249860564.0, + "step": 6548 + }, + { + "epoch": 0.8331001144892507, + "ewc_loss": 0.05053099989891052, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002074584481306374, + "grad_norm": 5.926838397979736, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8591430187225342, + "num_tokens": 249901448.0, + "step": 6549 + }, + { + "epoch": 0.8332273247678412, + "ewc_loss": 0.05057808756828308, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020792930445168167, + "grad_norm": 5.999227523803711, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8489220142364502, + "num_tokens": 249933524.0, + "step": 6550 + }, + { + "epoch": 0.8333545350464318, + "ewc_loss": 0.050519414246082306, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020734257122967392, + "grad_norm": 5.865379333496094, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8546209335327148, + "num_tokens": 249971856.0, + "step": 6551 + }, + { + "epoch": 0.8334817453250223, + "ewc_loss": 0.05049809813499451, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020712944387923926, + "grad_norm": 5.843442440032959, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8523858189582825, + "num_tokens": 250013940.0, + "step": 6552 + }, + { + "epoch": 0.8336089556036128, + "ewc_loss": 0.05056242272257805, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020777266763616353, + "grad_norm": 6.028566837310791, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8413591980934143, + "num_tokens": 250044826.0, + "step": 6553 + }, + { + "epoch": 0.8337361658822032, + "ewc_loss": 0.05054185166954994, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002075669472105801, + "grad_norm": 5.837562084197998, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.842954158782959, + "num_tokens": 250087409.0, + "step": 6554 + }, + { + "epoch": 0.8338633761607938, + "ewc_loss": 0.05057594180107117, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002079078694805503, + "grad_norm": 5.881712913513184, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8669851422309875, + "num_tokens": 250126236.0, + "step": 6555 + }, + { + "epoch": 0.8339905864393843, + "ewc_loss": 0.05054730176925659, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020762142958119512, + "grad_norm": 5.862606048583984, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8682326078414917, + "num_tokens": 250163559.0, + "step": 6556 + }, + { + "epoch": 0.8341177967179748, + "ewc_loss": 0.05061084032058716, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002082568680634722, + "grad_norm": 5.883393287658691, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8605502843856812, + "num_tokens": 250203536.0, + "step": 6557 + }, + { + "epoch": 0.8342450069965653, + "ewc_loss": 0.05058705061674118, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002080189442494884, + "grad_norm": 5.865559101104736, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8657760620117188, + "num_tokens": 250237455.0, + "step": 6558 + }, + { + "epoch": 0.8343722172751559, + "ewc_loss": 0.05059443786740303, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002080928097711876, + "grad_norm": 5.879605770111084, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.867982029914856, + "num_tokens": 250274559.0, + "step": 6559 + }, + { + "epoch": 0.8344994275537464, + "ewc_loss": 0.05066376179456711, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.000208786063012667, + "grad_norm": 5.894055366516113, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8615607023239136, + "num_tokens": 250309392.0, + "step": 6560 + }, + { + "epoch": 0.8346266378323368, + "ewc_loss": 0.05063765496015549, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020852497254963964, + "grad_norm": 5.883174896240234, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8646853566169739, + "num_tokens": 250352010.0, + "step": 6561 + }, + { + "epoch": 0.8347538481109273, + "ewc_loss": 0.05068717896938324, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002090202469844371, + "grad_norm": 5.984427452087402, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8475626111030579, + "num_tokens": 250385745.0, + "step": 6562 + }, + { + "epoch": 0.8348810583895179, + "ewc_loss": 0.05056402087211609, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002077886601909995, + "grad_norm": 5.8523640632629395, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8588775396347046, + "num_tokens": 250426055.0, + "step": 6563 + }, + { + "epoch": 0.8350082686681084, + "ewc_loss": 0.05067553371191025, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002089037880068645, + "grad_norm": 5.977100849151611, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.857207179069519, + "num_tokens": 250468795.0, + "step": 6564 + }, + { + "epoch": 0.8351354789466989, + "ewc_loss": 0.05060502886772156, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020819873316213489, + "grad_norm": 5.890255451202393, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8582993745803833, + "num_tokens": 250509115.0, + "step": 6565 + }, + { + "epoch": 0.8352626892252895, + "ewc_loss": 0.050630610436201096, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020845454127993435, + "grad_norm": 5.894968032836914, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8610198497772217, + "num_tokens": 250547770.0, + "step": 6566 + }, + { + "epoch": 0.8353898995038799, + "ewc_loss": 0.05059411749243736, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020808960834983736, + "grad_norm": 5.887760162353516, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8575857281684875, + "num_tokens": 250589737.0, + "step": 6567 + }, + { + "epoch": 0.8355171097824704, + "ewc_loss": 0.050610531121492386, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020825375395361334, + "grad_norm": 5.954603672027588, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.866715133190155, + "num_tokens": 250623058.0, + "step": 6568 + }, + { + "epoch": 0.8356443200610609, + "ewc_loss": 0.050596002489328384, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002081084530800581, + "grad_norm": 5.887547969818115, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8596978783607483, + "num_tokens": 250660526.0, + "step": 6569 + }, + { + "epoch": 0.8357715303396515, + "ewc_loss": 0.05066470056772232, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020879546354990453, + "grad_norm": 5.929481506347656, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8451074957847595, + "num_tokens": 250698218.0, + "step": 6570 + }, + { + "epoch": 0.835898740618242, + "ewc_loss": 0.050560496747493744, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020775338634848595, + "grad_norm": 5.8360371589660645, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8526351451873779, + "num_tokens": 250743410.0, + "step": 6571 + }, + { + "epoch": 0.8360259508968325, + "ewc_loss": 0.051869168877601624, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00020863310783170164, + "grad_norm": 53.57672882080078, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8699037432670593, + "num_tokens": 250781919.0, + "step": 6572 + }, + { + "epoch": 0.8361531611754229, + "ewc_loss": 0.07574943453073502, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.0004572013858705759, + "grad_norm": 9.517889976501465, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8575271964073181, + "num_tokens": 250817051.0, + "step": 6573 + }, + { + "epoch": 0.8362803714540135, + "ewc_loss": 0.054509855806827545, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.00024480558931827545, + "grad_norm": 5.571630477905273, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8578060865402222, + "num_tokens": 250857241.0, + "step": 6574 + }, + { + "epoch": 0.836407581732604, + "ewc_loss": 0.06099642068147659, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00031089194817468524, + "grad_norm": 8.217641830444336, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8609365224838257, + "num_tokens": 250903591.0, + "step": 6575 + }, + { + "epoch": 0.8365347920111945, + "ewc_loss": 0.06733546406030655, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.0003730616590473801, + "grad_norm": 8.42839527130127, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8523174524307251, + "num_tokens": 250945788.0, + "step": 6576 + }, + { + "epoch": 0.836662002289785, + "ewc_loss": 0.05631779879331589, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00026410570717416704, + "grad_norm": 6.372768402099609, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8530301451683044, + "num_tokens": 250984884.0, + "step": 6577 + }, + { + "epoch": 0.8367892125683756, + "ewc_loss": 0.05659691244363785, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.0002668968809302896, + "grad_norm": 7.229948043823242, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8722696900367737, + "num_tokens": 251024347.0, + "step": 6578 + }, + { + "epoch": 0.836916422846966, + "ewc_loss": 0.0585637092590332, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00028656484209932387, + "grad_norm": 6.993974685668945, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8813489079475403, + "num_tokens": 251068527.0, + "step": 6579 + }, + { + "epoch": 0.8370436331255565, + "ewc_loss": 0.054240599274635315, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002445544523652643, + "grad_norm": 6.4405364990234375, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8645157814025879, + "num_tokens": 251109914.0, + "step": 6580 + }, + { + "epoch": 0.837170843404147, + "ewc_loss": 0.05442342534661293, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002463826967868954, + "grad_norm": 6.757408618927002, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8502016067504883, + "num_tokens": 251144760.0, + "step": 6581 + }, + { + "epoch": 0.8372980536827376, + "ewc_loss": 0.05428054928779602, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002449539315421134, + "grad_norm": 6.421806812286377, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8586134910583496, + "num_tokens": 251184456.0, + "step": 6582 + }, + { + "epoch": 0.8374252639613281, + "ewc_loss": 0.05273168534040451, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002306859678355977, + "grad_norm": 6.339179039001465, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.851879358291626, + "num_tokens": 251229493.0, + "step": 6583 + }, + { + "epoch": 0.8375524742399186, + "ewc_loss": 0.05271952971816063, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002305644447915256, + "grad_norm": 6.312625885009766, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8651320934295654, + "num_tokens": 251270886.0, + "step": 6584 + }, + { + "epoch": 0.837679684518509, + "ewc_loss": 0.052224524319171906, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00022561440709978342, + "grad_norm": 6.251795768737793, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8692895174026489, + "num_tokens": 251304833.0, + "step": 6585 + }, + { + "epoch": 0.8378068947970996, + "ewc_loss": 0.051913630217313766, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00022250544861890376, + "grad_norm": 6.266552925109863, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8636649250984192, + "num_tokens": 251334196.0, + "step": 6586 + }, + { + "epoch": 0.8379341050756901, + "ewc_loss": 0.05167238414287567, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00022009300300851464, + "grad_norm": 6.108145713806152, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.850078821182251, + "num_tokens": 251380278.0, + "step": 6587 + }, + { + "epoch": 0.8380613153542806, + "ewc_loss": 0.05138687416911125, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002172378881368786, + "grad_norm": 6.186771392822266, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.854148805141449, + "num_tokens": 251418432.0, + "step": 6588 + }, + { + "epoch": 0.8381885256328712, + "ewc_loss": 0.05119037628173828, + "ewc_loss_diag": 2.956390380859375e-05, + "ewc_loss_parallel": 0.0002164936304325238, + "grad_norm": 6.094548225402832, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8577278852462769, + "num_tokens": 251453747.0, + "step": 6589 + }, + { + "epoch": 0.8383157359114617, + "ewc_loss": 0.05087737739086151, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00021214289881754667, + "grad_norm": 5.9802021980285645, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8605408668518066, + "num_tokens": 251492439.0, + "step": 6590 + }, + { + "epoch": 0.8384429461900521, + "ewc_loss": 0.05099007114768028, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00021326984278857708, + "grad_norm": 6.099565029144287, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8615760207176208, + "num_tokens": 251527945.0, + "step": 6591 + }, + { + "epoch": 0.8385701564686426, + "ewc_loss": 0.050792668014764786, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.0002112958172801882, + "grad_norm": 5.976588726043701, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8685651421546936, + "num_tokens": 251567161.0, + "step": 6592 + }, + { + "epoch": 0.8386973667472332, + "ewc_loss": 0.05075839161872864, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00021095306146889925, + "grad_norm": 6.0536208152771, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8601200580596924, + "num_tokens": 251601189.0, + "step": 6593 + }, + { + "epoch": 0.8388245770258237, + "ewc_loss": 0.050699807703495026, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00021036724501755089, + "grad_norm": 6.023135662078857, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8577734231948853, + "num_tokens": 251630846.0, + "step": 6594 + }, + { + "epoch": 0.8389517873044142, + "ewc_loss": 0.05062002316117287, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020956936350557953, + "grad_norm": 5.955792427062988, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8675049543380737, + "num_tokens": 251665701.0, + "step": 6595 + }, + { + "epoch": 0.8390789975830047, + "ewc_loss": 0.05063178390264511, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020968700118828565, + "grad_norm": 6.015466213226318, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.852237343788147, + "num_tokens": 251699909.0, + "step": 6596 + }, + { + "epoch": 0.8392062078615952, + "ewc_loss": 0.05064249783754349, + "ewc_loss_diag": 2.968311309814453e-05, + "ewc_loss_parallel": 0.00020979410328436643, + "grad_norm": 6.026998043060303, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8582928776741028, + "num_tokens": 251734052.0, + "step": 6597 + }, + { + "epoch": 0.8393334181401857, + "ewc_loss": 0.05068129301071167, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002089613553835079, + "grad_norm": 5.933629512786865, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8486673831939697, + "num_tokens": 251772518.0, + "step": 6598 + }, + { + "epoch": 0.8394606284187762, + "ewc_loss": 0.05072542652487755, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020940270042046905, + "grad_norm": 5.917419910430908, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8570294380187988, + "num_tokens": 251816378.0, + "step": 6599 + }, + { + "epoch": 0.8395878386973668, + "ewc_loss": 0.050713714212179184, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020928557205479592, + "grad_norm": 5.9740071296691895, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.859269380569458, + "num_tokens": 251851264.0, + "step": 6600 + }, + { + "epoch": 0.8397150489759573, + "ewc_loss": 0.05077683553099632, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002099167904816568, + "grad_norm": 5.982416152954102, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8617351055145264, + "num_tokens": 251888906.0, + "step": 6601 + }, + { + "epoch": 0.8398422592545478, + "ewc_loss": 0.050710901618003845, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020925745775457472, + "grad_norm": 5.931007385253906, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8661530017852783, + "num_tokens": 251930641.0, + "step": 6602 + }, + { + "epoch": 0.8399694695331382, + "ewc_loss": 0.05077800154685974, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020992843201383948, + "grad_norm": 6.007088661193848, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8413691520690918, + "num_tokens": 251968929.0, + "step": 6603 + }, + { + "epoch": 0.8400966798117288, + "ewc_loss": 0.0507560595870018, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020970904733985662, + "grad_norm": 5.972715854644775, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8511385917663574, + "num_tokens": 252006751.0, + "step": 6604 + }, + { + "epoch": 0.8402238900903193, + "ewc_loss": 0.05070611834526062, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020920962560921907, + "grad_norm": 5.946965217590332, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.854938805103302, + "num_tokens": 252045861.0, + "step": 6605 + }, + { + "epoch": 0.8403511003689098, + "ewc_loss": 0.0507170669734478, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002093191142193973, + "grad_norm": 5.952234745025635, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8522857427597046, + "num_tokens": 252084412.0, + "step": 6606 + }, + { + "epoch": 0.8404783106475003, + "ewc_loss": 0.05068853497505188, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002090337802655995, + "grad_norm": 5.959906578063965, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.864207923412323, + "num_tokens": 252120576.0, + "step": 6607 + }, + { + "epoch": 0.8406055209260909, + "ewc_loss": 0.050732485949993134, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020947330631315708, + "grad_norm": 5.9232354164123535, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8570086359977722, + "num_tokens": 252161636.0, + "step": 6608 + }, + { + "epoch": 0.8407327312046813, + "ewc_loss": 0.05069330334663391, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020908148144371808, + "grad_norm": 5.980884075164795, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8611908555030823, + "num_tokens": 252199669.0, + "step": 6609 + }, + { + "epoch": 0.8408599414832718, + "ewc_loss": 0.05070739984512329, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020922244584653527, + "grad_norm": 5.881439208984375, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8682688474655151, + "num_tokens": 252236832.0, + "step": 6610 + }, + { + "epoch": 0.8409871517618623, + "ewc_loss": 0.050794169306755066, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002100901328958571, + "grad_norm": 5.9473371505737305, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8549449443817139, + "num_tokens": 252282518.0, + "step": 6611 + }, + { + "epoch": 0.8411143620404529, + "ewc_loss": 0.050772774964571, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020987619063816965, + "grad_norm": 5.980400562286377, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8541080951690674, + "num_tokens": 252316479.0, + "step": 6612 + }, + { + "epoch": 0.8412415723190434, + "ewc_loss": 0.05076669901609421, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020981545094400644, + "grad_norm": 5.959722518920898, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8530241250991821, + "num_tokens": 252353789.0, + "step": 6613 + }, + { + "epoch": 0.8413687825976339, + "ewc_loss": 0.05068528279662132, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020900127128697932, + "grad_norm": 5.933394908905029, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8617477416992188, + "num_tokens": 252391609.0, + "step": 6614 + }, + { + "epoch": 0.8414959928762245, + "ewc_loss": 0.05075893551111221, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020973778737243265, + "grad_norm": 5.89482307434082, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8728207349777222, + "num_tokens": 252433514.0, + "step": 6615 + }, + { + "epoch": 0.8416232031548149, + "ewc_loss": 0.050789155066013336, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00021004000154789537, + "grad_norm": 5.931399822235107, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8633893132209778, + "num_tokens": 252469700.0, + "step": 6616 + }, + { + "epoch": 0.8417504134334054, + "ewc_loss": 0.050787825137376785, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00021002668654546142, + "grad_norm": 5.965502738952637, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8566300272941589, + "num_tokens": 252501658.0, + "step": 6617 + }, + { + "epoch": 0.8418776237119959, + "ewc_loss": 0.050737716257572174, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020952557679265738, + "grad_norm": 5.942091941833496, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8611158728599548, + "num_tokens": 252534546.0, + "step": 6618 + }, + { + "epoch": 0.8420048339905865, + "ewc_loss": 0.05079513043165207, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00021009973715990782, + "grad_norm": 5.97913122177124, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8679445385932922, + "num_tokens": 252573172.0, + "step": 6619 + }, + { + "epoch": 0.842132044269177, + "ewc_loss": 0.050742197781801224, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020957041124347597, + "grad_norm": 5.945960998535156, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8688061237335205, + "num_tokens": 252607258.0, + "step": 6620 + }, + { + "epoch": 0.8422592545477675, + "ewc_loss": 0.05072697997093201, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020941821276210248, + "grad_norm": 5.871524333953857, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8702913522720337, + "num_tokens": 252651451.0, + "step": 6621 + }, + { + "epoch": 0.8423864648263579, + "ewc_loss": 0.05083981156349182, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00021054655371699482, + "grad_norm": 5.991172790527344, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8713458180427551, + "num_tokens": 252686348.0, + "step": 6622 + }, + { + "epoch": 0.8425136751049485, + "ewc_loss": 0.05075632780790329, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002097116957884282, + "grad_norm": 5.89994478225708, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.853783369064331, + "num_tokens": 252724316.0, + "step": 6623 + }, + { + "epoch": 0.842640885383539, + "ewc_loss": 0.0508052259683609, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00021020069834776223, + "grad_norm": 5.971757411956787, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8496954441070557, + "num_tokens": 252764612.0, + "step": 6624 + }, + { + "epoch": 0.8427680956621295, + "ewc_loss": 0.05078386887907982, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002099871198879555, + "grad_norm": 5.931424140930176, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8390438556671143, + "num_tokens": 252809510.0, + "step": 6625 + }, + { + "epoch": 0.84289530594072, + "ewc_loss": 0.05072910338640213, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020943948766216636, + "grad_norm": 5.98235559463501, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8572903871536255, + "num_tokens": 252847598.0, + "step": 6626 + }, + { + "epoch": 0.8430225162193106, + "ewc_loss": 0.05071509629487991, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020929938182234764, + "grad_norm": 5.911561012268066, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8455599546432495, + "num_tokens": 252884401.0, + "step": 6627 + }, + { + "epoch": 0.843149726497901, + "ewc_loss": 0.050753459334373474, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002096830285154283, + "grad_norm": 5.944570541381836, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8427504897117615, + "num_tokens": 252928348.0, + "step": 6628 + }, + { + "epoch": 0.8432769367764915, + "ewc_loss": 0.05067076534032822, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020885611593257636, + "grad_norm": 5.925786972045898, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8585840463638306, + "num_tokens": 252965084.0, + "step": 6629 + }, + { + "epoch": 0.843404147055082, + "ewc_loss": 0.0507289320230484, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020943775598425418, + "grad_norm": 5.899875164031982, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8634032011032104, + "num_tokens": 253003697.0, + "step": 6630 + }, + { + "epoch": 0.8435313573336726, + "ewc_loss": 0.050744567066431046, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020959410176146775, + "grad_norm": 5.981784820556641, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8450949192047119, + "num_tokens": 253037239.0, + "step": 6631 + }, + { + "epoch": 0.8436585676122631, + "ewc_loss": 0.05080851912498474, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020901294192299247, + "grad_norm": 5.8773908615112305, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8512227535247803, + "num_tokens": 253078163.0, + "step": 6632 + }, + { + "epoch": 0.8437857778908536, + "ewc_loss": 0.050909094512462616, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00021001865388825536, + "grad_norm": 5.918588161468506, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8576914072036743, + "num_tokens": 253118907.0, + "step": 6633 + }, + { + "epoch": 0.843912988169444, + "ewc_loss": 0.05086677148938179, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020959544053766876, + "grad_norm": 5.9925618171691895, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8773495554924011, + "num_tokens": 253153132.0, + "step": 6634 + }, + { + "epoch": 0.8440401984480346, + "ewc_loss": 0.05094053968787193, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.0002103331353282556, + "grad_norm": 5.9613871574401855, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8556803464889526, + "num_tokens": 253194446.0, + "step": 6635 + }, + { + "epoch": 0.8441674087266251, + "ewc_loss": 0.05089883878827095, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020991612109355628, + "grad_norm": 5.933942794799805, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8623214960098267, + "num_tokens": 253230798.0, + "step": 6636 + }, + { + "epoch": 0.8442946190052156, + "ewc_loss": 0.05079471692442894, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020887490245513618, + "grad_norm": 5.8892388343811035, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.852164626121521, + "num_tokens": 253269029.0, + "step": 6637 + }, + { + "epoch": 0.8444218292838062, + "ewc_loss": 0.050895124673843384, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020987897005397826, + "grad_norm": 5.919443607330322, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8559202551841736, + "num_tokens": 253310656.0, + "step": 6638 + }, + { + "epoch": 0.8445490395623967, + "ewc_loss": 0.050876036286354065, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020968812168575823, + "grad_norm": 5.918924808502197, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8774408102035522, + "num_tokens": 253346882.0, + "step": 6639 + }, + { + "epoch": 0.8446762498409871, + "ewc_loss": 0.05094417184591293, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.0002103694569086656, + "grad_norm": 5.973386764526367, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.850334644317627, + "num_tokens": 253389601.0, + "step": 6640 + }, + { + "epoch": 0.8448034601195776, + "ewc_loss": 0.0507589690387249, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002097381220664829, + "grad_norm": 5.968449115753174, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8583382368087769, + "num_tokens": 253428725.0, + "step": 6641 + }, + { + "epoch": 0.8449306703981682, + "ewc_loss": 0.050774943083524704, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002098978729918599, + "grad_norm": 5.999368667602539, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.872803807258606, + "num_tokens": 253461404.0, + "step": 6642 + }, + { + "epoch": 0.8450578806767587, + "ewc_loss": 0.050746772438287735, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020961616246495396, + "grad_norm": 5.99183464050293, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8634260892868042, + "num_tokens": 253500324.0, + "step": 6643 + }, + { + "epoch": 0.8451850909553492, + "ewc_loss": 0.05077970400452614, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002099454723065719, + "grad_norm": 6.019423484802246, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8470016717910767, + "num_tokens": 253542500.0, + "step": 6644 + }, + { + "epoch": 0.8453123012339397, + "ewc_loss": 0.05064015835523605, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020855000184383243, + "grad_norm": 5.949451923370361, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8420411944389343, + "num_tokens": 253581887.0, + "step": 6645 + }, + { + "epoch": 0.8454395115125302, + "ewc_loss": 0.05065523833036423, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.000208700803341344, + "grad_norm": 5.952630043029785, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8675360679626465, + "num_tokens": 253616942.0, + "step": 6646 + }, + { + "epoch": 0.8455667217911207, + "ewc_loss": 0.05070890858769417, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020923752163071185, + "grad_norm": 5.976989269256592, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8540847301483154, + "num_tokens": 253652610.0, + "step": 6647 + }, + { + "epoch": 0.8456939320697112, + "ewc_loss": 0.050659749656915665, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002087459433823824, + "grad_norm": 5.937507152557373, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8582700490951538, + "num_tokens": 253686269.0, + "step": 6648 + }, + { + "epoch": 0.8458211423483017, + "ewc_loss": 0.05075214058160782, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.0002096698444802314, + "grad_norm": 5.998930931091309, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8535955548286438, + "num_tokens": 253723632.0, + "step": 6649 + }, + { + "epoch": 0.8459483526268923, + "ewc_loss": 0.05068288743495941, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020897731883451343, + "grad_norm": 6.020362377166748, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8533458709716797, + "num_tokens": 253755990.0, + "step": 6650 + }, + { + "epoch": 0.8460755629054828, + "ewc_loss": 0.05071427300572395, + "ewc_loss_diag": 2.9802322387695312e-05, + "ewc_loss_parallel": 0.00020929117454215884, + "grad_norm": 5.946768283843994, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8538745045661926, + "num_tokens": 253794200.0, + "step": 6651 + }, + { + "epoch": 0.8462027731840732, + "ewc_loss": 0.0507979616522789, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020890736777801067, + "grad_norm": 5.944705486297607, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8700828552246094, + "num_tokens": 253840554.0, + "step": 6652 + }, + { + "epoch": 0.8463299834626637, + "ewc_loss": 0.05080219730734825, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020894971385132521, + "grad_norm": 5.988528251647949, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8684309720993042, + "num_tokens": 253873642.0, + "step": 6653 + }, + { + "epoch": 0.8464571937412543, + "ewc_loss": 0.05086895823478699, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020961729751434177, + "grad_norm": 6.023372173309326, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8639874458312988, + "num_tokens": 253911839.0, + "step": 6654 + }, + { + "epoch": 0.8465844040198448, + "ewc_loss": 0.050789088010787964, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020881863019894809, + "grad_norm": 5.98540735244751, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8506800532341003, + "num_tokens": 253944649.0, + "step": 6655 + }, + { + "epoch": 0.8467116142984353, + "ewc_loss": 0.0508597306907177, + "ewc_loss_diag": 2.9921531677246094e-05, + "ewc_loss_parallel": 0.00020952503837179393, + "grad_norm": 25.81578254699707, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8594105839729309, + "num_tokens": 253977739.0, + "step": 6656 + }, + { + "epoch": 0.8468388245770259, + "ewc_loss": 0.06981688737869263, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0003942137991543859, + "grad_norm": 8.624163627624512, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8451905846595764, + "num_tokens": 254021216.0, + "step": 6657 + }, + { + "epoch": 0.8469660348556163, + "ewc_loss": 0.051787301898002625, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021269721037242562, + "grad_norm": 5.052314758300781, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8563551306724548, + "num_tokens": 254063209.0, + "step": 6658 + }, + { + "epoch": 0.8470932451342068, + "ewc_loss": 0.05979323387145996, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.0002903151616919786, + "grad_norm": 7.7410712242126465, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8548039793968201, + "num_tokens": 254102325.0, + "step": 6659 + }, + { + "epoch": 0.8472204554127973, + "ewc_loss": 0.0633196234703064, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0003206962428521365, + "grad_norm": 7.232122421264648, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8640312552452087, + "num_tokens": 254136991.0, + "step": 6660 + }, + { + "epoch": 0.8473476656913879, + "ewc_loss": 0.055690452456474304, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002444045094307512, + "grad_norm": 6.157793045043945, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8599447011947632, + "num_tokens": 254179520.0, + "step": 6661 + }, + { + "epoch": 0.8474748759699784, + "ewc_loss": 0.05674421787261963, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002549421915318817, + "grad_norm": 6.666314125061035, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8596158623695374, + "num_tokens": 254220661.0, + "step": 6662 + }, + { + "epoch": 0.8476020862485689, + "ewc_loss": 0.05672140419483185, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00025471404660493135, + "grad_norm": 6.327210903167725, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8669263124465942, + "num_tokens": 254261853.0, + "step": 6663 + }, + { + "epoch": 0.8477292965271594, + "ewc_loss": 0.0548088513314724, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023558850807603449, + "grad_norm": 6.2290191650390625, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.860848605632782, + "num_tokens": 254301577.0, + "step": 6664 + }, + { + "epoch": 0.8478565068057499, + "ewc_loss": 0.05496557801961899, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023715579300187528, + "grad_norm": 6.189663887023926, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8730108737945557, + "num_tokens": 254341792.0, + "step": 6665 + }, + { + "epoch": 0.8479837170843404, + "ewc_loss": 0.05430203676223755, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002305203815922141, + "grad_norm": 6.150542736053467, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8461049795150757, + "num_tokens": 254376755.0, + "step": 6666 + }, + { + "epoch": 0.8481109273629309, + "ewc_loss": 0.05389906093478203, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00022649060701951385, + "grad_norm": 6.081622123718262, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8696714639663696, + "num_tokens": 254413565.0, + "step": 6667 + }, + { + "epoch": 0.8482381376415215, + "ewc_loss": 0.05342070385813713, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022414844715967774, + "grad_norm": 6.074828147888184, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8489715456962585, + "num_tokens": 254449889.0, + "step": 6668 + }, + { + "epoch": 0.848365347920112, + "ewc_loss": 0.05310951918363571, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002210366219514981, + "grad_norm": 5.970235347747803, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8729832172393799, + "num_tokens": 254488129.0, + "step": 6669 + }, + { + "epoch": 0.8484925581987025, + "ewc_loss": 0.0528077632188797, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021801902039442211, + "grad_norm": 5.984555244445801, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8711800575256348, + "num_tokens": 254522031.0, + "step": 6670 + }, + { + "epoch": 0.8486197684772929, + "ewc_loss": 0.052726760506629944, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021720901713706553, + "grad_norm": 5.951260089874268, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8710418343544006, + "num_tokens": 254559249.0, + "step": 6671 + }, + { + "epoch": 0.8487469787558835, + "ewc_loss": 0.05252087861299515, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021515018306672573, + "grad_norm": 5.934572696685791, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8530266284942627, + "num_tokens": 254603968.0, + "step": 6672 + }, + { + "epoch": 0.848874189034474, + "ewc_loss": 0.05234496295452118, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021339106024242938, + "grad_norm": 5.936485767364502, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8695470094680786, + "num_tokens": 254642178.0, + "step": 6673 + }, + { + "epoch": 0.8490013993130645, + "ewc_loss": 0.052256420254707336, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021250561985652894, + "grad_norm": 5.884573936462402, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8513554334640503, + "num_tokens": 254679129.0, + "step": 6674 + }, + { + "epoch": 0.849128609591655, + "ewc_loss": 0.052168212831020355, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021162355551496148, + "grad_norm": 5.979100704193115, + "learning_rate": 1e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8306182026863098, + "num_tokens": 254714952.0, + "step": 6675 + }, + { + "epoch": 0.8492558198702456, + "ewc_loss": 0.05211784690618515, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021111989917699248, + "grad_norm": 5.935488224029541, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8559190630912781, + "num_tokens": 254743323.0, + "step": 6676 + }, + { + "epoch": 0.849383030148836, + "ewc_loss": 0.05209936946630478, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021093508985359222, + "grad_norm": 5.814412593841553, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8662856221199036, + "num_tokens": 254784749.0, + "step": 6677 + }, + { + "epoch": 0.8495102404274265, + "ewc_loss": 0.05203256011009216, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021026699687354267, + "grad_norm": 5.863571643829346, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8660119771957397, + "num_tokens": 254820102.0, + "step": 6678 + }, + { + "epoch": 0.849637450706017, + "ewc_loss": 0.05206102877855301, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021055169054307044, + "grad_norm": 5.891690254211426, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8569260239601135, + "num_tokens": 254859988.0, + "step": 6679 + }, + { + "epoch": 0.8497646609846076, + "ewc_loss": 0.05176670849323273, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.0002100499114021659, + "grad_norm": 5.8240766525268555, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8554511070251465, + "num_tokens": 254906792.0, + "step": 6680 + }, + { + "epoch": 0.8498918712631981, + "ewc_loss": 0.05175285041332245, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00020991133351344615, + "grad_norm": 5.941295623779297, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8618411421775818, + "num_tokens": 254941552.0, + "step": 6681 + }, + { + "epoch": 0.8500190815417886, + "ewc_loss": 0.05171802639961243, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.0002095630916301161, + "grad_norm": 5.913051128387451, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8523042798042297, + "num_tokens": 254974653.0, + "step": 6682 + }, + { + "epoch": 0.850146291820379, + "ewc_loss": 0.051647260785102844, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00020885540288873017, + "grad_norm": 5.839757442474365, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8431436419487, + "num_tokens": 255018418.0, + "step": 6683 + }, + { + "epoch": 0.8502735020989696, + "ewc_loss": 0.05162685364484787, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00020865134138148278, + "grad_norm": 5.906286239624023, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8644161820411682, + "num_tokens": 255049626.0, + "step": 6684 + }, + { + "epoch": 0.8504007123775601, + "ewc_loss": 0.05159330368041992, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00020831583242397755, + "grad_norm": 5.8801960945129395, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8542401790618896, + "num_tokens": 255086323.0, + "step": 6685 + }, + { + "epoch": 0.8505279226561506, + "ewc_loss": 0.051614902913570404, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00020853185560554266, + "grad_norm": 5.850910663604736, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8683565258979797, + "num_tokens": 255124102.0, + "step": 6686 + }, + { + "epoch": 0.8506551329347412, + "ewc_loss": 0.05155717208981514, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00020795453747268766, + "grad_norm": 5.882001876831055, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8629391193389893, + "num_tokens": 255162176.0, + "step": 6687 + }, + { + "epoch": 0.8507823432133317, + "ewc_loss": 0.05134116858243942, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002082359278574586, + "grad_norm": 5.879017353057861, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8630118370056152, + "num_tokens": 255196256.0, + "step": 6688 + }, + { + "epoch": 0.8509095534919221, + "ewc_loss": 0.051347143948078156, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020829566346947104, + "grad_norm": 5.851017951965332, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8528225421905518, + "num_tokens": 255236752.0, + "step": 6689 + }, + { + "epoch": 0.8510367637705126, + "ewc_loss": 0.051328450441360474, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002081087150145322, + "grad_norm": 5.922058582305908, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8588956594467163, + "num_tokens": 255268694.0, + "step": 6690 + }, + { + "epoch": 0.8511639740491032, + "ewc_loss": 0.051262177526950836, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020744599169120193, + "grad_norm": 5.806244373321533, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8650680184364319, + "num_tokens": 255309445.0, + "step": 6691 + }, + { + "epoch": 0.8512911843276937, + "ewc_loss": 0.05130142718553543, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002078385150525719, + "grad_norm": 5.886795997619629, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8621978759765625, + "num_tokens": 255341126.0, + "step": 6692 + }, + { + "epoch": 0.8514183946062842, + "ewc_loss": 0.05130118131637573, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020783602667506784, + "grad_norm": 5.849260330200195, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8685166239738464, + "num_tokens": 255379553.0, + "step": 6693 + }, + { + "epoch": 0.8515456048848747, + "ewc_loss": 0.051299333572387695, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020781754574272782, + "grad_norm": 5.857452392578125, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8597608804702759, + "num_tokens": 255416588.0, + "step": 6694 + }, + { + "epoch": 0.8516728151634652, + "ewc_loss": 0.0512935146689415, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020775935263372958, + "grad_norm": 5.880690097808838, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.846843957901001, + "num_tokens": 255449574.0, + "step": 6695 + }, + { + "epoch": 0.8518000254420557, + "ewc_loss": 0.051246486604213715, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002072891074931249, + "grad_norm": 5.801697254180908, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8577916026115417, + "num_tokens": 255494088.0, + "step": 6696 + }, + { + "epoch": 0.8519272357206462, + "ewc_loss": 0.051273878663778305, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020756300364155322, + "grad_norm": 5.895500659942627, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.867534339427948, + "num_tokens": 255529785.0, + "step": 6697 + }, + { + "epoch": 0.8520544459992367, + "ewc_loss": 0.051299333572387695, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002078175311908126, + "grad_norm": 5.879678726196289, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8460068702697754, + "num_tokens": 255566675.0, + "step": 6698 + }, + { + "epoch": 0.8521816562778273, + "ewc_loss": 0.05127764493227005, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020760066399816424, + "grad_norm": 5.861950397491455, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8643133044242859, + "num_tokens": 255606502.0, + "step": 6699 + }, + { + "epoch": 0.8523088665564178, + "ewc_loss": 0.05110097676515579, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020705469069071114, + "grad_norm": 5.847079753875732, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8507142066955566, + "num_tokens": 255649179.0, + "step": 6700 + }, + { + "epoch": 0.8524360768350082, + "ewc_loss": 0.05116870254278183, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020773193682543933, + "grad_norm": 5.923542499542236, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.859531044960022, + "num_tokens": 255688190.0, + "step": 6701 + }, + { + "epoch": 0.8525632871135987, + "ewc_loss": 0.05109019577503204, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020694687555078417, + "grad_norm": 5.796917915344238, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8580039739608765, + "num_tokens": 255730401.0, + "step": 6702 + }, + { + "epoch": 0.8526904973921893, + "ewc_loss": 0.05112171545624733, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020726208458654583, + "grad_norm": 5.861969470977783, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8548513650894165, + "num_tokens": 255767881.0, + "step": 6703 + }, + { + "epoch": 0.8528177076707798, + "ewc_loss": 0.05114952474832535, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002075401571346447, + "grad_norm": 5.844152927398682, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8505095839500427, + "num_tokens": 255808347.0, + "step": 6704 + }, + { + "epoch": 0.8529449179493703, + "ewc_loss": 0.05123650282621384, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002071892231469974, + "grad_norm": 5.856010913848877, + "learning_rate": 1e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8412261605262756, + "num_tokens": 255849538.0, + "step": 6705 + }, + { + "epoch": 0.8530721282279609, + "ewc_loss": 0.05112302303314209, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020727516675833613, + "grad_norm": 5.850027561187744, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8702434301376343, + "num_tokens": 255890228.0, + "step": 6706 + }, + { + "epoch": 0.8531993385065513, + "ewc_loss": 0.051154665648937225, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002075915690511465, + "grad_norm": 5.859004974365234, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8588977456092834, + "num_tokens": 255930189.0, + "step": 6707 + }, + { + "epoch": 0.8533265487851418, + "ewc_loss": 0.051124006509780884, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020728496019728482, + "grad_norm": 5.842858791351318, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8656234741210938, + "num_tokens": 255968310.0, + "step": 6708 + }, + { + "epoch": 0.8534537590637323, + "ewc_loss": 0.0512114092707634, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020815899188164622, + "grad_norm": 5.887278079986572, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8704255819320679, + "num_tokens": 256005872.0, + "step": 6709 + }, + { + "epoch": 0.8535809693423229, + "ewc_loss": 0.05114266276359558, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002074715303024277, + "grad_norm": 5.869166851043701, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8646427989006042, + "num_tokens": 256045374.0, + "step": 6710 + }, + { + "epoch": 0.8537081796209134, + "ewc_loss": 0.05116560310125351, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020770095579791814, + "grad_norm": 5.863225936889648, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8619858026504517, + "num_tokens": 256084874.0, + "step": 6711 + }, + { + "epoch": 0.8538353898995039, + "ewc_loss": 0.05116970092058182, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020774194854311645, + "grad_norm": 5.941920757293701, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8523259162902832, + "num_tokens": 256122251.0, + "step": 6712 + }, + { + "epoch": 0.8539626001780944, + "ewc_loss": 0.05116434395313263, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002076883683912456, + "grad_norm": 5.854072570800781, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8578130006790161, + "num_tokens": 256165025.0, + "step": 6713 + }, + { + "epoch": 0.8540898104566849, + "ewc_loss": 0.05118834227323532, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020792832947336137, + "grad_norm": 5.920072555541992, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.865554928779602, + "num_tokens": 256202220.0, + "step": 6714 + }, + { + "epoch": 0.8542170207352754, + "ewc_loss": 0.05116318166255951, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002076767268590629, + "grad_norm": 5.9176106452941895, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8615052700042725, + "num_tokens": 256234036.0, + "step": 6715 + }, + { + "epoch": 0.8543442310138659, + "ewc_loss": 0.051154427230358124, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020758918253704906, + "grad_norm": 5.888744354248047, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8577824831008911, + "num_tokens": 256275253.0, + "step": 6716 + }, + { + "epoch": 0.8544714412924564, + "ewc_loss": 0.051161400973796844, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020765894441865385, + "grad_norm": 5.932394981384277, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.856690526008606, + "num_tokens": 256311296.0, + "step": 6717 + }, + { + "epoch": 0.854598651571047, + "ewc_loss": 0.05108880251646042, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002069329348159954, + "grad_norm": 5.841804504394531, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8642176389694214, + "num_tokens": 256348149.0, + "step": 6718 + }, + { + "epoch": 0.8547258618496375, + "ewc_loss": 0.05117662250995636, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020781115745194256, + "grad_norm": 5.850199222564697, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8734946250915527, + "num_tokens": 256387861.0, + "step": 6719 + }, + { + "epoch": 0.8548530721282279, + "ewc_loss": 0.051124244928359985, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020728737581521273, + "grad_norm": 5.807706356048584, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8692336082458496, + "num_tokens": 256427131.0, + "step": 6720 + }, + { + "epoch": 0.8549802824068184, + "ewc_loss": 0.05106796324253082, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00020794523879885674, + "grad_norm": 5.852182865142822, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8636341094970703, + "num_tokens": 256465041.0, + "step": 6721 + }, + { + "epoch": 0.855107492685409, + "ewc_loss": 0.051213689148426056, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020818179473280907, + "grad_norm": 5.882847785949707, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8507799506187439, + "num_tokens": 256502576.0, + "step": 6722 + }, + { + "epoch": 0.8552347029639995, + "ewc_loss": 0.05116904899477959, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002089561166940257, + "grad_norm": 5.903896808624268, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8622280359268188, + "num_tokens": 256540493.0, + "step": 6723 + }, + { + "epoch": 0.85536191324259, + "ewc_loss": 0.051159001886844635, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002076349192066118, + "grad_norm": 5.900959014892578, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.844597339630127, + "num_tokens": 256578115.0, + "step": 6724 + }, + { + "epoch": 0.8554891235211806, + "ewc_loss": 0.05117355287075043, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002077804529108107, + "grad_norm": 5.8654632568359375, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8677946329116821, + "num_tokens": 256615957.0, + "step": 6725 + }, + { + "epoch": 0.855616333799771, + "ewc_loss": 0.051139555871486664, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00020866116392426193, + "grad_norm": 5.9388275146484375, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8630914092063904, + "num_tokens": 256649972.0, + "step": 6726 + }, + { + "epoch": 0.8557435440783615, + "ewc_loss": 0.05116590857505798, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020770399714820087, + "grad_norm": 5.855681896209717, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8665457963943481, + "num_tokens": 256686683.0, + "step": 6727 + }, + { + "epoch": 0.855870754356952, + "ewc_loss": 0.05124124139547348, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020845732069574296, + "grad_norm": 5.907064914703369, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8580944538116455, + "num_tokens": 256723587.0, + "step": 6728 + }, + { + "epoch": 0.8559979646355426, + "ewc_loss": 0.051262132823467255, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002086662279907614, + "grad_norm": 5.857685089111328, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8599011301994324, + "num_tokens": 256765245.0, + "step": 6729 + }, + { + "epoch": 0.8561251749141331, + "ewc_loss": 0.05124705284833908, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002085154555970803, + "grad_norm": 5.830867290496826, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8867794871330261, + "num_tokens": 256806930.0, + "step": 6730 + }, + { + "epoch": 0.8562523851927236, + "ewc_loss": 0.05132558196783066, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020930076425429434, + "grad_norm": 5.878859996795654, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8587793111801147, + "num_tokens": 256844568.0, + "step": 6731 + }, + { + "epoch": 0.856379595471314, + "ewc_loss": 0.05126989632844925, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002087438915623352, + "grad_norm": 5.9397406578063965, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8580820560455322, + "num_tokens": 256877483.0, + "step": 6732 + }, + { + "epoch": 0.8565068057499046, + "ewc_loss": 0.05129582807421684, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002090032066917047, + "grad_norm": 5.8694305419921875, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8639271259307861, + "num_tokens": 256921139.0, + "step": 6733 + }, + { + "epoch": 0.8566340160284951, + "ewc_loss": 0.05136317387223244, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002096766693284735, + "grad_norm": 5.963995456695557, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8598145246505737, + "num_tokens": 256957861.0, + "step": 6734 + }, + { + "epoch": 0.8567612263070856, + "ewc_loss": 0.051308587193489075, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020913078333251178, + "grad_norm": 5.839789390563965, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8618627786636353, + "num_tokens": 256998582.0, + "step": 6735 + }, + { + "epoch": 0.8568884365856761, + "ewc_loss": 0.05137726664543152, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.000209817590075545, + "grad_norm": 5.933347702026367, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8585529327392578, + "num_tokens": 257037638.0, + "step": 6736 + }, + { + "epoch": 0.8570156468642667, + "ewc_loss": 0.05131348967552185, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020917979418300092, + "grad_norm": 5.876072406768799, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8624618053436279, + "num_tokens": 257074427.0, + "step": 6737 + }, + { + "epoch": 0.8571428571428571, + "ewc_loss": 0.0513523630797863, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020956854859832674, + "grad_norm": 5.921148777008057, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8628998398780823, + "num_tokens": 257113826.0, + "step": 6738 + }, + { + "epoch": 0.8572700674214476, + "ewc_loss": 0.05122571811079979, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00020952279737684876, + "grad_norm": 5.986008167266846, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8459768891334534, + "num_tokens": 257152387.0, + "step": 6739 + }, + { + "epoch": 0.8573972777000382, + "ewc_loss": 0.05123111605644226, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00020957677043043077, + "grad_norm": 5.852993488311768, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8617814779281616, + "num_tokens": 257192083.0, + "step": 6740 + }, + { + "epoch": 0.8575244879786287, + "ewc_loss": 0.05124916136264801, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00020975721417926252, + "grad_norm": 5.9072465896606445, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8517207503318787, + "num_tokens": 257235425.0, + "step": 6741 + }, + { + "epoch": 0.8576516982572192, + "ewc_loss": 0.0513053722679615, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020909865270368755, + "grad_norm": 5.903731346130371, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8556797504425049, + "num_tokens": 257268621.0, + "step": 6742 + }, + { + "epoch": 0.8577789085358097, + "ewc_loss": 0.0512893944978714, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002101595455314964, + "grad_norm": 5.915837287902832, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8643969893455505, + "num_tokens": 257306909.0, + "step": 6743 + }, + { + "epoch": 0.8579061188144002, + "ewc_loss": 0.05135398358106613, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020958477398380637, + "grad_norm": 5.910439491271973, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8744034171104431, + "num_tokens": 257343388.0, + "step": 6744 + }, + { + "epoch": 0.8580333290929907, + "ewc_loss": 0.05138233304023743, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020986824529245496, + "grad_norm": 5.917835235595703, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8650901913642883, + "num_tokens": 257377172.0, + "step": 6745 + }, + { + "epoch": 0.8581605393715812, + "ewc_loss": 0.051147058606147766, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020995689556002617, + "grad_norm": 5.950096130371094, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8527361154556274, + "num_tokens": 257410312.0, + "step": 6746 + }, + { + "epoch": 0.8582877496501717, + "ewc_loss": 0.05113974213600159, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002098837576340884, + "grad_norm": 5.895540237426758, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8630200624465942, + "num_tokens": 257449919.0, + "step": 6747 + }, + { + "epoch": 0.8584149599287623, + "ewc_loss": 0.05123336613178253, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002095993113471195, + "grad_norm": 5.87891960144043, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8711682558059692, + "num_tokens": 257488296.0, + "step": 6748 + }, + { + "epoch": 0.8585421702073528, + "ewc_loss": 0.051296480000019073, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021023041335865855, + "grad_norm": 5.9561591148376465, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8689233064651489, + "num_tokens": 257524678.0, + "step": 6749 + }, + { + "epoch": 0.8586693804859432, + "ewc_loss": 0.051336005330085754, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020940495596732944, + "grad_norm": 5.889946937561035, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8653209805488586, + "num_tokens": 257559060.0, + "step": 6750 + }, + { + "epoch": 0.8587965907645337, + "ewc_loss": 0.051409587264060974, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021014080266468227, + "grad_norm": 5.908088684082031, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.869702935218811, + "num_tokens": 257594340.0, + "step": 6751 + }, + { + "epoch": 0.8589238010431243, + "ewc_loss": 0.05141325667500496, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021017748804297298, + "grad_norm": 5.909731388092041, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.872078537940979, + "num_tokens": 257627936.0, + "step": 6752 + }, + { + "epoch": 0.8590510113217148, + "ewc_loss": 0.05136948823928833, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020973978098481894, + "grad_norm": 5.949559688568115, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8575718402862549, + "num_tokens": 257660311.0, + "step": 6753 + }, + { + "epoch": 0.8591782216003053, + "ewc_loss": 0.051392920315265656, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020997412502765656, + "grad_norm": 5.891927242279053, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8536472916603088, + "num_tokens": 257697496.0, + "step": 6754 + }, + { + "epoch": 0.8593054318788959, + "ewc_loss": 0.051082856953144073, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002093149087158963, + "grad_norm": 5.909544467926025, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8420009016990662, + "num_tokens": 257737242.0, + "step": 6755 + }, + { + "epoch": 0.8594326421574863, + "ewc_loss": 0.05114737153053284, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020996005332563072, + "grad_norm": 5.913772106170654, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8617644309997559, + "num_tokens": 257772946.0, + "step": 6756 + }, + { + "epoch": 0.8595598524360768, + "ewc_loss": 0.05113649368286133, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020985127775929868, + "grad_norm": 5.94005012512207, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8564139604568481, + "num_tokens": 257807198.0, + "step": 6757 + }, + { + "epoch": 0.8596870627146673, + "ewc_loss": 0.05130531266331673, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020909805607516319, + "grad_norm": 5.845605850219727, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8659622669219971, + "num_tokens": 257848618.0, + "step": 6758 + }, + { + "epoch": 0.8598142729932579, + "ewc_loss": 0.05110517144203186, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020953801868017763, + "grad_norm": 5.869448661804199, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8720688819885254, + "num_tokens": 257887805.0, + "step": 6759 + }, + { + "epoch": 0.8599414832718484, + "ewc_loss": 0.05137091875076294, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020975408551748842, + "grad_norm": 5.8574700355529785, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8669058084487915, + "num_tokens": 257929276.0, + "step": 6760 + }, + { + "epoch": 0.8600686935504389, + "ewc_loss": 0.051395658403635025, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021000151173211634, + "grad_norm": 6.000032901763916, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8544242978096008, + "num_tokens": 257958629.0, + "step": 6761 + }, + { + "epoch": 0.8601959038290294, + "ewc_loss": 0.05135659500956535, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020961088011972606, + "grad_norm": 5.971367359161377, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8567874431610107, + "num_tokens": 257990303.0, + "step": 6762 + }, + { + "epoch": 0.8603231141076199, + "ewc_loss": 0.051317811012268066, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020922301337122917, + "grad_norm": 5.925021171569824, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8436077237129211, + "num_tokens": 258028704.0, + "step": 6763 + }, + { + "epoch": 0.8604503243862104, + "ewc_loss": 0.05130131542682648, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020905808196403086, + "grad_norm": 5.914074420928955, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8695293068885803, + "num_tokens": 258062757.0, + "step": 6764 + }, + { + "epoch": 0.8605775346648009, + "ewc_loss": 0.05135359615087509, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020958085951860994, + "grad_norm": 6.0928053855896, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8596377372741699, + "num_tokens": 258098574.0, + "step": 6765 + }, + { + "epoch": 0.8607047449433914, + "ewc_loss": 0.051378726959228516, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020983217109460384, + "grad_norm": 5.880772590637207, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8700575828552246, + "num_tokens": 258130937.0, + "step": 6766 + }, + { + "epoch": 0.860831955221982, + "ewc_loss": 0.0513964481651783, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021000939887017012, + "grad_norm": 5.97014856338501, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8629883527755737, + "num_tokens": 258165142.0, + "step": 6767 + }, + { + "epoch": 0.8609591655005725, + "ewc_loss": 0.05135447531938553, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020958964887540787, + "grad_norm": 5.861516952514648, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8765594959259033, + "num_tokens": 258203986.0, + "step": 6768 + }, + { + "epoch": 0.8610863757791629, + "ewc_loss": 0.05141143873333931, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021015931270085275, + "grad_norm": 5.962527751922607, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8617260456085205, + "num_tokens": 258238692.0, + "step": 6769 + }, + { + "epoch": 0.8612135860577534, + "ewc_loss": 0.05136021971702576, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020964709983672947, + "grad_norm": 5.894680500030518, + "learning_rate": 1e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8403483033180237, + "num_tokens": 258280942.0, + "step": 6770 + }, + { + "epoch": 0.861340796336344, + "ewc_loss": 0.051435574889183044, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021040067076683044, + "grad_norm": 5.976778984069824, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.862855076789856, + "num_tokens": 258318835.0, + "step": 6771 + }, + { + "epoch": 0.8614680066149345, + "ewc_loss": 0.05137497931718826, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020979472901672125, + "grad_norm": 5.909780025482178, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.851859986782074, + "num_tokens": 258359461.0, + "step": 6772 + }, + { + "epoch": 0.861595216893525, + "ewc_loss": 0.05133763700723648, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002094212977681309, + "grad_norm": 5.937106609344482, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8568704724311829, + "num_tokens": 258394582.0, + "step": 6773 + }, + { + "epoch": 0.8617224271721156, + "ewc_loss": 0.05137103050947189, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.000209755206014961, + "grad_norm": 5.916923999786377, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8635078072547913, + "num_tokens": 258438526.0, + "step": 6774 + }, + { + "epoch": 0.861849637450706, + "ewc_loss": 0.0512990728020668, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020903565746266395, + "grad_norm": 5.985103607177734, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8471440076828003, + "num_tokens": 258479332.0, + "step": 6775 + }, + { + "epoch": 0.8619768477292965, + "ewc_loss": 0.05127095431089401, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020875447080470622, + "grad_norm": 5.926796913146973, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8500019311904907, + "num_tokens": 258513202.0, + "step": 6776 + }, + { + "epoch": 0.862104058007887, + "ewc_loss": 0.051330193877220154, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020934687927365303, + "grad_norm": 6.003878116607666, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8619363903999329, + "num_tokens": 258549145.0, + "step": 6777 + }, + { + "epoch": 0.8622312682864776, + "ewc_loss": 0.051249925047159195, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020854416652582586, + "grad_norm": 5.943190097808838, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8510975241661072, + "num_tokens": 258586421.0, + "step": 6778 + }, + { + "epoch": 0.8623584785650681, + "ewc_loss": 0.051213301718235016, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020817795302718878, + "grad_norm": 5.969185829162598, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8607842922210693, + "num_tokens": 258620014.0, + "step": 6779 + }, + { + "epoch": 0.8624856888436586, + "ewc_loss": 0.05124758929014206, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020852083980571479, + "grad_norm": 5.94536828994751, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8659558296203613, + "num_tokens": 258656124.0, + "step": 6780 + }, + { + "epoch": 0.862612899122249, + "ewc_loss": 0.05116795003414154, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020772444258909672, + "grad_norm": 5.928412437438965, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.849082350730896, + "num_tokens": 258692537.0, + "step": 6781 + }, + { + "epoch": 0.8627401094008396, + "ewc_loss": 0.05119280517101288, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020797298930119723, + "grad_norm": 5.8849778175354, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8632795214653015, + "num_tokens": 258732737.0, + "step": 6782 + }, + { + "epoch": 0.8628673196794301, + "ewc_loss": 0.0512581467628479, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002086264139506966, + "grad_norm": 6.034162998199463, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.847984790802002, + "num_tokens": 258770634.0, + "step": 6783 + }, + { + "epoch": 0.8629945299580206, + "ewc_loss": 0.051224932074546814, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00020829426648560911, + "grad_norm": 12.765661239624023, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8598926067352295, + "num_tokens": 258809643.0, + "step": 6784 + }, + { + "epoch": 0.8631217402366111, + "ewc_loss": 0.05960007756948471, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002920457045547664, + "grad_norm": 7.212515830993652, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8653748035430908, + "num_tokens": 258845336.0, + "step": 6785 + }, + { + "epoch": 0.8632489505152017, + "ewc_loss": 0.04947652667760849, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00019081019854638726, + "grad_norm": 5.364492893218994, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8626054525375366, + "num_tokens": 258882206.0, + "step": 6786 + }, + { + "epoch": 0.8633761607937921, + "ewc_loss": 0.05315575748682022, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00022760247520636767, + "grad_norm": 6.547423839569092, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8674250841140747, + "num_tokens": 258918661.0, + "step": 6787 + }, + { + "epoch": 0.8635033710723826, + "ewc_loss": 0.05245393514633179, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00022058424656279385, + "grad_norm": 5.910607814788818, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.855027437210083, + "num_tokens": 258957452.0, + "step": 6788 + }, + { + "epoch": 0.8636305813509731, + "ewc_loss": 0.051921430975198746, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002152592351194471, + "grad_norm": 6.1287617683410645, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8513329029083252, + "num_tokens": 259003507.0, + "step": 6789 + }, + { + "epoch": 0.8637577916295637, + "ewc_loss": 0.0521833598613739, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.000217878507100977, + "grad_norm": 6.046443939208984, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8554257154464722, + "num_tokens": 259042284.0, + "step": 6790 + }, + { + "epoch": 0.8638850019081542, + "ewc_loss": 0.0517277792096138, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021332270989660174, + "grad_norm": 6.043706893920898, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8735777139663696, + "num_tokens": 259082929.0, + "step": 6791 + }, + { + "epoch": 0.8640122121867447, + "ewc_loss": 0.051877036690711975, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002148152852896601, + "grad_norm": 5.980116844177246, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8756221532821655, + "num_tokens": 259125920.0, + "step": 6792 + }, + { + "epoch": 0.8641394224653351, + "ewc_loss": 0.051647040992975235, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021251532598398626, + "grad_norm": 6.113431930541992, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.861222505569458, + "num_tokens": 259163140.0, + "step": 6793 + }, + { + "epoch": 0.8642666327439257, + "ewc_loss": 0.051608651876449585, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021213144646026194, + "grad_norm": 5.987191677093506, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8590172529220581, + "num_tokens": 259199038.0, + "step": 6794 + }, + { + "epoch": 0.8643938430225162, + "ewc_loss": 0.05150747671723366, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021111969545017928, + "grad_norm": 12.933067321777344, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8587149977684021, + "num_tokens": 259235713.0, + "step": 6795 + }, + { + "epoch": 0.8645210533011067, + "ewc_loss": 0.06096452474594116, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0003056901623494923, + "grad_norm": 7.390807628631592, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8502512574195862, + "num_tokens": 259278314.0, + "step": 6796 + }, + { + "epoch": 0.8646482635796973, + "ewc_loss": 0.0496739000082016, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00019278393301647156, + "grad_norm": 5.385429382324219, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8688530921936035, + "num_tokens": 259313039.0, + "step": 6797 + }, + { + "epoch": 0.8647754738582878, + "ewc_loss": 0.0538024976849556, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00023406988475471735, + "grad_norm": 6.691562652587891, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8671807646751404, + "num_tokens": 259348845.0, + "step": 6798 + }, + { + "epoch": 0.8649026841368782, + "ewc_loss": 0.053249746561050415, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00022854239796288311, + "grad_norm": 5.991283893585205, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8554881811141968, + "num_tokens": 259387345.0, + "step": 6799 + }, + { + "epoch": 0.8650298944154687, + "ewc_loss": 0.052251119166612625, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002185561170335859, + "grad_norm": 6.203813076019287, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8597092628479004, + "num_tokens": 259427835.0, + "step": 6800 + }, + { + "epoch": 0.8651571046940593, + "ewc_loss": 0.052649304270744324, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00022253795759752393, + "grad_norm": 6.156069278717041, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8639811873435974, + "num_tokens": 259460453.0, + "step": 6801 + }, + { + "epoch": 0.8652843149726498, + "ewc_loss": 0.05198247730731964, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021586970251519233, + "grad_norm": 6.001075744628906, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8668367266654968, + "num_tokens": 259499296.0, + "step": 6802 + }, + { + "epoch": 0.8654115252512403, + "ewc_loss": 0.052136749029159546, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021741242380812764, + "grad_norm": 6.132353782653809, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8570428490638733, + "num_tokens": 259537649.0, + "step": 6803 + }, + { + "epoch": 0.8655387355298308, + "ewc_loss": 0.0518539734184742, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002145846519852057, + "grad_norm": 5.987640380859375, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8525264263153076, + "num_tokens": 259583136.0, + "step": 6804 + }, + { + "epoch": 0.8656659458084213, + "ewc_loss": 0.05185767263174057, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.000214621628401801, + "grad_norm": 6.045386791229248, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8648786544799805, + "num_tokens": 259620422.0, + "step": 6805 + }, + { + "epoch": 0.8657931560870118, + "ewc_loss": 0.05170108377933502, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021305575501173735, + "grad_norm": 5.974460124969482, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8491503596305847, + "num_tokens": 259663948.0, + "step": 6806 + }, + { + "epoch": 0.8659203663656023, + "ewc_loss": 0.05167725682258606, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021281748195178807, + "grad_norm": 5.995960235595703, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8614799380302429, + "num_tokens": 259701986.0, + "step": 6807 + }, + { + "epoch": 0.8660475766441929, + "ewc_loss": 0.05136415362358093, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021212785213720053, + "grad_norm": 5.958008289337158, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8750054836273193, + "num_tokens": 259740684.0, + "step": 6808 + }, + { + "epoch": 0.8661747869227834, + "ewc_loss": 0.05135080963373184, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021199444017838687, + "grad_norm": 5.99075984954834, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.843233585357666, + "num_tokens": 259781659.0, + "step": 6809 + }, + { + "epoch": 0.8663019972013739, + "ewc_loss": 0.05123958736658096, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021088220819365233, + "grad_norm": 5.986135482788086, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8554089665412903, + "num_tokens": 259818554.0, + "step": 6810 + }, + { + "epoch": 0.8664292074799644, + "ewc_loss": 0.05149426311254501, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002109875640599057, + "grad_norm": 5.97733211517334, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8618065118789673, + "num_tokens": 259856652.0, + "step": 6811 + }, + { + "epoch": 0.8665564177585549, + "ewc_loss": 0.05142281949520111, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021027309412602335, + "grad_norm": 5.932135581970215, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8785977363586426, + "num_tokens": 259898255.0, + "step": 6812 + }, + { + "epoch": 0.8666836280371454, + "ewc_loss": 0.051412977278232574, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021017470862716436, + "grad_norm": 5.977453231811523, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8653559684753418, + "num_tokens": 259935305.0, + "step": 6813 + }, + { + "epoch": 0.8668108383157359, + "ewc_loss": 0.051180534064769745, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021029166236985475, + "grad_norm": 5.938925743103027, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8642109632492065, + "num_tokens": 259975245.0, + "step": 6814 + }, + { + "epoch": 0.8669380485943264, + "ewc_loss": 0.05110377073287964, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002095240488415584, + "grad_norm": 5.925162315368652, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8658815622329712, + "num_tokens": 260014150.0, + "step": 6815 + }, + { + "epoch": 0.867065258872917, + "ewc_loss": 0.0511639378964901, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021012569777667522, + "grad_norm": 5.962926864624023, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.851141631603241, + "num_tokens": 260054655.0, + "step": 6816 + }, + { + "epoch": 0.8671924691515075, + "ewc_loss": 0.05107127130031586, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020919906091876328, + "grad_norm": 5.973478317260742, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8603615760803223, + "num_tokens": 260091528.0, + "step": 6817 + }, + { + "epoch": 0.8673196794300979, + "ewc_loss": 0.0510031059384346, + "ewc_loss_diag": 3.0040740966796875e-05, + "ewc_loss_parallel": 0.00020973809296265244, + "grad_norm": 6.0189290046691895, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8759297132492065, + "num_tokens": 260123069.0, + "step": 6818 + }, + { + "epoch": 0.8674468897086884, + "ewc_loss": 0.051033783704042435, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020882415992673486, + "grad_norm": 6.032655239105225, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8453168869018555, + "num_tokens": 260154482.0, + "step": 6819 + }, + { + "epoch": 0.867574099987279, + "ewc_loss": 0.05097833275794983, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002082696300931275, + "grad_norm": 5.915137767791748, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8617969751358032, + "num_tokens": 260193983.0, + "step": 6820 + }, + { + "epoch": 0.8677013102658695, + "ewc_loss": 0.051045119762420654, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002089375484501943, + "grad_norm": 5.955071449279785, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8632825613021851, + "num_tokens": 260228705.0, + "step": 6821 + }, + { + "epoch": 0.86782852054446, + "ewc_loss": 0.0510893389582634, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.000209379693842493, + "grad_norm": 5.954534530639648, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.85896897315979, + "num_tokens": 260262821.0, + "step": 6822 + }, + { + "epoch": 0.8679557308230506, + "ewc_loss": 0.05109857767820358, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020947212760802358, + "grad_norm": 5.923985004425049, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8578228950500488, + "num_tokens": 260298949.0, + "step": 6823 + }, + { + "epoch": 0.868082941101641, + "ewc_loss": 0.05119264870882034, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021041282161604613, + "grad_norm": 6.049873352050781, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8681865930557251, + "num_tokens": 260340028.0, + "step": 6824 + }, + { + "epoch": 0.8682101513802315, + "ewc_loss": 0.05109083279967308, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020939465321134776, + "grad_norm": 5.86676025390625, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8597652912139893, + "num_tokens": 260381328.0, + "step": 6825 + }, + { + "epoch": 0.868337361658822, + "ewc_loss": 0.05115945264697075, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021008086332585663, + "grad_norm": 5.968444347381592, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8613941669464111, + "num_tokens": 260423421.0, + "step": 6826 + }, + { + "epoch": 0.8684645719374126, + "ewc_loss": 0.05119360238313675, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021042233856860548, + "grad_norm": 5.9374260902404785, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8597103357315063, + "num_tokens": 260461985.0, + "step": 6827 + }, + { + "epoch": 0.8685917822160031, + "ewc_loss": 0.051102619618177414, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00020951252372469753, + "grad_norm": 5.929557800292969, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.845253050327301, + "num_tokens": 260503041.0, + "step": 6828 + }, + { + "epoch": 0.8687189924945936, + "ewc_loss": 0.05121072009205818, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.000210593527299352, + "grad_norm": 5.96715784072876, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8682686686515808, + "num_tokens": 260545553.0, + "step": 6829 + }, + { + "epoch": 0.868846202773184, + "ewc_loss": 0.05116572231054306, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021014356752857566, + "grad_norm": 6.039784908294678, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8670088648796082, + "num_tokens": 260577160.0, + "step": 6830 + }, + { + "epoch": 0.8689734130517746, + "ewc_loss": 0.05115649104118347, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021005126473028213, + "grad_norm": 5.877048492431641, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8723233938217163, + "num_tokens": 260614748.0, + "step": 6831 + }, + { + "epoch": 0.8691006233303651, + "ewc_loss": 0.051180534064769745, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002102916914736852, + "grad_norm": 6.012442111968994, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8613637685775757, + "num_tokens": 260651762.0, + "step": 6832 + }, + { + "epoch": 0.8692278336089556, + "ewc_loss": 0.05113266408443451, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002098129625665024, + "grad_norm": 5.889106273651123, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8490492105484009, + "num_tokens": 260694438.0, + "step": 6833 + }, + { + "epoch": 0.8693550438875461, + "ewc_loss": 0.051199913024902344, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021048547932878137, + "grad_norm": 6.02011251449585, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8689426183700562, + "num_tokens": 260730256.0, + "step": 6834 + }, + { + "epoch": 0.8694822541661367, + "ewc_loss": 0.05113677307963371, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002098540571751073, + "grad_norm": 5.94621467590332, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8410131335258484, + "num_tokens": 260766190.0, + "step": 6835 + }, + { + "epoch": 0.8696094644447271, + "ewc_loss": 0.05119746923446655, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021046101755928248, + "grad_norm": 5.946899890899658, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8738440871238708, + "num_tokens": 260801733.0, + "step": 6836 + }, + { + "epoch": 0.8697366747233176, + "ewc_loss": 0.051260195672512054, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021108829241711646, + "grad_norm": 5.926591873168945, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8720728158950806, + "num_tokens": 260843461.0, + "step": 6837 + }, + { + "epoch": 0.8698638850019081, + "ewc_loss": 0.05117426812648773, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002102290018228814, + "grad_norm": 6.002136707305908, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8596286177635193, + "num_tokens": 260875006.0, + "step": 6838 + }, + { + "epoch": 0.8699910952804987, + "ewc_loss": 0.05120474472641945, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021053377713542432, + "grad_norm": 5.954020023345947, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8541746139526367, + "num_tokens": 260917553.0, + "step": 6839 + }, + { + "epoch": 0.8701183055590892, + "ewc_loss": 0.051204562187194824, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021053195814602077, + "grad_norm": 5.962853908538818, + "learning_rate": 1e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8363458514213562, + "num_tokens": 260955293.0, + "step": 6840 + }, + { + "epoch": 0.8702455158376797, + "ewc_loss": 0.05129139870405197, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002101795980706811, + "grad_norm": 6.024990558624268, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8609969019889832, + "num_tokens": 260983966.0, + "step": 6841 + }, + { + "epoch": 0.8703727261162701, + "ewc_loss": 0.0511680543422699, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021016686514485627, + "grad_norm": 5.9749650955200195, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.873388409614563, + "num_tokens": 261019434.0, + "step": 6842 + }, + { + "epoch": 0.8704999363948607, + "ewc_loss": 0.051172979176044464, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021021613792981952, + "grad_norm": 5.941502571105957, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8482571244239807, + "num_tokens": 261053761.0, + "step": 6843 + }, + { + "epoch": 0.8706271466734512, + "ewc_loss": 0.05121123045682907, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002105986059177667, + "grad_norm": 5.937812328338623, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8509531617164612, + "num_tokens": 261093490.0, + "step": 6844 + }, + { + "epoch": 0.8707543569520417, + "ewc_loss": 0.05117823928594589, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021026874310337007, + "grad_norm": 5.957989692687988, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8612384796142578, + "num_tokens": 261129518.0, + "step": 6845 + }, + { + "epoch": 0.8708815672306323, + "ewc_loss": 0.051225218921899796, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002107385080307722, + "grad_norm": 5.9474029541015625, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8438518643379211, + "num_tokens": 261164282.0, + "step": 6846 + }, + { + "epoch": 0.8710087775092228, + "ewc_loss": 0.051205478608608246, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021054112585261464, + "grad_norm": 5.933433532714844, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8396874666213989, + "num_tokens": 261205055.0, + "step": 6847 + }, + { + "epoch": 0.8711359877878132, + "ewc_loss": 0.05124376714229584, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021092397219035774, + "grad_norm": 5.964478969573975, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8539626598358154, + "num_tokens": 261242329.0, + "step": 6848 + }, + { + "epoch": 0.8712631980664037, + "ewc_loss": 0.05124279856681824, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021091433882247657, + "grad_norm": 5.9445343017578125, + "learning_rate": 1e-06, + "loss": 0.5552, + "mean_token_accuracy": 0.8328330516815186, + "num_tokens": 261280330.0, + "step": 6849 + }, + { + "epoch": 0.8713904083449943, + "ewc_loss": 0.051240257918834686, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021088888752274215, + "grad_norm": 5.903339385986328, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8711766600608826, + "num_tokens": 261318232.0, + "step": 6850 + }, + { + "epoch": 0.8715176186235848, + "ewc_loss": 0.051289934664964676, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021138567535672337, + "grad_norm": 6.0307207107543945, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8769629597663879, + "num_tokens": 261351483.0, + "step": 6851 + }, + { + "epoch": 0.8716448289021753, + "ewc_loss": 0.0512951985001564, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021143829508218914, + "grad_norm": 5.941176891326904, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8622390031814575, + "num_tokens": 261387058.0, + "step": 6852 + }, + { + "epoch": 0.8717720391807658, + "ewc_loss": 0.05138035863637924, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021106920030433685, + "grad_norm": 5.8918070793151855, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8760402202606201, + "num_tokens": 261430110.0, + "step": 6853 + }, + { + "epoch": 0.8718992494593563, + "ewc_loss": 0.05125163495540619, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021100268349982798, + "grad_norm": 5.919037342071533, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8675474524497986, + "num_tokens": 261470707.0, + "step": 6854 + }, + { + "epoch": 0.8720264597379468, + "ewc_loss": 0.05127793923020363, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021126572391949594, + "grad_norm": 5.969864845275879, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8681613206863403, + "num_tokens": 261507146.0, + "step": 6855 + }, + { + "epoch": 0.8721536700165373, + "ewc_loss": 0.05126846581697464, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021117097639944404, + "grad_norm": 5.944339275360107, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8447147011756897, + "num_tokens": 261549723.0, + "step": 6856 + }, + { + "epoch": 0.8722808802951278, + "ewc_loss": 0.051272280514240265, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021120914607308805, + "grad_norm": 5.926839351654053, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8659944534301758, + "num_tokens": 261591030.0, + "step": 6857 + }, + { + "epoch": 0.8724080905737184, + "ewc_loss": 0.05131968855857849, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021168318926356733, + "grad_norm": 6.011070728302002, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8700031042098999, + "num_tokens": 261623919.0, + "step": 6858 + }, + { + "epoch": 0.8725353008523089, + "ewc_loss": 0.05130073428153992, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021149366511963308, + "grad_norm": 6.002387046813965, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.876718282699585, + "num_tokens": 261655277.0, + "step": 6859 + }, + { + "epoch": 0.8726625111308994, + "ewc_loss": 0.05128049850463867, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021129129163455218, + "grad_norm": 5.9971771240234375, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8479671478271484, + "num_tokens": 261697462.0, + "step": 6860 + }, + { + "epoch": 0.8727897214094898, + "ewc_loss": 0.05126818269491196, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002111681387759745, + "grad_norm": 5.9765400886535645, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8571854829788208, + "num_tokens": 261739134.0, + "step": 6861 + }, + { + "epoch": 0.8729169316880804, + "ewc_loss": 0.05120345950126648, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002105209423461929, + "grad_norm": 6.000040531158447, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8642547130584717, + "num_tokens": 261776295.0, + "step": 6862 + }, + { + "epoch": 0.8730441419666709, + "ewc_loss": 0.05121868476271629, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021067318448331207, + "grad_norm": 6.039817810058594, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8631041049957275, + "num_tokens": 261812388.0, + "step": 6863 + }, + { + "epoch": 0.8731713522452614, + "ewc_loss": 0.05116325616836548, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021011887292843312, + "grad_norm": 5.949740886688232, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8626567125320435, + "num_tokens": 261855537.0, + "step": 6864 + }, + { + "epoch": 0.873298562523852, + "ewc_loss": 0.051234740763902664, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021083373576402664, + "grad_norm": 5.936919689178467, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8693779706954956, + "num_tokens": 261899922.0, + "step": 6865 + }, + { + "epoch": 0.8734257728024425, + "ewc_loss": 0.05117993801832199, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021028572518844157, + "grad_norm": 5.999351978302002, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8724164366722107, + "num_tokens": 261934475.0, + "step": 6866 + }, + { + "epoch": 0.8735529830810329, + "ewc_loss": 0.05115533620119095, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021003966685384512, + "grad_norm": 5.983355522155762, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8612134456634521, + "num_tokens": 261973395.0, + "step": 6867 + }, + { + "epoch": 0.8736801933596234, + "ewc_loss": 0.05120745673775673, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021056088735349476, + "grad_norm": 5.999051570892334, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8448231816291809, + "num_tokens": 262011644.0, + "step": 6868 + }, + { + "epoch": 0.873807403638214, + "ewc_loss": 0.05119172856211662, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021040361025370657, + "grad_norm": 6.0098724365234375, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8560559153556824, + "num_tokens": 262044038.0, + "step": 6869 + }, + { + "epoch": 0.8739346139168045, + "ewc_loss": 0.05119030177593231, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.000210389363928698, + "grad_norm": 5.991075038909912, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8605221509933472, + "num_tokens": 262081023.0, + "step": 6870 + }, + { + "epoch": 0.874061824195395, + "ewc_loss": 0.05120856687426567, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021057200501672924, + "grad_norm": 5.912307262420654, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8530104756355286, + "num_tokens": 262123981.0, + "step": 6871 + }, + { + "epoch": 0.8741890344739855, + "ewc_loss": 0.051242150366306305, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021090781956445426, + "grad_norm": 6.0253167152404785, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8495382070541382, + "num_tokens": 262161542.0, + "step": 6872 + }, + { + "epoch": 0.874316244752576, + "ewc_loss": 0.051439784467220306, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002104427694575861, + "grad_norm": 5.946919918060303, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8502922654151917, + "num_tokens": 262202482.0, + "step": 6873 + }, + { + "epoch": 0.8744434550311665, + "ewc_loss": 0.051451846957206726, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021056340483482927, + "grad_norm": 6.0213422775268555, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8622490167617798, + "num_tokens": 262239499.0, + "step": 6874 + }, + { + "epoch": 0.874570665309757, + "ewc_loss": 0.05151950567960739, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021123998158145696, + "grad_norm": 5.917175769805908, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8620496988296509, + "num_tokens": 262281324.0, + "step": 6875 + }, + { + "epoch": 0.8746978755883476, + "ewc_loss": 0.05153109133243561, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021135582937859, + "grad_norm": 6.067903518676758, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8522444367408752, + "num_tokens": 262318772.0, + "step": 6876 + }, + { + "epoch": 0.8748250858669381, + "ewc_loss": 0.05149860307574272, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002110309578711167, + "grad_norm": 5.975678443908691, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8729509115219116, + "num_tokens": 262353264.0, + "step": 6877 + }, + { + "epoch": 0.8749522961455286, + "ewc_loss": 0.05157969146966934, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002118418487953022, + "grad_norm": 6.009011745452881, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8579813241958618, + "num_tokens": 262393360.0, + "step": 6878 + }, + { + "epoch": 0.875079506424119, + "ewc_loss": 0.05145204812288284, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021056542755104601, + "grad_norm": 5.935728549957275, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8709596395492554, + "num_tokens": 262432010.0, + "step": 6879 + }, + { + "epoch": 0.8752067167027096, + "ewc_loss": 0.051547300070524216, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021151792316231877, + "grad_norm": 6.013942241668701, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8606212735176086, + "num_tokens": 262468828.0, + "step": 6880 + }, + { + "epoch": 0.8753339269813001, + "ewc_loss": 0.05144156515598297, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002104605664499104, + "grad_norm": 5.871116638183594, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8683614730834961, + "num_tokens": 262515141.0, + "step": 6881 + }, + { + "epoch": 0.8754611372598906, + "ewc_loss": 0.05155981704592705, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021164309873711318, + "grad_norm": 5.994197845458984, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8580542802810669, + "num_tokens": 262553137.0, + "step": 6882 + }, + { + "epoch": 0.8755883475384811, + "ewc_loss": 0.051237329840660095, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021085960906930268, + "grad_norm": 5.883259296417236, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8829113245010376, + "num_tokens": 262593847.0, + "step": 6883 + }, + { + "epoch": 0.8757155578170717, + "ewc_loss": 0.05131657049059868, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002116520336130634, + "grad_norm": 5.988176345825195, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8496405482292175, + "num_tokens": 262635048.0, + "step": 6884 + }, + { + "epoch": 0.8758427680956621, + "ewc_loss": 0.05126308277249336, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021111716341692954, + "grad_norm": 5.949712753295898, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8529277443885803, + "num_tokens": 262671200.0, + "step": 6885 + }, + { + "epoch": 0.8759699783742526, + "ewc_loss": 0.051312632858753204, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021161265613045543, + "grad_norm": 6.0144944190979, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8629428744316101, + "num_tokens": 262708304.0, + "step": 6886 + }, + { + "epoch": 0.8760971886528431, + "ewc_loss": 0.05124211311340332, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.000210907484870404, + "grad_norm": 5.928956508636475, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8584544658660889, + "num_tokens": 262750680.0, + "step": 6887 + }, + { + "epoch": 0.8762243989314337, + "ewc_loss": 0.05121932178735733, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021067957277409732, + "grad_norm": 5.998834609985352, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8550971746444702, + "num_tokens": 262784845.0, + "step": 6888 + }, + { + "epoch": 0.8763516092100242, + "ewc_loss": 0.05128956958651543, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021138202282600105, + "grad_norm": 5.968232154846191, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8614009618759155, + "num_tokens": 262822646.0, + "step": 6889 + }, + { + "epoch": 0.8764788194886147, + "ewc_loss": 0.051209136843681335, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021057770936749876, + "grad_norm": 5.940492153167725, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8574495315551758, + "num_tokens": 262863663.0, + "step": 6890 + }, + { + "epoch": 0.8766060297672051, + "ewc_loss": 0.05131882429122925, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021167458908166736, + "grad_norm": 6.022772789001465, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8466209769248962, + "num_tokens": 262903726.0, + "step": 6891 + }, + { + "epoch": 0.8767332400457957, + "ewc_loss": 0.051253143697977066, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002110177738359198, + "grad_norm": 5.91620397567749, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.849787712097168, + "num_tokens": 262942748.0, + "step": 6892 + }, + { + "epoch": 0.8768604503243862, + "ewc_loss": 0.05125106871128082, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021099702280480415, + "grad_norm": 6.020005226135254, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.841089129447937, + "num_tokens": 262977639.0, + "step": 6893 + }, + { + "epoch": 0.8769876606029767, + "ewc_loss": 0.05129844695329666, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021147080406080931, + "grad_norm": 5.960165500640869, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8640174865722656, + "num_tokens": 263015970.0, + "step": 6894 + }, + { + "epoch": 0.8771148708815673, + "ewc_loss": 0.05128155276179314, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021130185632500798, + "grad_norm": 5.932642936706543, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8704172372817993, + "num_tokens": 263058618.0, + "step": 6895 + }, + { + "epoch": 0.8772420811601578, + "ewc_loss": 0.05134507268667221, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002119370474247262, + "grad_norm": 5.998800277709961, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8440718054771423, + "num_tokens": 263098628.0, + "step": 6896 + }, + { + "epoch": 0.8773692914387482, + "ewc_loss": 0.05130835622549057, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021156987349968404, + "grad_norm": 5.979968547821045, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8641167879104614, + "num_tokens": 263136793.0, + "step": 6897 + }, + { + "epoch": 0.8774965017173387, + "ewc_loss": 0.051329344511032104, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021177978487685323, + "grad_norm": 5.9856133460998535, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8627972602844238, + "num_tokens": 263170996.0, + "step": 6898 + }, + { + "epoch": 0.8776237119959293, + "ewc_loss": 0.051340147852897644, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021188781829550862, + "grad_norm": 6.0426249504089355, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8522586226463318, + "num_tokens": 263199961.0, + "step": 6899 + }, + { + "epoch": 0.8777509222745198, + "ewc_loss": 0.051329679787158966, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021178314636927098, + "grad_norm": 5.966967582702637, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8596009016036987, + "num_tokens": 263241541.0, + "step": 6900 + }, + { + "epoch": 0.8778781325531103, + "ewc_loss": 0.05134651064872742, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021195142471697181, + "grad_norm": 5.95727014541626, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8730292320251465, + "num_tokens": 263278778.0, + "step": 6901 + }, + { + "epoch": 0.8780053428317008, + "ewc_loss": 0.05130312219262123, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021151754481252283, + "grad_norm": 6.018869400024414, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8638808131217957, + "num_tokens": 263310155.0, + "step": 6902 + }, + { + "epoch": 0.8781325531102913, + "ewc_loss": 0.051310569047927856, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.00021159203606657684, + "grad_norm": 5.913046836853027, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8565431237220764, + "num_tokens": 263355793.0, + "step": 6903 + }, + { + "epoch": 0.8782597633888818, + "ewc_loss": 0.05145978182554245, + "ewc_loss_diag": 3.0159950256347656e-05, + "ewc_loss_parallel": 0.0002130841457983479, + "grad_norm": 5.990499019622803, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8702872395515442, + "num_tokens": 263394446.0, + "step": 6904 + }, + { + "epoch": 0.8783869736674723, + "ewc_loss": 0.051515400409698486, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021241964714135975, + "grad_norm": 5.932247161865234, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8632985353469849, + "num_tokens": 263438479.0, + "step": 6905 + }, + { + "epoch": 0.8785141839460628, + "ewc_loss": 0.0515853650867939, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021311927412170917, + "grad_norm": 6.029978275299072, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8460965752601624, + "num_tokens": 263478514.0, + "step": 6906 + }, + { + "epoch": 0.8786413942246534, + "ewc_loss": 0.05147978663444519, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002120635035680607, + "grad_norm": 6.001662731170654, + "learning_rate": 1e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8422901630401611, + "num_tokens": 263516734.0, + "step": 6907 + }, + { + "epoch": 0.8787686045032439, + "ewc_loss": 0.051584310829639435, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021310875308699906, + "grad_norm": 5.990762233734131, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8498544692993164, + "num_tokens": 263560105.0, + "step": 6908 + }, + { + "epoch": 0.8788958147818343, + "ewc_loss": 0.05149189755320549, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021218460460659117, + "grad_norm": 5.983447551727295, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8629677891731262, + "num_tokens": 263597033.0, + "step": 6909 + }, + { + "epoch": 0.8790230250604248, + "ewc_loss": 0.05150220915675163, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021228771947789937, + "grad_norm": 6.070040225982666, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8679571747779846, + "num_tokens": 263631106.0, + "step": 6910 + }, + { + "epoch": 0.8791502353390154, + "ewc_loss": 0.05142568051815033, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021152243425603956, + "grad_norm": 5.9325737953186035, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8540621995925903, + "num_tokens": 263668227.0, + "step": 6911 + }, + { + "epoch": 0.8792774456176059, + "ewc_loss": 0.051504917442798615, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021231480059213936, + "grad_norm": 6.010794639587402, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8634283542633057, + "num_tokens": 263704335.0, + "step": 6912 + }, + { + "epoch": 0.8794046558961964, + "ewc_loss": 0.051437415182590485, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021163975179661065, + "grad_norm": 5.916421413421631, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8657971024513245, + "num_tokens": 263742532.0, + "step": 6913 + }, + { + "epoch": 0.879531866174787, + "ewc_loss": 0.05169186741113663, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021296358318068087, + "grad_norm": 6.031699180603027, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8617560863494873, + "num_tokens": 263782321.0, + "step": 6914 + }, + { + "epoch": 0.8796590764533775, + "ewc_loss": 0.05147276073694229, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021199321781750768, + "grad_norm": 5.962437629699707, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8682917356491089, + "num_tokens": 263818037.0, + "step": 6915 + }, + { + "epoch": 0.8797862867319679, + "ewc_loss": 0.05155334621667862, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021279910288285464, + "grad_norm": 6.009251117706299, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8464313745498657, + "num_tokens": 263854901.0, + "step": 6916 + }, + { + "epoch": 0.8799134970105584, + "ewc_loss": 0.05149321258068085, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021219774498604238, + "grad_norm": 5.959294319152832, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8652852773666382, + "num_tokens": 263896241.0, + "step": 6917 + }, + { + "epoch": 0.880040707289149, + "ewc_loss": 0.05147596821188927, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021202530479058623, + "grad_norm": 5.951629161834717, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.863239049911499, + "num_tokens": 263934673.0, + "step": 6918 + }, + { + "epoch": 0.8801679175677395, + "ewc_loss": 0.051535312086343765, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021261874644551426, + "grad_norm": 5.967996597290039, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8527716398239136, + "num_tokens": 263973456.0, + "step": 6919 + }, + { + "epoch": 0.88029512784633, + "ewc_loss": 0.05152365192770958, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021250214194878936, + "grad_norm": 6.029347896575928, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.843705415725708, + "num_tokens": 264006050.0, + "step": 6920 + }, + { + "epoch": 0.8804223381249205, + "ewc_loss": 0.051535509526729584, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021262072550598532, + "grad_norm": 5.963579177856445, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8711333274841309, + "num_tokens": 264042736.0, + "step": 6921 + }, + { + "epoch": 0.880549548403511, + "ewc_loss": 0.0515364408493042, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002126300532836467, + "grad_norm": 5.955475807189941, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8522647619247437, + "num_tokens": 264084433.0, + "step": 6922 + }, + { + "epoch": 0.8806767586821015, + "ewc_loss": 0.051572222262620926, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021298784122336656, + "grad_norm": 6.018099308013916, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8681314587593079, + "num_tokens": 264118331.0, + "step": 6923 + }, + { + "epoch": 0.880803968960692, + "ewc_loss": 0.05148566514253616, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002121222496498376, + "grad_norm": 5.95884895324707, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8628243207931519, + "num_tokens": 264152015.0, + "step": 6924 + }, + { + "epoch": 0.8809311792392825, + "ewc_loss": 0.05159057676792145, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021317138453014195, + "grad_norm": 5.979720592498779, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8659444451332092, + "num_tokens": 264187085.0, + "step": 6925 + }, + { + "epoch": 0.8810583895178731, + "ewc_loss": 0.05148208886384964, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021208651014603674, + "grad_norm": 6.028295993804932, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8575060367584229, + "num_tokens": 264225558.0, + "step": 6926 + }, + { + "epoch": 0.8811855997964636, + "ewc_loss": 0.05150052160024643, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021227083925623447, + "grad_norm": 5.992212772369385, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8505735993385315, + "num_tokens": 264264220.0, + "step": 6927 + }, + { + "epoch": 0.881312810075054, + "ewc_loss": 0.05146004259586334, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021186606318224221, + "grad_norm": 5.949321746826172, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.864422082901001, + "num_tokens": 264302097.0, + "step": 6928 + }, + { + "epoch": 0.8814400203536445, + "ewc_loss": 0.051470931619405746, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021197494061198086, + "grad_norm": 5.938991069793701, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8624601364135742, + "num_tokens": 264341966.0, + "step": 6929 + }, + { + "epoch": 0.8815672306322351, + "ewc_loss": 0.05148717015981674, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021213731088209897, + "grad_norm": 5.951401710510254, + "learning_rate": 1e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8377418518066406, + "num_tokens": 264384298.0, + "step": 6930 + }, + { + "epoch": 0.8816944409108256, + "ewc_loss": 0.05155869200825691, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021285255206748843, + "grad_norm": 5.93655252456665, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8615001440048218, + "num_tokens": 264422513.0, + "step": 6931 + }, + { + "epoch": 0.8818216511894161, + "ewc_loss": 0.05158371478319168, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021310278680175543, + "grad_norm": 5.93341064453125, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8737729787826538, + "num_tokens": 264461819.0, + "step": 6932 + }, + { + "epoch": 0.8819488614680067, + "ewc_loss": 0.051609866321086884, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021336428471840918, + "grad_norm": 5.953811168670654, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8495947122573853, + "num_tokens": 264502618.0, + "step": 6933 + }, + { + "epoch": 0.8820760717465971, + "ewc_loss": 0.05161886289715767, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021345424465835094, + "grad_norm": 6.014636516571045, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8548625707626343, + "num_tokens": 264534382.0, + "step": 6934 + }, + { + "epoch": 0.8822032820251876, + "ewc_loss": 0.05165545642375946, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021382016711868346, + "grad_norm": 5.9790143966674805, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8499536514282227, + "num_tokens": 264575297.0, + "step": 6935 + }, + { + "epoch": 0.8823304923037781, + "ewc_loss": 0.0516011007130146, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002132766239810735, + "grad_norm": 5.993533611297607, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8672764301300049, + "num_tokens": 264611822.0, + "step": 6936 + }, + { + "epoch": 0.8824577025823687, + "ewc_loss": 0.05156979709863663, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021296361228451133, + "grad_norm": 5.959366321563721, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8559272289276123, + "num_tokens": 264657483.0, + "step": 6937 + }, + { + "epoch": 0.8825849128609592, + "ewc_loss": 0.051601067185401917, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021327627473510802, + "grad_norm": 6.0062408447265625, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8656195402145386, + "num_tokens": 264690633.0, + "step": 6938 + }, + { + "epoch": 0.8827121231395497, + "ewc_loss": 0.051532067358493805, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021258632477838546, + "grad_norm": 5.892123222351074, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8750468492507935, + "num_tokens": 264733268.0, + "step": 6939 + }, + { + "epoch": 0.8828393334181401, + "ewc_loss": 0.051618218421936035, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021344779815990478, + "grad_norm": 5.969101905822754, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8533996343612671, + "num_tokens": 264774769.0, + "step": 6940 + }, + { + "epoch": 0.8829665436967307, + "ewc_loss": 0.05161834508180618, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021344904962461442, + "grad_norm": 6.007212162017822, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8554153442382812, + "num_tokens": 264806658.0, + "step": 6941 + }, + { + "epoch": 0.8830937539753212, + "ewc_loss": 0.05158773809671402, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021314302284736186, + "grad_norm": 6.032337665557861, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8477530479431152, + "num_tokens": 264841054.0, + "step": 6942 + }, + { + "epoch": 0.8832209642539117, + "ewc_loss": 0.051561519503593445, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021288081188686192, + "grad_norm": 5.962776184082031, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.855185866355896, + "num_tokens": 264880241.0, + "step": 6943 + }, + { + "epoch": 0.8833481745325023, + "ewc_loss": 0.05161212384700775, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021338684018701315, + "grad_norm": 6.013790607452393, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8624749183654785, + "num_tokens": 264922449.0, + "step": 6944 + }, + { + "epoch": 0.8834753848110928, + "ewc_loss": 0.0515407957136631, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021267357806209475, + "grad_norm": 5.978246212005615, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8588407039642334, + "num_tokens": 264961605.0, + "step": 6945 + }, + { + "epoch": 0.8836025950896832, + "ewc_loss": 0.05153195559978485, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002125851606251672, + "grad_norm": 5.992977619171143, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8683637976646423, + "num_tokens": 264992924.0, + "step": 6946 + }, + { + "epoch": 0.8837298053682737, + "ewc_loss": 0.0515148788690567, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.000212414437555708, + "grad_norm": 5.979801654815674, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8600651025772095, + "num_tokens": 265031849.0, + "step": 6947 + }, + { + "epoch": 0.8838570156468643, + "ewc_loss": 0.05157950147986412, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021306064445525408, + "grad_norm": 6.010317802429199, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8723604679107666, + "num_tokens": 265069714.0, + "step": 6948 + }, + { + "epoch": 0.8839842259254548, + "ewc_loss": 0.051591213792562485, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021317775826901197, + "grad_norm": 5.978824615478516, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8548356294631958, + "num_tokens": 265103325.0, + "step": 6949 + }, + { + "epoch": 0.8841114362040453, + "ewc_loss": 0.051560599356889725, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002128716150764376, + "grad_norm": 5.993373870849609, + "learning_rate": 1e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8384740352630615, + "num_tokens": 265139133.0, + "step": 6950 + }, + { + "epoch": 0.8842386464826358, + "ewc_loss": 0.05157208442687988, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002129864733433351, + "grad_norm": 6.00422477722168, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8363523483276367, + "num_tokens": 265183536.0, + "step": 6951 + }, + { + "epoch": 0.8843658567612263, + "ewc_loss": 0.05148380249738693, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002121036231983453, + "grad_norm": 5.910422325134277, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8518725037574768, + "num_tokens": 265226288.0, + "step": 6952 + }, + { + "epoch": 0.8844930670398168, + "ewc_loss": 0.05153416097164154, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002126072213286534, + "grad_norm": 6.008861541748047, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8601730465888977, + "num_tokens": 265265846.0, + "step": 6953 + }, + { + "epoch": 0.8846202773184073, + "ewc_loss": 0.051488786935806274, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021215349261183292, + "grad_norm": 5.964695453643799, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8734412789344788, + "num_tokens": 265300612.0, + "step": 6954 + }, + { + "epoch": 0.8847474875969978, + "ewc_loss": 0.0515064001083374, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021232961444184184, + "grad_norm": 5.989574432373047, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8686711192131042, + "num_tokens": 265341440.0, + "step": 6955 + }, + { + "epoch": 0.8848746978755884, + "ewc_loss": 0.05151228606700897, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021238847693894058, + "grad_norm": 5.982888221740723, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8505252003669739, + "num_tokens": 265384854.0, + "step": 6956 + }, + { + "epoch": 0.8850019081541789, + "ewc_loss": 0.0515458807349205, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021272440790198743, + "grad_norm": 6.042179107666016, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8521008491516113, + "num_tokens": 265420611.0, + "step": 6957 + }, + { + "epoch": 0.8851291184327693, + "ewc_loss": 0.05149649828672409, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021223060321062803, + "grad_norm": 5.976846694946289, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8587121963500977, + "num_tokens": 265457720.0, + "step": 6958 + }, + { + "epoch": 0.8852563287113598, + "ewc_loss": 0.051548779010772705, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021275343897286803, + "grad_norm": 6.03876256942749, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8687790036201477, + "num_tokens": 265493434.0, + "step": 6959 + }, + { + "epoch": 0.8853835389899504, + "ewc_loss": 0.05150187015533447, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002122843434335664, + "grad_norm": 5.946390151977539, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.86223965883255, + "num_tokens": 265529015.0, + "step": 6960 + }, + { + "epoch": 0.8855107492685409, + "ewc_loss": 0.0515182763338089, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021244837262202054, + "grad_norm": 5.97674036026001, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8445677757263184, + "num_tokens": 265568698.0, + "step": 6961 + }, + { + "epoch": 0.8856379595471314, + "ewc_loss": 0.0515638068318367, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021290367294568568, + "grad_norm": 5.937188625335693, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8610345125198364, + "num_tokens": 265605631.0, + "step": 6962 + }, + { + "epoch": 0.885765169825722, + "ewc_loss": 0.05160244554281235, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021329008450265974, + "grad_norm": 6.015897274017334, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8613616228103638, + "num_tokens": 265647071.0, + "step": 6963 + }, + { + "epoch": 0.8858923801043125, + "ewc_loss": 0.05158326029777527, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021309823205228895, + "grad_norm": 5.968523025512695, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8596851825714111, + "num_tokens": 265688680.0, + "step": 6964 + }, + { + "epoch": 0.8860195903829029, + "ewc_loss": 0.05155598372220993, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021282544184941798, + "grad_norm": 5.9472784996032715, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.865932047367096, + "num_tokens": 265727707.0, + "step": 6965 + }, + { + "epoch": 0.8861468006614934, + "ewc_loss": 0.051556624472141266, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021283188834786415, + "grad_norm": 5.953676223754883, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8576703667640686, + "num_tokens": 265765860.0, + "step": 6966 + }, + { + "epoch": 0.886274010940084, + "ewc_loss": 0.05163094401359558, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021357506921049207, + "grad_norm": 6.009735584259033, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8635146617889404, + "num_tokens": 265802988.0, + "step": 6967 + }, + { + "epoch": 0.8864012212186745, + "ewc_loss": 0.05151849240064621, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002124505554093048, + "grad_norm": 5.983617782592773, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8541011214256287, + "num_tokens": 265840614.0, + "step": 6968 + }, + { + "epoch": 0.886528431497265, + "ewc_loss": 0.05166913568973541, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021395696967374533, + "grad_norm": 5.9837141036987305, + "learning_rate": 1e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.8330684900283813, + "num_tokens": 265883032.0, + "step": 6969 + }, + { + "epoch": 0.8866556417758555, + "ewc_loss": 0.051554448902606964, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021281011868268251, + "grad_norm": 5.952661514282227, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8708794713020325, + "num_tokens": 265918309.0, + "step": 6970 + }, + { + "epoch": 0.886782852054446, + "ewc_loss": 0.05162744224071503, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021354007185436785, + "grad_norm": 5.941239356994629, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.863134503364563, + "num_tokens": 265956406.0, + "step": 6971 + }, + { + "epoch": 0.8869100623330365, + "ewc_loss": 0.05167641490697861, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021402978745754808, + "grad_norm": 6.0567307472229, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.86822110414505, + "num_tokens": 265988413.0, + "step": 6972 + }, + { + "epoch": 0.887037272611627, + "ewc_loss": 0.05161787196993828, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021344434935599566, + "grad_norm": 5.963451385498047, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8592729568481445, + "num_tokens": 266022036.0, + "step": 6973 + }, + { + "epoch": 0.8871644828902175, + "ewc_loss": 0.05160394310951233, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021330502931959927, + "grad_norm": 6.052465915679932, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8578259944915771, + "num_tokens": 266053735.0, + "step": 6974 + }, + { + "epoch": 0.8872916931688081, + "ewc_loss": 0.051620692014694214, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021347252186387777, + "grad_norm": 5.93026876449585, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8739124536514282, + "num_tokens": 266094198.0, + "step": 6975 + }, + { + "epoch": 0.8874189034473986, + "ewc_loss": 0.05166950076818466, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021396062220446765, + "grad_norm": 6.091039657592773, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8687991499900818, + "num_tokens": 266132213.0, + "step": 6976 + }, + { + "epoch": 0.887546113725989, + "ewc_loss": 0.05155378580093384, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021280348300933838, + "grad_norm": 5.901946067810059, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8598776459693909, + "num_tokens": 266168001.0, + "step": 6977 + }, + { + "epoch": 0.8876733240045795, + "ewc_loss": 0.051650285720825195, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021376850781962276, + "grad_norm": 6.084739685058594, + "learning_rate": 1e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8312556147575378, + "num_tokens": 266208771.0, + "step": 6978 + }, + { + "epoch": 0.8878005342831701, + "ewc_loss": 0.05157051235437393, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021297077182680368, + "grad_norm": 5.949982166290283, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8656543493270874, + "num_tokens": 266248090.0, + "step": 6979 + }, + { + "epoch": 0.8879277445617606, + "ewc_loss": 0.05164641886949539, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002137297997251153, + "grad_norm": 6.039634704589844, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8499251008033752, + "num_tokens": 266283748.0, + "step": 6980 + }, + { + "epoch": 0.8880549548403511, + "ewc_loss": 0.05170679837465286, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021311290038283914, + "grad_norm": 6.007895469665527, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8673948645591736, + "num_tokens": 266316613.0, + "step": 6981 + }, + { + "epoch": 0.8881821651189417, + "ewc_loss": 0.051617808640003204, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021344373817555606, + "grad_norm": 5.942480564117432, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.862529993057251, + "num_tokens": 266357515.0, + "step": 6982 + }, + { + "epoch": 0.8883093753975321, + "ewc_loss": 0.05166735500097275, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002139392017852515, + "grad_norm": 6.011463642120361, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8721736669540405, + "num_tokens": 266393593.0, + "step": 6983 + }, + { + "epoch": 0.8884365856761226, + "ewc_loss": 0.05164792388677597, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021374484640546143, + "grad_norm": 5.939112663269043, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8716464042663574, + "num_tokens": 266436104.0, + "step": 6984 + }, + { + "epoch": 0.8885637959547131, + "ewc_loss": 0.05165912210941315, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021385683794505894, + "grad_norm": 6.030745983123779, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8731088042259216, + "num_tokens": 266472809.0, + "step": 6985 + }, + { + "epoch": 0.8886910062333037, + "ewc_loss": 0.05160434544086456, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021330908930394799, + "grad_norm": 5.949978351593018, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8703693151473999, + "num_tokens": 266512726.0, + "step": 6986 + }, + { + "epoch": 0.8888182165118942, + "ewc_loss": 0.05165409296751022, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002138065465260297, + "grad_norm": 6.04701566696167, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8496478796005249, + "num_tokens": 266546098.0, + "step": 6987 + }, + { + "epoch": 0.8889454267904847, + "ewc_loss": 0.05171223729848862, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021316728089004755, + "grad_norm": 5.979691028594971, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8523499965667725, + "num_tokens": 266588478.0, + "step": 6988 + }, + { + "epoch": 0.8890726370690751, + "ewc_loss": 0.051658995449543, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021385555737651885, + "grad_norm": 5.987485885620117, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8486418724060059, + "num_tokens": 266628975.0, + "step": 6989 + }, + { + "epoch": 0.8891998473476657, + "ewc_loss": 0.05156203359365463, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021288594871293753, + "grad_norm": 5.938311576843262, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8695120811462402, + "num_tokens": 266667533.0, + "step": 6990 + }, + { + "epoch": 0.8893270576262562, + "ewc_loss": 0.05171837657690048, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021444940648507327, + "grad_norm": 6.059412002563477, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.842300295829773, + "num_tokens": 266706990.0, + "step": 6991 + }, + { + "epoch": 0.8894542679048467, + "ewc_loss": 0.051587484776973724, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002131404762621969, + "grad_norm": 5.959999084472656, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8472037315368652, + "num_tokens": 266747314.0, + "step": 6992 + }, + { + "epoch": 0.8895814781834372, + "ewc_loss": 0.05169041454792023, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002141697914339602, + "grad_norm": 5.984374046325684, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8513824939727783, + "num_tokens": 266791031.0, + "step": 6993 + }, + { + "epoch": 0.8897086884620278, + "ewc_loss": 0.0516178272664547, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021344389824662358, + "grad_norm": 6.047738075256348, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8603082895278931, + "num_tokens": 266823236.0, + "step": 6994 + }, + { + "epoch": 0.8898358987406182, + "ewc_loss": 0.05167480558156967, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021401367848739028, + "grad_norm": 5.9424872398376465, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8589791059494019, + "num_tokens": 266867993.0, + "step": 6995 + }, + { + "epoch": 0.8899631090192087, + "ewc_loss": 0.05164090543985367, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021367467707023025, + "grad_norm": 6.033450126647949, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8417187929153442, + "num_tokens": 266908573.0, + "step": 6996 + }, + { + "epoch": 0.8900903192977992, + "ewc_loss": 0.05162516236305237, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021351722534745932, + "grad_norm": 5.960058212280273, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8577501773834229, + "num_tokens": 266951167.0, + "step": 6997 + }, + { + "epoch": 0.8902175295763898, + "ewc_loss": 0.05168219655752182, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002140876167686656, + "grad_norm": 6.078897953033447, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.858038067817688, + "num_tokens": 266988976.0, + "step": 6998 + }, + { + "epoch": 0.8903447398549803, + "ewc_loss": 0.051625169813632965, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021351734176278114, + "grad_norm": 5.984251499176025, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8588405847549438, + "num_tokens": 267022450.0, + "step": 6999 + }, + { + "epoch": 0.8904719501335708, + "ewc_loss": 0.051698580384254456, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021425144223030657, + "grad_norm": 6.068135738372803, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8537306785583496, + "num_tokens": 267059933.0, + "step": 7000 + }, + { + "epoch": 0.8905991604121613, + "ewc_loss": 0.0516669824719429, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021393543283920735, + "grad_norm": 6.1307373046875, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8510456681251526, + "num_tokens": 267092518.0, + "step": 7001 + }, + { + "epoch": 0.8907263706907518, + "ewc_loss": 0.051618605852127075, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021345169807318598, + "grad_norm": 5.9995927810668945, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8551949262619019, + "num_tokens": 267130008.0, + "step": 7002 + }, + { + "epoch": 0.8908535809693423, + "ewc_loss": 0.051647234708070755, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021373797790147364, + "grad_norm": 5.980508804321289, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8627710342407227, + "num_tokens": 267171039.0, + "step": 7003 + }, + { + "epoch": 0.8909807912479328, + "ewc_loss": 0.051594361662864685, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002132092195097357, + "grad_norm": 5.985132694244385, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8629019856452942, + "num_tokens": 267211197.0, + "step": 7004 + }, + { + "epoch": 0.8911080015265234, + "ewc_loss": 0.051710840314626694, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021437402756419033, + "grad_norm": 6.029130935668945, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8649686574935913, + "num_tokens": 267247471.0, + "step": 7005 + }, + { + "epoch": 0.8912352118051139, + "ewc_loss": 0.05164700746536255, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021373570780269802, + "grad_norm": 6.046891212463379, + "learning_rate": 1e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8331242203712463, + "num_tokens": 267287276.0, + "step": 7006 + }, + { + "epoch": 0.8913624220837043, + "ewc_loss": 0.0517105832695961, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021437148097902536, + "grad_norm": 6.046441555023193, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8567600250244141, + "num_tokens": 267323827.0, + "step": 7007 + }, + { + "epoch": 0.8914896323622948, + "ewc_loss": 0.05177877098321915, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002138326090062037, + "grad_norm": 5.987491130828857, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8680539131164551, + "num_tokens": 267364053.0, + "step": 7008 + }, + { + "epoch": 0.8916168426408854, + "ewc_loss": 0.05185779184103012, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002146228653145954, + "grad_norm": 6.081118583679199, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8425310254096985, + "num_tokens": 267404393.0, + "step": 7009 + }, + { + "epoch": 0.8917440529194759, + "ewc_loss": 0.05182015895843506, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002142465382348746, + "grad_norm": 6.048792362213135, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.860327959060669, + "num_tokens": 267444589.0, + "step": 7010 + }, + { + "epoch": 0.8918712631980664, + "ewc_loss": 0.051686301827430725, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021412866772152483, + "grad_norm": 6.030206203460693, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8603953123092651, + "num_tokens": 267484460.0, + "step": 7011 + }, + { + "epoch": 0.891998473476657, + "ewc_loss": 0.0516652837395668, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021391845075413585, + "grad_norm": 6.072544097900391, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8530759811401367, + "num_tokens": 267517390.0, + "step": 7012 + }, + { + "epoch": 0.8921256837552475, + "ewc_loss": 0.05165422707796097, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021380791440606117, + "grad_norm": 6.0137739181518555, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8598225116729736, + "num_tokens": 267555813.0, + "step": 7013 + }, + { + "epoch": 0.8922528940338379, + "ewc_loss": 0.051655758172273636, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021382320846896619, + "grad_norm": 6.020309925079346, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8579088449478149, + "num_tokens": 267595818.0, + "step": 7014 + }, + { + "epoch": 0.8923801043124284, + "ewc_loss": 0.051797982305288315, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021402473794296384, + "grad_norm": 6.049627304077148, + "learning_rate": 1e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8340380191802979, + "num_tokens": 267633106.0, + "step": 7015 + }, + { + "epoch": 0.892507314591019, + "ewc_loss": 0.051705289632081985, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.0002143185120075941, + "grad_norm": 6.083749771118164, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.862308144569397, + "num_tokens": 267670166.0, + "step": 7016 + }, + { + "epoch": 0.8926345248696095, + "ewc_loss": 0.05171690508723259, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021443467994686216, + "grad_norm": 6.047113418579102, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8569203019142151, + "num_tokens": 267707284.0, + "step": 7017 + }, + { + "epoch": 0.8927617351482, + "ewc_loss": 0.05168505385518074, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021289546566549689, + "grad_norm": 6.014916896820068, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8604921102523804, + "num_tokens": 267742349.0, + "step": 7018 + }, + { + "epoch": 0.8928889454267905, + "ewc_loss": 0.05182253569364548, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021427030151244253, + "grad_norm": 6.207817077636719, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8646646738052368, + "num_tokens": 267776003.0, + "step": 7019 + }, + { + "epoch": 0.893016155705381, + "ewc_loss": 0.05168270319700241, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021287193521857262, + "grad_norm": 5.940927982330322, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8550165891647339, + "num_tokens": 267815887.0, + "step": 7020 + }, + { + "epoch": 0.8931433659839715, + "ewc_loss": 0.05183081328868866, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021435307280626148, + "grad_norm": 6.510240077972412, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8699952363967896, + "num_tokens": 267855874.0, + "step": 7021 + }, + { + "epoch": 0.893270576262562, + "ewc_loss": 0.05177555978298187, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002138005365850404, + "grad_norm": 5.916633129119873, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8694764971733093, + "num_tokens": 267897901.0, + "step": 7022 + }, + { + "epoch": 0.8933977865411525, + "ewc_loss": 0.05175913870334625, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021363631822168827, + "grad_norm": 6.0320844650268555, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8694765567779541, + "num_tokens": 267938967.0, + "step": 7023 + }, + { + "epoch": 0.8935249968197431, + "ewc_loss": 0.05172797664999962, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002133246889570728, + "grad_norm": 5.975857257843018, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8557150363922119, + "num_tokens": 267975196.0, + "step": 7024 + }, + { + "epoch": 0.8936522070983336, + "ewc_loss": 0.05183709040284157, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021441583521664143, + "grad_norm": 6.03125, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8734174370765686, + "num_tokens": 268014407.0, + "step": 7025 + }, + { + "epoch": 0.893779417376924, + "ewc_loss": 0.051819585263729095, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002142407902283594, + "grad_norm": 6.088871479034424, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8740834593772888, + "num_tokens": 268043892.0, + "step": 7026 + }, + { + "epoch": 0.8939066276555145, + "ewc_loss": 0.05173787474632263, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021342365653254092, + "grad_norm": 6.025603771209717, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8597767353057861, + "num_tokens": 268079651.0, + "step": 7027 + }, + { + "epoch": 0.8940338379341051, + "ewc_loss": 0.051815032958984375, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021419525728560984, + "grad_norm": 6.035362720489502, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.861723780632019, + "num_tokens": 268114021.0, + "step": 7028 + }, + { + "epoch": 0.8941610482126956, + "ewc_loss": 0.05177415907382965, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021378652309067547, + "grad_norm": 6.007808208465576, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8580392599105835, + "num_tokens": 268155315.0, + "step": 7029 + }, + { + "epoch": 0.8942882584912861, + "ewc_loss": 0.051812514662742615, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021417006792034954, + "grad_norm": 6.047103404998779, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8562971949577332, + "num_tokens": 268199312.0, + "step": 7030 + }, + { + "epoch": 0.8944154687698767, + "ewc_loss": 0.051732905209064484, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021337396174203604, + "grad_norm": 6.097341060638428, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8662081956863403, + "num_tokens": 268236955.0, + "step": 7031 + }, + { + "epoch": 0.8945426790484671, + "ewc_loss": 0.05183827877044678, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021442770957946777, + "grad_norm": 6.047790050506592, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8564119935035706, + "num_tokens": 268272183.0, + "step": 7032 + }, + { + "epoch": 0.8946698893270576, + "ewc_loss": 0.05168742686510086, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021291921439114958, + "grad_norm": 6.064142227172852, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8496506214141846, + "num_tokens": 268305407.0, + "step": 7033 + }, + { + "epoch": 0.8947970996056481, + "ewc_loss": 0.051723867654800415, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021328362345229834, + "grad_norm": 6.045701026916504, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.870539128780365, + "num_tokens": 268340443.0, + "step": 7034 + }, + { + "epoch": 0.8949243098842387, + "ewc_loss": 0.05173557251691818, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002134006645064801, + "grad_norm": 6.001699924468994, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.87782883644104, + "num_tokens": 268375597.0, + "step": 7035 + }, + { + "epoch": 0.8950515201628292, + "ewc_loss": 0.05174718797206879, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021351677423808724, + "grad_norm": 6.081463813781738, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.853989839553833, + "num_tokens": 268412872.0, + "step": 7036 + }, + { + "epoch": 0.8951787304414197, + "ewc_loss": 0.051734559237957, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002133905072696507, + "grad_norm": 5.9952263832092285, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8599612712860107, + "num_tokens": 268452021.0, + "step": 7037 + }, + { + "epoch": 0.8953059407200101, + "ewc_loss": 0.0517384335398674, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021342922991607338, + "grad_norm": 6.002041339874268, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8682312965393066, + "num_tokens": 268486354.0, + "step": 7038 + }, + { + "epoch": 0.8954331509986007, + "ewc_loss": 0.05177735537290573, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021381846454460174, + "grad_norm": 6.107506275177002, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8545447587966919, + "num_tokens": 268519918.0, + "step": 7039 + }, + { + "epoch": 0.8955603612771912, + "ewc_loss": 0.051794715225696564, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021399205434136093, + "grad_norm": 6.06396484375, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8509695529937744, + "num_tokens": 268554259.0, + "step": 7040 + }, + { + "epoch": 0.8956875715557817, + "ewc_loss": 0.05174154043197632, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021346031280700117, + "grad_norm": 5.991567134857178, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8535040020942688, + "num_tokens": 268595785.0, + "step": 7041 + }, + { + "epoch": 0.8958147818343722, + "ewc_loss": 0.051783207803964615, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021387700689956546, + "grad_norm": 6.031233310699463, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.843891978263855, + "num_tokens": 268640805.0, + "step": 7042 + }, + { + "epoch": 0.8959419921129628, + "ewc_loss": 0.05175312981009483, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021357621881179512, + "grad_norm": 5.981022357940674, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8643852472305298, + "num_tokens": 268682446.0, + "step": 7043 + }, + { + "epoch": 0.8960692023915532, + "ewc_loss": 0.05181526765227318, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002141976001439616, + "grad_norm": 6.038496017456055, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8685271739959717, + "num_tokens": 268720262.0, + "step": 7044 + }, + { + "epoch": 0.8961964126701437, + "ewc_loss": 0.051850832998752594, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021455327805597335, + "grad_norm": 6.04478120803833, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.847546398639679, + "num_tokens": 268760519.0, + "step": 7045 + }, + { + "epoch": 0.8963236229487342, + "ewc_loss": 0.05179151892662048, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021396009833551943, + "grad_norm": 6.05656623840332, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.85297691822052, + "num_tokens": 268796621.0, + "step": 7046 + }, + { + "epoch": 0.8964508332273248, + "ewc_loss": 0.05188880115747452, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021493290842045099, + "grad_norm": 6.058882236480713, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8682384490966797, + "num_tokens": 268830527.0, + "step": 7047 + }, + { + "epoch": 0.8965780435059153, + "ewc_loss": 0.05178205668926239, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021386549633461982, + "grad_norm": 6.087569236755371, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8657810091972351, + "num_tokens": 268862112.0, + "step": 7048 + }, + { + "epoch": 0.8967052537845058, + "ewc_loss": 0.05183952674269676, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002144401951227337, + "grad_norm": 6.063327789306641, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8583171963691711, + "num_tokens": 268901089.0, + "step": 7049 + }, + { + "epoch": 0.8968324640630962, + "ewc_loss": 0.05181480199098587, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021419291442725807, + "grad_norm": 6.046321868896484, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8615931272506714, + "num_tokens": 268942766.0, + "step": 7050 + }, + { + "epoch": 0.8969596743416868, + "ewc_loss": 0.05180436745285988, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002140885917469859, + "grad_norm": 6.055112361907959, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8653934001922607, + "num_tokens": 268978642.0, + "step": 7051 + }, + { + "epoch": 0.8970868846202773, + "ewc_loss": 0.05179448053240776, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002139897260349244, + "grad_norm": 6.02004337310791, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8657984733581543, + "num_tokens": 269016048.0, + "step": 7052 + }, + { + "epoch": 0.8972140948988678, + "ewc_loss": 0.05177054554224014, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021375039068516344, + "grad_norm": 6.0768723487854, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8602094650268555, + "num_tokens": 269054381.0, + "step": 7053 + }, + { + "epoch": 0.8973413051774584, + "ewc_loss": 0.05180119723081589, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021405686857178807, + "grad_norm": 6.027431488037109, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8637460470199585, + "num_tokens": 269090672.0, + "step": 7054 + }, + { + "epoch": 0.8974685154560489, + "ewc_loss": 0.051765911281108856, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021370405738707632, + "grad_norm": 5.999114513397217, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.862476646900177, + "num_tokens": 269130231.0, + "step": 7055 + }, + { + "epoch": 0.8975957257346393, + "ewc_loss": 0.05185851827263832, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002146300976164639, + "grad_norm": 6.094264030456543, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8637056350708008, + "num_tokens": 269166282.0, + "step": 7056 + }, + { + "epoch": 0.8977229360132298, + "ewc_loss": 0.051802948117256165, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021407440362963825, + "grad_norm": 6.024436950683594, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8646907806396484, + "num_tokens": 269205326.0, + "step": 7057 + }, + { + "epoch": 0.8978501462918204, + "ewc_loss": 0.05183675140142441, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021441241551656276, + "grad_norm": 6.056929588317871, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8612077236175537, + "num_tokens": 269243903.0, + "step": 7058 + }, + { + "epoch": 0.8979773565704109, + "ewc_loss": 0.051841702312231064, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002144619356840849, + "grad_norm": 6.062338352203369, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8537901639938354, + "num_tokens": 269288330.0, + "step": 7059 + }, + { + "epoch": 0.8981045668490014, + "ewc_loss": 0.051804929971694946, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021409423789009452, + "grad_norm": 6.020935535430908, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.850378155708313, + "num_tokens": 269328731.0, + "step": 7060 + }, + { + "epoch": 0.898231777127592, + "ewc_loss": 0.051890548318624496, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002149504143744707, + "grad_norm": 6.057921409606934, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8618277311325073, + "num_tokens": 269369576.0, + "step": 7061 + }, + { + "epoch": 0.8983589874061825, + "ewc_loss": 0.05182461440563202, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021429108164738864, + "grad_norm": 6.028029441833496, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8586299419403076, + "num_tokens": 269407734.0, + "step": 7062 + }, + { + "epoch": 0.8984861976847729, + "ewc_loss": 0.05184725299477577, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002145174512406811, + "grad_norm": 6.091654300689697, + "learning_rate": 1e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8396555185317993, + "num_tokens": 269444042.0, + "step": 7063 + }, + { + "epoch": 0.8986134079633634, + "ewc_loss": 0.051846712827682495, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021451202337630093, + "grad_norm": 6.026590347290039, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8833016157150269, + "num_tokens": 269480979.0, + "step": 7064 + }, + { + "epoch": 0.898740618241954, + "ewc_loss": 0.051819413900375366, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.000214239043998532, + "grad_norm": 6.036264896392822, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8602503538131714, + "num_tokens": 269515594.0, + "step": 7065 + }, + { + "epoch": 0.8988678285205445, + "ewc_loss": 0.0518239289522171, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002142841840395704, + "grad_norm": 5.976660251617432, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8622698187828064, + "num_tokens": 269556974.0, + "step": 7066 + }, + { + "epoch": 0.898995038799135, + "ewc_loss": 0.05182550847530365, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002142999874195084, + "grad_norm": 6.081991195678711, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8542364835739136, + "num_tokens": 269591088.0, + "step": 7067 + }, + { + "epoch": 0.8991222490777255, + "ewc_loss": 0.051879022270441055, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021483514865394682, + "grad_norm": 6.039628505706787, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.869123101234436, + "num_tokens": 269627921.0, + "step": 7068 + }, + { + "epoch": 0.899249459356316, + "ewc_loss": 0.05183134227991104, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021435835515148938, + "grad_norm": 6.052077770233154, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8438735008239746, + "num_tokens": 269666586.0, + "step": 7069 + }, + { + "epoch": 0.8993766696349065, + "ewc_loss": 0.051900312304496765, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021504805772565305, + "grad_norm": 6.063914775848389, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8632786273956299, + "num_tokens": 269701205.0, + "step": 7070 + }, + { + "epoch": 0.899503879913497, + "ewc_loss": 0.051826223731040955, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021430714696180075, + "grad_norm": 6.035029411315918, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8733359575271606, + "num_tokens": 269739982.0, + "step": 7071 + }, + { + "epoch": 0.8996310901920875, + "ewc_loss": 0.05191502720117569, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002151952066924423, + "grad_norm": 6.067389011383057, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8613506555557251, + "num_tokens": 269773048.0, + "step": 7072 + }, + { + "epoch": 0.8997583004706781, + "ewc_loss": 0.05184914544224739, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021453638328239322, + "grad_norm": 6.020317077636719, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8600618839263916, + "num_tokens": 269811343.0, + "step": 7073 + }, + { + "epoch": 0.8998855107492686, + "ewc_loss": 0.0519489049911499, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021553396072704345, + "grad_norm": 5.999768257141113, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8644827604293823, + "num_tokens": 269851601.0, + "step": 7074 + }, + { + "epoch": 0.900012721027859, + "ewc_loss": 0.05192328989505768, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021527783246710896, + "grad_norm": 6.02012825012207, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8686243295669556, + "num_tokens": 269891396.0, + "step": 7075 + }, + { + "epoch": 0.9001399313064495, + "ewc_loss": 0.05197244882583618, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021576941071543843, + "grad_norm": 6.068334579467773, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8590550422668457, + "num_tokens": 269930292.0, + "step": 7076 + }, + { + "epoch": 0.9002671415850401, + "ewc_loss": 0.051944635808467865, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021549127995967865, + "grad_norm": 6.079282760620117, + "learning_rate": 1e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8325310945510864, + "num_tokens": 269968464.0, + "step": 7077 + }, + { + "epoch": 0.9003943518636306, + "ewc_loss": 0.051922667771577835, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021527160424739122, + "grad_norm": 6.058160781860352, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8466795682907104, + "num_tokens": 270011569.0, + "step": 7078 + }, + { + "epoch": 0.9005215621422211, + "ewc_loss": 0.05195065960288048, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021555151033680886, + "grad_norm": 6.054263591766357, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.856296718120575, + "num_tokens": 270050608.0, + "step": 7079 + }, + { + "epoch": 0.9006487724208116, + "ewc_loss": 0.05191085487604141, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002151534572476521, + "grad_norm": 6.006282329559326, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8454753756523132, + "num_tokens": 270096100.0, + "step": 7080 + }, + { + "epoch": 0.9007759826994021, + "ewc_loss": 0.051986053586006165, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021590547112282366, + "grad_norm": 6.046765327453613, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8707867860794067, + "num_tokens": 270140915.0, + "step": 7081 + }, + { + "epoch": 0.9009031929779926, + "ewc_loss": 0.051909130066633224, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002151362132281065, + "grad_norm": 5.999648571014404, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8607977032661438, + "num_tokens": 270184233.0, + "step": 7082 + }, + { + "epoch": 0.9010304032565831, + "ewc_loss": 0.05191154405474663, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021516035485547036, + "grad_norm": 6.021685600280762, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8587110042572021, + "num_tokens": 270223889.0, + "step": 7083 + }, + { + "epoch": 0.9011576135351737, + "ewc_loss": 0.05190551280975342, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021510003716684878, + "grad_norm": 5.99881649017334, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8653014898300171, + "num_tokens": 270266357.0, + "step": 7084 + }, + { + "epoch": 0.9012848238137642, + "ewc_loss": 0.0519513376057148, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021555829152930528, + "grad_norm": 6.123243808746338, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8687410354614258, + "num_tokens": 270298252.0, + "step": 7085 + }, + { + "epoch": 0.9014120340923547, + "ewc_loss": 0.05189923942089081, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021503728930838406, + "grad_norm": 5.96481466293335, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8529506921768188, + "num_tokens": 270343187.0, + "step": 7086 + }, + { + "epoch": 0.9015392443709451, + "ewc_loss": 0.05196645110845566, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002157094277208671, + "grad_norm": 6.108601093292236, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8488562107086182, + "num_tokens": 270385094.0, + "step": 7087 + }, + { + "epoch": 0.9016664546495357, + "ewc_loss": 0.051877763122320175, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021482254669535905, + "grad_norm": 6.057962894439697, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8479971885681152, + "num_tokens": 270421772.0, + "step": 7088 + }, + { + "epoch": 0.9017936649281262, + "ewc_loss": 0.051926784217357635, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021531277161557227, + "grad_norm": 6.016364574432373, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8417325615882874, + "num_tokens": 270465033.0, + "step": 7089 + }, + { + "epoch": 0.9019208752067167, + "ewc_loss": 0.05191706120967865, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002152155211661011, + "grad_norm": 6.063248157501221, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8822758197784424, + "num_tokens": 270501370.0, + "step": 7090 + }, + { + "epoch": 0.9020480854853072, + "ewc_loss": 0.051917292177677155, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002152178349206224, + "grad_norm": 6.000452518463135, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8652000427246094, + "num_tokens": 270543571.0, + "step": 7091 + }, + { + "epoch": 0.9021752957638978, + "ewc_loss": 0.0519399493932724, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021544443734455854, + "grad_norm": 6.009126663208008, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8704645037651062, + "num_tokens": 270580635.0, + "step": 7092 + }, + { + "epoch": 0.9023025060424882, + "ewc_loss": 0.05199038237333298, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021594874851871282, + "grad_norm": 6.0660080909729, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8733439445495605, + "num_tokens": 270616534.0, + "step": 7093 + }, + { + "epoch": 0.9024297163210787, + "ewc_loss": 0.05196428671479225, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021568778902292252, + "grad_norm": 6.080512523651123, + "learning_rate": 1e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8415684700012207, + "num_tokens": 270653998.0, + "step": 7094 + }, + { + "epoch": 0.9025569265996692, + "ewc_loss": 0.05197861045598984, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021583102352451533, + "grad_norm": 6.066878318786621, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8603582978248596, + "num_tokens": 270688023.0, + "step": 7095 + }, + { + "epoch": 0.9026841368782598, + "ewc_loss": 0.05187857896089554, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021483069576788694, + "grad_norm": 6.056580066680908, + "learning_rate": 1e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8431077003479004, + "num_tokens": 270721298.0, + "step": 7096 + }, + { + "epoch": 0.9028113471568503, + "ewc_loss": 0.05202630162239075, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021630796254612505, + "grad_norm": 6.024369716644287, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8656046986579895, + "num_tokens": 270762182.0, + "step": 7097 + }, + { + "epoch": 0.9029385574354408, + "ewc_loss": 0.051969170570373535, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002157366106985137, + "grad_norm": 6.044558525085449, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8749492168426514, + "num_tokens": 270799577.0, + "step": 7098 + }, + { + "epoch": 0.9030657677140312, + "ewc_loss": 0.05204135924577713, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021645853121299297, + "grad_norm": 6.0287766456604, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8622841238975525, + "num_tokens": 270840218.0, + "step": 7099 + }, + { + "epoch": 0.9031929779926218, + "ewc_loss": 0.05201680585741997, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002162129821954295, + "grad_norm": 6.116352081298828, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8808878660202026, + "num_tokens": 270872982.0, + "step": 7100 + }, + { + "epoch": 0.9033201882712123, + "ewc_loss": 0.05196245014667511, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021566943905781955, + "grad_norm": 6.029960632324219, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8653327822685242, + "num_tokens": 270908381.0, + "step": 7101 + }, + { + "epoch": 0.9034473985498028, + "ewc_loss": 0.05203288421034813, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021637376630678773, + "grad_norm": 6.047207355499268, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8592820167541504, + "num_tokens": 270949217.0, + "step": 7102 + }, + { + "epoch": 0.9035746088283934, + "ewc_loss": 0.05201871693134308, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021623210341203958, + "grad_norm": 6.065789699554443, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8429172039031982, + "num_tokens": 270989667.0, + "step": 7103 + }, + { + "epoch": 0.9037018191069839, + "ewc_loss": 0.051976293325424194, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021580785687547177, + "grad_norm": 6.014706134796143, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8498826622962952, + "num_tokens": 271030395.0, + "step": 7104 + }, + { + "epoch": 0.9038290293855743, + "ewc_loss": 0.05206652730703354, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002167102211387828, + "grad_norm": 6.1211838722229, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8428998589515686, + "num_tokens": 271068239.0, + "step": 7105 + }, + { + "epoch": 0.9039562396641648, + "ewc_loss": 0.051969483494758606, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021573978301603347, + "grad_norm": 6.08128547668457, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8536808490753174, + "num_tokens": 271110294.0, + "step": 7106 + }, + { + "epoch": 0.9040834499427554, + "ewc_loss": 0.05200833082199097, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021612824639305472, + "grad_norm": 6.085159778594971, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8777581453323364, + "num_tokens": 271143288.0, + "step": 7107 + }, + { + "epoch": 0.9042106602213459, + "ewc_loss": 0.051976483315229416, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002158097631763667, + "grad_norm": 6.0033860206604, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8674722909927368, + "num_tokens": 271181183.0, + "step": 7108 + }, + { + "epoch": 0.9043378704999364, + "ewc_loss": 0.05202847719192505, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021632970310747623, + "grad_norm": 6.065964698791504, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8590536713600159, + "num_tokens": 271216201.0, + "step": 7109 + }, + { + "epoch": 0.9044650807785269, + "ewc_loss": 0.05197129026055336, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021575782739091665, + "grad_norm": 6.0135817527771, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8668768405914307, + "num_tokens": 271250158.0, + "step": 7110 + }, + { + "epoch": 0.9045922910571175, + "ewc_loss": 0.052044596523046494, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021649089467246085, + "grad_norm": 6.0455851554870605, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8728078007698059, + "num_tokens": 271289994.0, + "step": 7111 + }, + { + "epoch": 0.9047195013357079, + "ewc_loss": 0.052066005766391754, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002167049824493006, + "grad_norm": 6.012392044067383, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8654253482818604, + "num_tokens": 271325782.0, + "step": 7112 + }, + { + "epoch": 0.9048467116142984, + "ewc_loss": 0.05202990770339966, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002163439930882305, + "grad_norm": 6.089801788330078, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8468905091285706, + "num_tokens": 271361604.0, + "step": 7113 + }, + { + "epoch": 0.9049739218928889, + "ewc_loss": 0.05208009481430054, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002168458595406264, + "grad_norm": 6.021933078765869, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8699951171875, + "num_tokens": 271398520.0, + "step": 7114 + }, + { + "epoch": 0.9051011321714795, + "ewc_loss": 0.0522429421544075, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021725364786107093, + "grad_norm": 6.065321922302246, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8550366759300232, + "num_tokens": 271445138.0, + "step": 7115 + }, + { + "epoch": 0.90522834245007, + "ewc_loss": 0.05221918970346451, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021701610239688307, + "grad_norm": 6.066394329071045, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.864596962928772, + "num_tokens": 271479147.0, + "step": 7116 + }, + { + "epoch": 0.9053555527286605, + "ewc_loss": 0.05218052864074707, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021662951621692628, + "grad_norm": 6.104875564575195, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8622171878814697, + "num_tokens": 271512477.0, + "step": 7117 + }, + { + "epoch": 0.905482763007251, + "ewc_loss": 0.05218103528022766, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002166345511795953, + "grad_norm": 6.023232936859131, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8467937111854553, + "num_tokens": 271552355.0, + "step": 7118 + }, + { + "epoch": 0.9056099732858415, + "ewc_loss": 0.05239749327301979, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.0002163577446481213, + "grad_norm": 12.828171730041504, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8565958738327026, + "num_tokens": 271595825.0, + "step": 7119 + }, + { + "epoch": 0.905737183564432, + "ewc_loss": 0.06139947474002838, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00030881896964274347, + "grad_norm": 7.441843032836914, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.848555326461792, + "num_tokens": 271627384.0, + "step": 7120 + }, + { + "epoch": 0.9058643938430225, + "ewc_loss": 0.050640203058719635, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00019878483726643026, + "grad_norm": 5.447669982910156, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8697662353515625, + "num_tokens": 271666711.0, + "step": 7121 + }, + { + "epoch": 0.9059916041216131, + "ewc_loss": 0.05451230704784393, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002399472869001329, + "grad_norm": 6.721134662628174, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8663492202758789, + "num_tokens": 271703145.0, + "step": 7122 + }, + { + "epoch": 0.9061188144002036, + "ewc_loss": 0.05370841920375824, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023190838692244142, + "grad_norm": 6.024446964263916, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8489121198654175, + "num_tokens": 271743131.0, + "step": 7123 + }, + { + "epoch": 0.906246024678794, + "ewc_loss": 0.05317389965057373, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002265632210765034, + "grad_norm": 6.393265247344971, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.864080548286438, + "num_tokens": 271778245.0, + "step": 7124 + }, + { + "epoch": 0.9063732349573845, + "ewc_loss": 0.05345184728503227, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022934269509278238, + "grad_norm": 6.164615631103516, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8529223203659058, + "num_tokens": 271817279.0, + "step": 7125 + }, + { + "epoch": 0.9065004452359751, + "ewc_loss": 0.052840277552604675, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022322697623167187, + "grad_norm": 6.208583354949951, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.868740439414978, + "num_tokens": 271857136.0, + "step": 7126 + }, + { + "epoch": 0.9066276555145656, + "ewc_loss": 0.0530308373272419, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002251325931865722, + "grad_norm": 6.244185447692871, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8686666488647461, + "num_tokens": 271891651.0, + "step": 7127 + }, + { + "epoch": 0.9067548657931561, + "ewc_loss": 0.05257183685898781, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022054258442949504, + "grad_norm": 6.111281871795654, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8649954199790955, + "num_tokens": 271925805.0, + "step": 7128 + }, + { + "epoch": 0.9068820760717466, + "ewc_loss": 0.05266077443957329, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022143196838442236, + "grad_norm": 6.144622325897217, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8644640445709229, + "num_tokens": 271964954.0, + "step": 7129 + }, + { + "epoch": 0.9070092863503371, + "ewc_loss": 0.0524570494890213, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000219394700252451, + "grad_norm": 6.1633992195129395, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8599657416343689, + "num_tokens": 272000331.0, + "step": 7130 + }, + { + "epoch": 0.9071364966289276, + "ewc_loss": 0.05241828411817551, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002190070372307673, + "grad_norm": 6.048784255981445, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8678240180015564, + "num_tokens": 272037853.0, + "step": 7131 + }, + { + "epoch": 0.9072637069075181, + "ewc_loss": 0.05238643288612366, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021868852491024882, + "grad_norm": 6.057486057281494, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8705215454101562, + "num_tokens": 272077663.0, + "step": 7132 + }, + { + "epoch": 0.9073909171861086, + "ewc_loss": 0.05234304070472717, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021825461590196937, + "grad_norm": 6.052524089813232, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8558652400970459, + "num_tokens": 272116942.0, + "step": 7133 + }, + { + "epoch": 0.9075181274646992, + "ewc_loss": 0.05230192095041275, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002178434224333614, + "grad_norm": 6.0518951416015625, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8541646003723145, + "num_tokens": 272151319.0, + "step": 7134 + }, + { + "epoch": 0.9076453377432897, + "ewc_loss": 0.052379757165908813, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021862178982701153, + "grad_norm": 6.085643291473389, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8476548790931702, + "num_tokens": 272190817.0, + "step": 7135 + }, + { + "epoch": 0.9077725480218801, + "ewc_loss": 0.052324436604976654, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021806858421768993, + "grad_norm": 6.043209075927734, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8548912405967712, + "num_tokens": 272230461.0, + "step": 7136 + }, + { + "epoch": 0.9078997583004706, + "ewc_loss": 0.05235131457448006, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021833735809195787, + "grad_norm": 6.053587436676025, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8554748296737671, + "num_tokens": 272268510.0, + "step": 7137 + }, + { + "epoch": 0.9080269685790612, + "ewc_loss": 0.052311886101961136, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021794307394884527, + "grad_norm": 6.036157608032227, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8601245880126953, + "num_tokens": 272312430.0, + "step": 7138 + }, + { + "epoch": 0.9081541788576517, + "ewc_loss": 0.05233892425894737, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021821346308570355, + "grad_norm": 6.111435890197754, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.859096884727478, + "num_tokens": 272348437.0, + "step": 7139 + }, + { + "epoch": 0.9082813891362422, + "ewc_loss": 0.05230311304330826, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002178553695557639, + "grad_norm": 6.110405445098877, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8519695997238159, + "num_tokens": 272378902.0, + "step": 7140 + }, + { + "epoch": 0.9084085994148328, + "ewc_loss": 0.05237230658531189, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021854729857295752, + "grad_norm": 12.872918128967285, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8736134767532349, + "num_tokens": 272421257.0, + "step": 7141 + }, + { + "epoch": 0.9085358096934232, + "ewc_loss": 0.06141785532236099, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0003090027894359082, + "grad_norm": 7.338412761688232, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8600720167160034, + "num_tokens": 272459800.0, + "step": 7142 + }, + { + "epoch": 0.9086630199720137, + "ewc_loss": 0.05060601234436035, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020088434393983334, + "grad_norm": 5.4931182861328125, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.879150390625, + "num_tokens": 272501245.0, + "step": 7143 + }, + { + "epoch": 0.9087902302506042, + "ewc_loss": 0.05456288158893585, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00024045305326581, + "grad_norm": 6.724648475646973, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8618794083595276, + "num_tokens": 272535873.0, + "step": 7144 + }, + { + "epoch": 0.9089174405291948, + "ewc_loss": 0.053693003952503204, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002317542675882578, + "grad_norm": 6.037021636962891, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8584115505218506, + "num_tokens": 272575213.0, + "step": 7145 + }, + { + "epoch": 0.9090446508077853, + "ewc_loss": 0.05328243225812912, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002276485611218959, + "grad_norm": 6.361364841461182, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8629457354545593, + "num_tokens": 272611960.0, + "step": 7146 + }, + { + "epoch": 0.9091718610863758, + "ewc_loss": 0.05346110835671425, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002294353034812957, + "grad_norm": 6.127390384674072, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8608042597770691, + "num_tokens": 272650895.0, + "step": 7147 + }, + { + "epoch": 0.9092990713649662, + "ewc_loss": 0.052940644323825836, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022423063637688756, + "grad_norm": 6.185431003570557, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8609386086463928, + "num_tokens": 272690617.0, + "step": 7148 + }, + { + "epoch": 0.9094262816435568, + "ewc_loss": 0.05309244245290756, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002257486485177651, + "grad_norm": 6.200471878051758, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8520992994308472, + "num_tokens": 272732526.0, + "step": 7149 + }, + { + "epoch": 0.9095534919221473, + "ewc_loss": 0.05271746590733528, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022199888189788908, + "grad_norm": 6.087385177612305, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8588217496871948, + "num_tokens": 272769882.0, + "step": 7150 + }, + { + "epoch": 0.9096807022007378, + "ewc_loss": 0.05275857448577881, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022240998805500567, + "grad_norm": 6.148904323577881, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8634690046310425, + "num_tokens": 272809614.0, + "step": 7151 + }, + { + "epoch": 0.9098079124793284, + "ewc_loss": 0.052618786692619324, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002210121019743383, + "grad_norm": 6.074159622192383, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8541781902313232, + "num_tokens": 272844819.0, + "step": 7152 + }, + { + "epoch": 0.9099351227579189, + "ewc_loss": 0.05253971368074417, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002202213363489136, + "grad_norm": 6.126651763916016, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8664539456367493, + "num_tokens": 272882798.0, + "step": 7153 + }, + { + "epoch": 0.9100623330365093, + "ewc_loss": 0.05251183733344078, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021994259441271424, + "grad_norm": 6.098209857940674, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8630479574203491, + "num_tokens": 272917472.0, + "step": 7154 + }, + { + "epoch": 0.9101895433150998, + "ewc_loss": 0.05244932323694229, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021931745868641883, + "grad_norm": 6.108043670654297, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8497783541679382, + "num_tokens": 272959947.0, + "step": 7155 + }, + { + "epoch": 0.9103167535936904, + "ewc_loss": 0.05238120257854462, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021863626898266375, + "grad_norm": 6.097341060638428, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8440800905227661, + "num_tokens": 272998194.0, + "step": 7156 + }, + { + "epoch": 0.9104439638722809, + "ewc_loss": 0.052374597638845444, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021857020328752697, + "grad_norm": 6.122453212738037, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8776842355728149, + "num_tokens": 273034688.0, + "step": 7157 + }, + { + "epoch": 0.9105711741508714, + "ewc_loss": 0.05214845389127731, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021752945031039417, + "grad_norm": 6.031070232391357, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8714814186096191, + "num_tokens": 273071575.0, + "step": 7158 + }, + { + "epoch": 0.9106983844294619, + "ewc_loss": 0.052189938724040985, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021794429630972445, + "grad_norm": 6.220840930938721, + "learning_rate": 1e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8353568315505981, + "num_tokens": 273106106.0, + "step": 7159 + }, + { + "epoch": 0.9108255947080524, + "ewc_loss": 0.0521053671836853, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021709859720431268, + "grad_norm": 6.037154674530029, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8546597957611084, + "num_tokens": 273142959.0, + "step": 7160 + }, + { + "epoch": 0.9109528049866429, + "ewc_loss": 0.05213943123817444, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021743924298789352, + "grad_norm": 6.123511791229248, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8453824520111084, + "num_tokens": 273182781.0, + "step": 7161 + }, + { + "epoch": 0.9110800152652334, + "ewc_loss": 0.05208050459623337, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021684994862880558, + "grad_norm": 5.974530220031738, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8671013116836548, + "num_tokens": 273225738.0, + "step": 7162 + }, + { + "epoch": 0.9112072255438239, + "ewc_loss": 0.05216959863901138, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.0002177409187424928, + "grad_norm": 6.071033954620361, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8682458400726318, + "num_tokens": 273269166.0, + "step": 7163 + }, + { + "epoch": 0.9113344358224145, + "ewc_loss": 0.052280839532613754, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002176326233893633, + "grad_norm": 6.087274551391602, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8553062677383423, + "num_tokens": 273309093.0, + "step": 7164 + }, + { + "epoch": 0.911461646101005, + "ewc_loss": 0.05230925232172012, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021791676408611238, + "grad_norm": 6.023630619049072, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8656260967254639, + "num_tokens": 273346857.0, + "step": 7165 + }, + { + "epoch": 0.9115888563795955, + "ewc_loss": 0.052218906581401825, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021823399583809078, + "grad_norm": 6.065744400024414, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8564238548278809, + "num_tokens": 273385720.0, + "step": 7166 + }, + { + "epoch": 0.9117160666581859, + "ewc_loss": 0.05221433937549591, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021696762996725738, + "grad_norm": 6.068160533905029, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8613511919975281, + "num_tokens": 273431079.0, + "step": 7167 + }, + { + "epoch": 0.9118432769367765, + "ewc_loss": 0.052351921796798706, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002183434262406081, + "grad_norm": 6.054676055908203, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8615920543670654, + "num_tokens": 273477316.0, + "step": 7168 + }, + { + "epoch": 0.911970487215367, + "ewc_loss": 0.05235667526721954, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021839099645148963, + "grad_norm": 6.104747772216797, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8636571168899536, + "num_tokens": 273512831.0, + "step": 7169 + }, + { + "epoch": 0.9120976974939575, + "ewc_loss": 0.052065830677747726, + "ewc_loss_diag": 3.0279159545898438e-05, + "ewc_loss_parallel": 0.00021792392362840474, + "grad_norm": 6.053065776824951, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8548668622970581, + "num_tokens": 273551927.0, + "step": 7170 + }, + { + "epoch": 0.912224907772548, + "ewc_loss": 0.05226150155067444, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021865993039682508, + "grad_norm": 6.097899913787842, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8642700910568237, + "num_tokens": 273583053.0, + "step": 7171 + }, + { + "epoch": 0.9123521180511386, + "ewc_loss": 0.05221102386713028, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021815513900946826, + "grad_norm": 6.063782691955566, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8506617546081543, + "num_tokens": 273617953.0, + "step": 7172 + }, + { + "epoch": 0.912479328329729, + "ewc_loss": 0.05223638564348221, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021840879344381392, + "grad_norm": 6.110560894012451, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8456696271896362, + "num_tokens": 273649959.0, + "step": 7173 + }, + { + "epoch": 0.9126065386083195, + "ewc_loss": 0.05227627605199814, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021880767599213868, + "grad_norm": 6.087267875671387, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8501821756362915, + "num_tokens": 273682666.0, + "step": 7174 + }, + { + "epoch": 0.9127337488869101, + "ewc_loss": 0.052214719355106354, + "ewc_loss_diag": 3.039836883544922e-05, + "ewc_loss_parallel": 0.00021819211542606354, + "grad_norm": 6.081056118011475, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8791494965553284, + "num_tokens": 273718641.0, + "step": 7175 + }, + { + "epoch": 0.9128609591655006, + "ewc_loss": 0.05240655690431595, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021888979244977236, + "grad_norm": 5.981131553649902, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8698652386665344, + "num_tokens": 273760502.0, + "step": 7176 + }, + { + "epoch": 0.9129881694440911, + "ewc_loss": 0.05243439972400665, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021916824334766716, + "grad_norm": 6.125795841217041, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8548588752746582, + "num_tokens": 273795068.0, + "step": 7177 + }, + { + "epoch": 0.9131153797226816, + "ewc_loss": 0.0524338036775589, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021916223340667784, + "grad_norm": 6.168181419372559, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8436583876609802, + "num_tokens": 273825099.0, + "step": 7178 + }, + { + "epoch": 0.9132425900012721, + "ewc_loss": 0.05239100754261017, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002187343197874725, + "grad_norm": 6.087541580200195, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8663790225982666, + "num_tokens": 273862566.0, + "step": 7179 + }, + { + "epoch": 0.9133698002798626, + "ewc_loss": 0.05237656459212303, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002185898629250005, + "grad_norm": 6.055242538452148, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8608883619308472, + "num_tokens": 273896107.0, + "step": 7180 + }, + { + "epoch": 0.9134970105584531, + "ewc_loss": 0.05233233422040939, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021814754290971905, + "grad_norm": 6.0119428634643555, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8655308485031128, + "num_tokens": 273934066.0, + "step": 7181 + }, + { + "epoch": 0.9136242208370436, + "ewc_loss": 0.05248444527387619, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002196686837123707, + "grad_norm": 6.081424236297607, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8599201440811157, + "num_tokens": 273971843.0, + "step": 7182 + }, + { + "epoch": 0.9137514311156342, + "ewc_loss": 0.05239146202802658, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002187388454331085, + "grad_norm": 6.114565372467041, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8531916737556458, + "num_tokens": 274001646.0, + "step": 7183 + }, + { + "epoch": 0.9138786413942247, + "ewc_loss": 0.05243941396474838, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021921836014371365, + "grad_norm": 6.076542377471924, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8452662825584412, + "num_tokens": 274042656.0, + "step": 7184 + }, + { + "epoch": 0.9140058516728151, + "ewc_loss": 0.05239761993288994, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021880041458643973, + "grad_norm": 6.033859729766846, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8462635278701782, + "num_tokens": 274084609.0, + "step": 7185 + }, + { + "epoch": 0.9141330619514056, + "ewc_loss": 0.052400194108486176, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021882618602830917, + "grad_norm": 6.01010274887085, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8636415004730225, + "num_tokens": 274127503.0, + "step": 7186 + }, + { + "epoch": 0.9142602722299962, + "ewc_loss": 0.052471376955509186, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021953799296170473, + "grad_norm": 6.0909528732299805, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.864707887172699, + "num_tokens": 274164126.0, + "step": 7187 + }, + { + "epoch": 0.9143874825085867, + "ewc_loss": 0.05249399691820145, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002197642024839297, + "grad_norm": 6.097795009613037, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8602266311645508, + "num_tokens": 274198104.0, + "step": 7188 + }, + { + "epoch": 0.9145146927871772, + "ewc_loss": 0.052368856966495514, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002185127668781206, + "grad_norm": 6.046594619750977, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8498064279556274, + "num_tokens": 274233487.0, + "step": 7189 + }, + { + "epoch": 0.9146419030657678, + "ewc_loss": 0.05238949880003929, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002187192003475502, + "grad_norm": 6.0625739097595215, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8632984161376953, + "num_tokens": 274267317.0, + "step": 7190 + }, + { + "epoch": 0.9147691133443582, + "ewc_loss": 0.05238012224435806, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021862545690964907, + "grad_norm": 6.020409107208252, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8523898124694824, + "num_tokens": 274304169.0, + "step": 7191 + }, + { + "epoch": 0.9148963236229487, + "ewc_loss": 0.05244208872318268, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002192451065639034, + "grad_norm": 6.078972339630127, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8608253002166748, + "num_tokens": 274343265.0, + "step": 7192 + }, + { + "epoch": 0.9150235339015392, + "ewc_loss": 0.05235018581151962, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021832608035765588, + "grad_norm": 6.042054176330566, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8438253998756409, + "num_tokens": 274380756.0, + "step": 7193 + }, + { + "epoch": 0.9151507441801298, + "ewc_loss": 0.052439238876104355, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021921661391388625, + "grad_norm": 6.026878833770752, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8714348077774048, + "num_tokens": 274422035.0, + "step": 7194 + }, + { + "epoch": 0.9152779544587203, + "ewc_loss": 0.05240447074174881, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002188689395552501, + "grad_norm": 6.032078266143799, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8510083556175232, + "num_tokens": 274460875.0, + "step": 7195 + }, + { + "epoch": 0.9154051647373108, + "ewc_loss": 0.05238461494445801, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021867037867195904, + "grad_norm": 6.043480396270752, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8626703023910522, + "num_tokens": 274495807.0, + "step": 7196 + }, + { + "epoch": 0.9155323750159012, + "ewc_loss": 0.052478305995464325, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021960730373393744, + "grad_norm": 6.052663326263428, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8679978251457214, + "num_tokens": 274533098.0, + "step": 7197 + }, + { + "epoch": 0.9156595852944918, + "ewc_loss": 0.052421197295188904, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021903619926888496, + "grad_norm": 6.104278087615967, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8552514314651489, + "num_tokens": 274565282.0, + "step": 7198 + }, + { + "epoch": 0.9157867955730823, + "ewc_loss": 0.052431195974349976, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002191362000303343, + "grad_norm": 6.07229471206665, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8633108139038086, + "num_tokens": 274605401.0, + "step": 7199 + }, + { + "epoch": 0.9159140058516728, + "ewc_loss": 0.05235877260565758, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021841193665750325, + "grad_norm": 6.1735520362854, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8541786670684814, + "num_tokens": 274645018.0, + "step": 7200 + }, + { + "epoch": 0.9160412161302633, + "ewc_loss": 0.0523516908288002, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021834114158991724, + "grad_norm": 6.057901859283447, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8691335916519165, + "num_tokens": 274681541.0, + "step": 7201 + }, + { + "epoch": 0.9161684264088539, + "ewc_loss": 0.05230942741036415, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021791849576402456, + "grad_norm": 6.087785243988037, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8637264370918274, + "num_tokens": 274716883.0, + "step": 7202 + }, + { + "epoch": 0.9162956366874443, + "ewc_loss": 0.05233548581600189, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021817907691001892, + "grad_norm": 6.051176071166992, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8542697429656982, + "num_tokens": 274753038.0, + "step": 7203 + }, + { + "epoch": 0.9164228469660348, + "ewc_loss": 0.0523572713136673, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021839694818481803, + "grad_norm": 6.030778408050537, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8665189743041992, + "num_tokens": 274790760.0, + "step": 7204 + }, + { + "epoch": 0.9165500572446253, + "ewc_loss": 0.05229896306991577, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002178138674935326, + "grad_norm": 6.058267593383789, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8529052734375, + "num_tokens": 274830872.0, + "step": 7205 + }, + { + "epoch": 0.9166772675232159, + "ewc_loss": 0.052371032536029816, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021853455109521747, + "grad_norm": 6.0871968269348145, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8478309512138367, + "num_tokens": 274869832.0, + "step": 7206 + }, + { + "epoch": 0.9168044778018064, + "ewc_loss": 0.052333489060401917, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021815912623424083, + "grad_norm": 6.113442420959473, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8524684309959412, + "num_tokens": 274903853.0, + "step": 7207 + }, + { + "epoch": 0.9169316880803969, + "ewc_loss": 0.05224665254354477, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021729075524490327, + "grad_norm": 6.021080493927002, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8580999374389648, + "num_tokens": 274942911.0, + "step": 7208 + }, + { + "epoch": 0.9170588983589874, + "ewc_loss": 0.05230693146586418, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002178935392294079, + "grad_norm": 6.022401809692383, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.866736114025116, + "num_tokens": 274986427.0, + "step": 7209 + }, + { + "epoch": 0.9171861086375779, + "ewc_loss": 0.0523037388920784, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002178616268793121, + "grad_norm": 6.089443683624268, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8620920777320862, + "num_tokens": 275021557.0, + "step": 7210 + }, + { + "epoch": 0.9173133189161684, + "ewc_loss": 0.05232175439596176, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021804175048600882, + "grad_norm": 6.057076454162598, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8483729362487793, + "num_tokens": 275061308.0, + "step": 7211 + }, + { + "epoch": 0.9174405291947589, + "ewc_loss": 0.052319880574941635, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021802302217110991, + "grad_norm": 6.173329830169678, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8535863161087036, + "num_tokens": 275101497.0, + "step": 7212 + }, + { + "epoch": 0.9175677394733495, + "ewc_loss": 0.05226556211709976, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002174798137275502, + "grad_norm": 6.069242477416992, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8483930826187134, + "num_tokens": 275138317.0, + "step": 7213 + }, + { + "epoch": 0.91769494975194, + "ewc_loss": 0.052327051758766174, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021809474856127053, + "grad_norm": 6.061341762542725, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8593564629554749, + "num_tokens": 275180119.0, + "step": 7214 + }, + { + "epoch": 0.9178221600305305, + "ewc_loss": 0.05217422544956207, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021656646276824176, + "grad_norm": 6.030155181884766, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8574167490005493, + "num_tokens": 275212083.0, + "step": 7215 + }, + { + "epoch": 0.9179493703091209, + "ewc_loss": 0.05229918658733368, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021781609393656254, + "grad_norm": 6.107657432556152, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8489170670509338, + "num_tokens": 275247872.0, + "step": 7216 + }, + { + "epoch": 0.9180765805877115, + "ewc_loss": 0.05225134640932083, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021733769972342998, + "grad_norm": 6.023703575134277, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8646002411842346, + "num_tokens": 275287022.0, + "step": 7217 + }, + { + "epoch": 0.918203790866302, + "ewc_loss": 0.05233057960867882, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002181300224037841, + "grad_norm": 6.106771469116211, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.861513614654541, + "num_tokens": 275322259.0, + "step": 7218 + }, + { + "epoch": 0.9183310011448925, + "ewc_loss": 0.05228318274021149, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002176560665247962, + "grad_norm": 6.040948867797852, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8738890886306763, + "num_tokens": 275357410.0, + "step": 7219 + }, + { + "epoch": 0.918458211423483, + "ewc_loss": 0.05222013220191002, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021702553203795105, + "grad_norm": 6.084245681762695, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8715978264808655, + "num_tokens": 275388720.0, + "step": 7220 + }, + { + "epoch": 0.9185854217020736, + "ewc_loss": 0.05232596397399902, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021808383462484926, + "grad_norm": 6.057023048400879, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8661117553710938, + "num_tokens": 275427699.0, + "step": 7221 + }, + { + "epoch": 0.918712631980664, + "ewc_loss": 0.0522867813706398, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021769200975541025, + "grad_norm": 6.003383636474609, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8634354472160339, + "num_tokens": 275472220.0, + "step": 7222 + }, + { + "epoch": 0.9188398422592545, + "ewc_loss": 0.052336402237415314, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021818821551278234, + "grad_norm": 6.074087619781494, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8483430743217468, + "num_tokens": 275506693.0, + "step": 7223 + }, + { + "epoch": 0.918967052537845, + "ewc_loss": 0.05227607488632202, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021758495131507516, + "grad_norm": 6.0193190574646, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8635317087173462, + "num_tokens": 275547295.0, + "step": 7224 + }, + { + "epoch": 0.9190942628164356, + "ewc_loss": 0.05233330652117729, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021815729269292206, + "grad_norm": 6.062063694000244, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8561524152755737, + "num_tokens": 275591552.0, + "step": 7225 + }, + { + "epoch": 0.9192214730950261, + "ewc_loss": 0.05228486657142639, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021767285943496972, + "grad_norm": 6.03613805770874, + "learning_rate": 1e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8346861600875854, + "num_tokens": 275632197.0, + "step": 7226 + }, + { + "epoch": 0.9193486833736166, + "ewc_loss": 0.052346717566251755, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021829138859175146, + "grad_norm": 6.068300724029541, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8522466421127319, + "num_tokens": 275670680.0, + "step": 7227 + }, + { + "epoch": 0.919475893652207, + "ewc_loss": 0.05228066071867943, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002176308335037902, + "grad_norm": 6.070666790008545, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8580821752548218, + "num_tokens": 275709315.0, + "step": 7228 + }, + { + "epoch": 0.9196031039307976, + "ewc_loss": 0.05234123021364212, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002182365278713405, + "grad_norm": 6.183187484741211, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8571392297744751, + "num_tokens": 275740405.0, + "step": 7229 + }, + { + "epoch": 0.9197303142093881, + "ewc_loss": 0.05219321697950363, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021675637981388718, + "grad_norm": 6.029252529144287, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8778208494186401, + "num_tokens": 275780915.0, + "step": 7230 + }, + { + "epoch": 0.9198575244879786, + "ewc_loss": 0.052254609763622284, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021737029601354152, + "grad_norm": 6.060101509094238, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8539326190948486, + "num_tokens": 275817832.0, + "step": 7231 + }, + { + "epoch": 0.9199847347665692, + "ewc_loss": 0.05219292268157005, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021675344032701105, + "grad_norm": 6.016284942626953, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.863174319267273, + "num_tokens": 275857621.0, + "step": 7232 + }, + { + "epoch": 0.9201119450451597, + "ewc_loss": 0.05228646099567413, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021768882288597524, + "grad_norm": 6.1077656745910645, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8565853834152222, + "num_tokens": 275898917.0, + "step": 7233 + }, + { + "epoch": 0.9202391553237501, + "ewc_loss": 0.052247270941734314, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002172969252569601, + "grad_norm": 6.066981792449951, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8639183640480042, + "num_tokens": 275930151.0, + "step": 7234 + }, + { + "epoch": 0.9203663656023406, + "ewc_loss": 0.052287861704826355, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002177028509322554, + "grad_norm": 6.092715263366699, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.865691602230072, + "num_tokens": 275966901.0, + "step": 7235 + }, + { + "epoch": 0.9204935758809312, + "ewc_loss": 0.052338264882564545, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021820685651618987, + "grad_norm": 6.0415191650390625, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8820158243179321, + "num_tokens": 276002845.0, + "step": 7236 + }, + { + "epoch": 0.9206207861595217, + "ewc_loss": 0.05226362124085426, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021746043057646602, + "grad_norm": 6.090011119842529, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8579126596450806, + "num_tokens": 276038476.0, + "step": 7237 + }, + { + "epoch": 0.9207479964381122, + "ewc_loss": 0.05232112854719162, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021803550771437585, + "grad_norm": 6.0894246101379395, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8552883863449097, + "num_tokens": 276077867.0, + "step": 7238 + }, + { + "epoch": 0.9208752067167028, + "ewc_loss": 0.05227141082286835, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002175383415305987, + "grad_norm": 6.024246692657471, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8499685525894165, + "num_tokens": 276123289.0, + "step": 7239 + }, + { + "epoch": 0.9210024169952932, + "ewc_loss": 0.052263837307691574, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021746259881183505, + "grad_norm": 6.103513717651367, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8468067646026611, + "num_tokens": 276157357.0, + "step": 7240 + }, + { + "epoch": 0.9211296272738837, + "ewc_loss": 0.052290692925453186, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002177311253035441, + "grad_norm": 6.006288528442383, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8807802200317383, + "num_tokens": 276195548.0, + "step": 7241 + }, + { + "epoch": 0.9212568375524742, + "ewc_loss": 0.05233597755432129, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002181839954573661, + "grad_norm": 6.070040225982666, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8617079257965088, + "num_tokens": 276233533.0, + "step": 7242 + }, + { + "epoch": 0.9213840478310648, + "ewc_loss": 0.05237000435590744, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000218524262891151, + "grad_norm": 6.1151018142700195, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8622844815254211, + "num_tokens": 276274310.0, + "step": 7243 + }, + { + "epoch": 0.9215112581096553, + "ewc_loss": 0.052374109625816345, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021856531384401023, + "grad_norm": 6.1093430519104, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8577079772949219, + "num_tokens": 276310272.0, + "step": 7244 + }, + { + "epoch": 0.9216384683882458, + "ewc_loss": 0.05237241089344025, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021854833175893873, + "grad_norm": 6.090553283691406, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8550172448158264, + "num_tokens": 276350334.0, + "step": 7245 + }, + { + "epoch": 0.9217656786668362, + "ewc_loss": 0.05230887234210968, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021791293693240732, + "grad_norm": 6.072845935821533, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8544226884841919, + "num_tokens": 276393913.0, + "step": 7246 + }, + { + "epoch": 0.9218928889454268, + "ewc_loss": 0.0523698553442955, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021852279314771295, + "grad_norm": 6.188884735107422, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8554051518440247, + "num_tokens": 276430368.0, + "step": 7247 + }, + { + "epoch": 0.9220200992240173, + "ewc_loss": 0.05226047709584236, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021742898388765752, + "grad_norm": 6.033590793609619, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8542506694793701, + "num_tokens": 276470930.0, + "step": 7248 + }, + { + "epoch": 0.9221473095026078, + "ewc_loss": 0.05229505896568298, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021777481015305966, + "grad_norm": 6.132638454437256, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8637429475784302, + "num_tokens": 276509359.0, + "step": 7249 + }, + { + "epoch": 0.9222745197811983, + "ewc_loss": 0.05226718634366989, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021749608276877552, + "grad_norm": 6.014597415924072, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.850931704044342, + "num_tokens": 276546980.0, + "step": 7250 + }, + { + "epoch": 0.9224017300597889, + "ewc_loss": 0.05232053995132446, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021802964329253882, + "grad_norm": 6.107321739196777, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8473347425460815, + "num_tokens": 276586212.0, + "step": 7251 + }, + { + "epoch": 0.9225289403383793, + "ewc_loss": 0.052317842841148376, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021800266404170543, + "grad_norm": 6.063704490661621, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8612514138221741, + "num_tokens": 276623946.0, + "step": 7252 + }, + { + "epoch": 0.9226561506169698, + "ewc_loss": 0.05233794450759888, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021820366964675486, + "grad_norm": 6.051494121551514, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8556649684906006, + "num_tokens": 276667567.0, + "step": 7253 + }, + { + "epoch": 0.9227833608955603, + "ewc_loss": 0.05231909826397896, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021801520779263228, + "grad_norm": 6.065056800842285, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8537822365760803, + "num_tokens": 276708479.0, + "step": 7254 + }, + { + "epoch": 0.9229105711741509, + "ewc_loss": 0.052354007959365845, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021836427913513035, + "grad_norm": 6.12516450881958, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8542657494544983, + "num_tokens": 276744627.0, + "step": 7255 + }, + { + "epoch": 0.9230377814527414, + "ewc_loss": 0.052391208708286285, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000218736327951774, + "grad_norm": 6.073034286499023, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8603651523590088, + "num_tokens": 276782830.0, + "step": 7256 + }, + { + "epoch": 0.9231649917313319, + "ewc_loss": 0.05233876407146454, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021821186237502843, + "grad_norm": 6.067899703979492, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8575615286827087, + "num_tokens": 276825745.0, + "step": 7257 + }, + { + "epoch": 0.9232922020099223, + "ewc_loss": 0.052425116300582886, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021907538757659495, + "grad_norm": 6.074070930480957, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8647873401641846, + "num_tokens": 276866743.0, + "step": 7258 + }, + { + "epoch": 0.9234194122885129, + "ewc_loss": 0.05242280662059784, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021905226458329707, + "grad_norm": 6.092662334442139, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8546309471130371, + "num_tokens": 276903884.0, + "step": 7259 + }, + { + "epoch": 0.9235466225671034, + "ewc_loss": 0.05241134762763977, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021893771190661937, + "grad_norm": 6.1097002029418945, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8653496503829956, + "num_tokens": 276939142.0, + "step": 7260 + }, + { + "epoch": 0.9236738328456939, + "ewc_loss": 0.05239298194646835, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021875405218452215, + "grad_norm": 6.081323623657227, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8605009317398071, + "num_tokens": 276979041.0, + "step": 7261 + }, + { + "epoch": 0.9238010431242845, + "ewc_loss": 0.052459727972745895, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021942150488030165, + "grad_norm": 6.128690719604492, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.872302770614624, + "num_tokens": 277015590.0, + "step": 7262 + }, + { + "epoch": 0.923928253402875, + "ewc_loss": 0.052379608154296875, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000218620290979743, + "grad_norm": 6.109591484069824, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8519579172134399, + "num_tokens": 277055745.0, + "step": 7263 + }, + { + "epoch": 0.9240554636814655, + "ewc_loss": 0.052439481019973755, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021921901497989893, + "grad_norm": 6.094381809234619, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8541581034660339, + "num_tokens": 277098856.0, + "step": 7264 + }, + { + "epoch": 0.9241826739600559, + "ewc_loss": 0.05239231139421463, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021874731464777142, + "grad_norm": 6.04847526550293, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8705912828445435, + "num_tokens": 277139165.0, + "step": 7265 + }, + { + "epoch": 0.9243098842386465, + "ewc_loss": 0.052484385669231415, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021966810163576156, + "grad_norm": 6.139777183532715, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8635925650596619, + "num_tokens": 277171879.0, + "step": 7266 + }, + { + "epoch": 0.924437094517237, + "ewc_loss": 0.05246194452047348, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021944368199910969, + "grad_norm": 6.097775936126709, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8604758977890015, + "num_tokens": 277211719.0, + "step": 7267 + }, + { + "epoch": 0.9245643047958275, + "ewc_loss": 0.05248555913567543, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021967980137560517, + "grad_norm": 6.105128765106201, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8500092029571533, + "num_tokens": 277252675.0, + "step": 7268 + }, + { + "epoch": 0.924691515074418, + "ewc_loss": 0.05249028652906418, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021972708054818213, + "grad_norm": 6.15546178817749, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8472194671630859, + "num_tokens": 277287529.0, + "step": 7269 + }, + { + "epoch": 0.9248187253530086, + "ewc_loss": 0.05247371643781662, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002195613633375615, + "grad_norm": 6.03175687789917, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8587809801101685, + "num_tokens": 277328392.0, + "step": 7270 + }, + { + "epoch": 0.924945935631599, + "ewc_loss": 0.0525321364402771, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002201455645263195, + "grad_norm": 6.09886360168457, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8693290948867798, + "num_tokens": 277368412.0, + "step": 7271 + }, + { + "epoch": 0.9250731459101895, + "ewc_loss": 0.05241501331329346, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021897432452533394, + "grad_norm": 6.0364813804626465, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8652780652046204, + "num_tokens": 277404373.0, + "step": 7272 + }, + { + "epoch": 0.92520035618878, + "ewc_loss": 0.05255335196852684, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002203577314503491, + "grad_norm": 6.13065242767334, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8555890321731567, + "num_tokens": 277444988.0, + "step": 7273 + }, + { + "epoch": 0.9253275664673706, + "ewc_loss": 0.05244654417037964, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021928966452833265, + "grad_norm": 6.112678050994873, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8682879209518433, + "num_tokens": 277487501.0, + "step": 7274 + }, + { + "epoch": 0.9254547767459611, + "ewc_loss": 0.052528511732816696, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022010933025740087, + "grad_norm": 6.087120056152344, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8563987016677856, + "num_tokens": 277529056.0, + "step": 7275 + }, + { + "epoch": 0.9255819870245516, + "ewc_loss": 0.052499353885650635, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002198177680838853, + "grad_norm": 6.134450435638428, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8705156445503235, + "num_tokens": 277563022.0, + "step": 7276 + }, + { + "epoch": 0.925709197303142, + "ewc_loss": 0.05250196158885956, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021984384511597455, + "grad_norm": 6.066819667816162, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.866100549697876, + "num_tokens": 277603879.0, + "step": 7277 + }, + { + "epoch": 0.9258364075817326, + "ewc_loss": 0.05250868201255798, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002199110167566687, + "grad_norm": 6.124682903289795, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.855240523815155, + "num_tokens": 277643711.0, + "step": 7278 + }, + { + "epoch": 0.9259636178603231, + "ewc_loss": 0.052481379359960556, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002196380082750693, + "grad_norm": 6.1620683670043945, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.859678328037262, + "num_tokens": 277680667.0, + "step": 7279 + }, + { + "epoch": 0.9260908281389136, + "ewc_loss": 0.052473172545433044, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021955592092126608, + "grad_norm": 6.159815788269043, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8566478490829468, + "num_tokens": 277715548.0, + "step": 7280 + }, + { + "epoch": 0.9262180384175042, + "ewc_loss": 0.05242542922496796, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021907850168645382, + "grad_norm": 6.151886940002441, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8600385189056396, + "num_tokens": 277745287.0, + "step": 7281 + }, + { + "epoch": 0.9263452486960947, + "ewc_loss": 0.052496880292892456, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002197930298279971, + "grad_norm": 6.104771614074707, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8538945913314819, + "num_tokens": 277781484.0, + "step": 7282 + }, + { + "epoch": 0.9264724589746851, + "ewc_loss": 0.0524304062128067, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021912828378845006, + "grad_norm": 6.17189359664917, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8615496158599854, + "num_tokens": 277818429.0, + "step": 7283 + }, + { + "epoch": 0.9265996692532756, + "ewc_loss": 0.05239930376410484, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021881725115235895, + "grad_norm": 6.105072498321533, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.867631733417511, + "num_tokens": 277853077.0, + "step": 7284 + }, + { + "epoch": 0.9267268795318662, + "ewc_loss": 0.052417345345020294, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000218997651245445, + "grad_norm": 6.0530619621276855, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8771499395370483, + "num_tokens": 277893997.0, + "step": 7285 + }, + { + "epoch": 0.9268540898104567, + "ewc_loss": 0.05244613438844681, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021928556088823825, + "grad_norm": 6.184294700622559, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8486608266830444, + "num_tokens": 277934680.0, + "step": 7286 + }, + { + "epoch": 0.9269813000890472, + "ewc_loss": 0.05239415168762207, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021876573737245053, + "grad_norm": 6.00147008895874, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8517584800720215, + "num_tokens": 277980370.0, + "step": 7287 + }, + { + "epoch": 0.9271085103676378, + "ewc_loss": 0.05252104625105858, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022003467893227935, + "grad_norm": 6.157470703125, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.871292233467102, + "num_tokens": 278017257.0, + "step": 7288 + }, + { + "epoch": 0.9272357206462282, + "ewc_loss": 0.05245714634656906, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021939568978268653, + "grad_norm": 6.108157157897949, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8592429161071777, + "num_tokens": 278054835.0, + "step": 7289 + }, + { + "epoch": 0.9273629309248187, + "ewc_loss": 0.052519045770168304, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002200146991526708, + "grad_norm": 6.084000110626221, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8584775924682617, + "num_tokens": 278089510.0, + "step": 7290 + }, + { + "epoch": 0.9274901412034092, + "ewc_loss": 0.052465248852968216, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021947671484667808, + "grad_norm": 6.088350296020508, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8611675500869751, + "num_tokens": 278126179.0, + "step": 7291 + }, + { + "epoch": 0.9276173514819998, + "ewc_loss": 0.05244234949350357, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021924769680481404, + "grad_norm": 6.0920515060424805, + "learning_rate": 1e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8376220464706421, + "num_tokens": 278170837.0, + "step": 7292 + }, + { + "epoch": 0.9277445617605903, + "ewc_loss": 0.05249850079417229, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021980922610964626, + "grad_norm": 6.043142795562744, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8636683225631714, + "num_tokens": 278207410.0, + "step": 7293 + }, + { + "epoch": 0.9278717720391808, + "ewc_loss": 0.05252627283334732, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022008696396369487, + "grad_norm": 6.133537292480469, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8797948360443115, + "num_tokens": 278240495.0, + "step": 7294 + }, + { + "epoch": 0.9279989823177712, + "ewc_loss": 0.052460525184869766, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002194294793298468, + "grad_norm": 6.062084197998047, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8513939380645752, + "num_tokens": 278278449.0, + "step": 7295 + }, + { + "epoch": 0.9281261925963618, + "ewc_loss": 0.05259903520345688, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022081458882894367, + "grad_norm": 6.092609405517578, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8692790865898132, + "num_tokens": 278321434.0, + "step": 7296 + }, + { + "epoch": 0.9282534028749523, + "ewc_loss": 0.05246688798069954, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021949310030322522, + "grad_norm": 6.104271411895752, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8545651435852051, + "num_tokens": 278363730.0, + "step": 7297 + }, + { + "epoch": 0.9283806131535428, + "ewc_loss": 0.052523404359817505, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002200582530349493, + "grad_norm": 6.09205436706543, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8606539964675903, + "num_tokens": 278395493.0, + "step": 7298 + }, + { + "epoch": 0.9285078234321333, + "ewc_loss": 0.05255855619907379, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022040978365112096, + "grad_norm": 6.129880428314209, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8625075221061707, + "num_tokens": 278433024.0, + "step": 7299 + }, + { + "epoch": 0.9286350337107239, + "ewc_loss": 0.052425600588321686, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021908024791628122, + "grad_norm": 6.045478343963623, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8605494499206543, + "num_tokens": 278471311.0, + "step": 7300 + }, + { + "epoch": 0.9287622439893143, + "ewc_loss": 0.05256885662674904, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022051278210710734, + "grad_norm": 6.149183750152588, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8786381483078003, + "num_tokens": 278509151.0, + "step": 7301 + }, + { + "epoch": 0.9288894542679048, + "ewc_loss": 0.052469294518232346, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021951716917101294, + "grad_norm": 6.066302299499512, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8671037554740906, + "num_tokens": 278548743.0, + "step": 7302 + }, + { + "epoch": 0.9290166645464953, + "ewc_loss": 0.05250734090805054, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021989764354657382, + "grad_norm": 6.056524753570557, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8717948198318481, + "num_tokens": 278590722.0, + "step": 7303 + }, + { + "epoch": 0.9291438748250859, + "ewc_loss": 0.05246490612626076, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021947328059468418, + "grad_norm": 6.1363935470581055, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8468165397644043, + "num_tokens": 278628150.0, + "step": 7304 + }, + { + "epoch": 0.9292710851036764, + "ewc_loss": 0.052514590322971344, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021997009753249586, + "grad_norm": 6.104333400726318, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8445683717727661, + "num_tokens": 278672543.0, + "step": 7305 + }, + { + "epoch": 0.9293982953822669, + "ewc_loss": 0.052480295300483704, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021962715254630893, + "grad_norm": 6.053510665893555, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8691266775131226, + "num_tokens": 278707234.0, + "step": 7306 + }, + { + "epoch": 0.9295255056608573, + "ewc_loss": 0.05253702774643898, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002201945026172325, + "grad_norm": 6.216513633728027, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8611392974853516, + "num_tokens": 278734678.0, + "step": 7307 + }, + { + "epoch": 0.9296527159394479, + "ewc_loss": 0.05249062180519104, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021973042748868465, + "grad_norm": 5.988269329071045, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.869631290435791, + "num_tokens": 278778284.0, + "step": 7308 + }, + { + "epoch": 0.9297799262180384, + "ewc_loss": 0.05263473093509674, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022117150365374982, + "grad_norm": 6.154801845550537, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8587404489517212, + "num_tokens": 278815363.0, + "step": 7309 + }, + { + "epoch": 0.9299071364966289, + "ewc_loss": 0.05259695649147034, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022079376503825188, + "grad_norm": 6.077602863311768, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8480372428894043, + "num_tokens": 278860777.0, + "step": 7310 + }, + { + "epoch": 0.9300343467752195, + "ewc_loss": 0.05271635949611664, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002219878078904003, + "grad_norm": 6.197990417480469, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.873772919178009, + "num_tokens": 278890552.0, + "step": 7311 + }, + { + "epoch": 0.93016155705381, + "ewc_loss": 0.05261799693107605, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002210042002843693, + "grad_norm": 6.144880294799805, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8483067154884338, + "num_tokens": 278924741.0, + "step": 7312 + }, + { + "epoch": 0.9302887673324005, + "ewc_loss": 0.05260338634252548, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022085808450356126, + "grad_norm": 6.106060981750488, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8694023489952087, + "num_tokens": 278965185.0, + "step": 7313 + }, + { + "epoch": 0.9304159776109909, + "ewc_loss": 0.052664924412965775, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022147345589473844, + "grad_norm": 6.168700218200684, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8489450216293335, + "num_tokens": 278998961.0, + "step": 7314 + }, + { + "epoch": 0.9305431878895815, + "ewc_loss": 0.052573852241039276, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022056275338400155, + "grad_norm": 6.180756092071533, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8481069207191467, + "num_tokens": 279032689.0, + "step": 7315 + }, + { + "epoch": 0.930670398168172, + "ewc_loss": 0.05257029086351395, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022052710119169205, + "grad_norm": 6.0458197593688965, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8472087383270264, + "num_tokens": 279080305.0, + "step": 7316 + }, + { + "epoch": 0.9307976084467625, + "ewc_loss": 0.052587319165468216, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022069740225560963, + "grad_norm": 6.072131156921387, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8601289987564087, + "num_tokens": 279120581.0, + "step": 7317 + }, + { + "epoch": 0.930924818725353, + "ewc_loss": 0.05261177569627762, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022094199084676802, + "grad_norm": 6.082483291625977, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8715379238128662, + "num_tokens": 279165050.0, + "step": 7318 + }, + { + "epoch": 0.9310520290039436, + "ewc_loss": 0.052617140114307404, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022099561465438455, + "grad_norm": 6.077065944671631, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8640853762626648, + "num_tokens": 279209393.0, + "step": 7319 + }, + { + "epoch": 0.931179239282534, + "ewc_loss": 0.05261274427175522, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022095165331847966, + "grad_norm": 6.132159233093262, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8473677635192871, + "num_tokens": 279250569.0, + "step": 7320 + }, + { + "epoch": 0.9313064495611245, + "ewc_loss": 0.05259595438838005, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022078375332057476, + "grad_norm": 6.0709686279296875, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.86586993932724, + "num_tokens": 279291154.0, + "step": 7321 + }, + { + "epoch": 0.931433659839715, + "ewc_loss": 0.05255715548992157, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022039577015675604, + "grad_norm": 6.186067581176758, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8590242862701416, + "num_tokens": 279326967.0, + "step": 7322 + }, + { + "epoch": 0.9315608701183056, + "ewc_loss": 0.0525456927716732, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002202811447205022, + "grad_norm": 6.30239200592041, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8730161190032959, + "num_tokens": 279367129.0, + "step": 7323 + }, + { + "epoch": 0.9316880803968961, + "ewc_loss": 0.052472591400146484, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021955014381092042, + "grad_norm": 6.103690147399902, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8661807775497437, + "num_tokens": 279403651.0, + "step": 7324 + }, + { + "epoch": 0.9318152906754866, + "ewc_loss": 0.05244920402765274, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002193162654293701, + "grad_norm": 6.179919719696045, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8527557849884033, + "num_tokens": 279439166.0, + "step": 7325 + }, + { + "epoch": 0.931942500954077, + "ewc_loss": 0.05243082344532013, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021913243108429015, + "grad_norm": 6.099496364593506, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8582030534744263, + "num_tokens": 279473489.0, + "step": 7326 + }, + { + "epoch": 0.9320697112326676, + "ewc_loss": 0.05253777280449867, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002202019386459142, + "grad_norm": 6.13778018951416, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8636966347694397, + "num_tokens": 279511709.0, + "step": 7327 + }, + { + "epoch": 0.9321969215112581, + "ewc_loss": 0.052418336272239685, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021900756109971553, + "grad_norm": 6.090169429779053, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8489203453063965, + "num_tokens": 279551638.0, + "step": 7328 + }, + { + "epoch": 0.9323241317898486, + "ewc_loss": 0.052520401775836945, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022002823243383318, + "grad_norm": 6.291171073913574, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8707718849182129, + "num_tokens": 279587304.0, + "step": 7329 + }, + { + "epoch": 0.9324513420684392, + "ewc_loss": 0.05243731290102005, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021919736173003912, + "grad_norm": 7.0095109939575195, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8508336544036865, + "num_tokens": 279628138.0, + "step": 7330 + }, + { + "epoch": 0.9325785523470297, + "ewc_loss": 0.05246948078274727, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021951903181616217, + "grad_norm": 6.241546630859375, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.86397385597229, + "num_tokens": 279663064.0, + "step": 7331 + }, + { + "epoch": 0.9327057626256201, + "ewc_loss": 0.05200919881463051, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021491620282176882, + "grad_norm": 6.0491132736206055, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8516379594802856, + "num_tokens": 279704937.0, + "step": 7332 + }, + { + "epoch": 0.9328329729042106, + "ewc_loss": 0.0521911159157753, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002167353522963822, + "grad_norm": 6.3867669105529785, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8622365593910217, + "num_tokens": 279738215.0, + "step": 7333 + }, + { + "epoch": 0.9329601831828012, + "ewc_loss": 0.052121661603450775, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021604083303827792, + "grad_norm": 6.0219316482543945, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8433826565742493, + "num_tokens": 279771996.0, + "step": 7334 + }, + { + "epoch": 0.9330873934613917, + "ewc_loss": 0.052333228290081024, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002181564923375845, + "grad_norm": 6.126950740814209, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8740111589431763, + "num_tokens": 279810037.0, + "step": 7335 + }, + { + "epoch": 0.9332146037399822, + "ewc_loss": 0.052244171500205994, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021726595878135413, + "grad_norm": 6.176289081573486, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8527224659919739, + "num_tokens": 279847394.0, + "step": 7336 + }, + { + "epoch": 0.9333418140185727, + "ewc_loss": 0.05232717841863632, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021809602912981063, + "grad_norm": 6.1034674644470215, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8452602624893188, + "num_tokens": 279879057.0, + "step": 7337 + }, + { + "epoch": 0.9334690242971632, + "ewc_loss": 0.05234292149543762, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021825343719683588, + "grad_norm": 6.133648872375488, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8532793521881104, + "num_tokens": 279911775.0, + "step": 7338 + }, + { + "epoch": 0.9335962345757537, + "ewc_loss": 0.05237029492855072, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021852714417036623, + "grad_norm": 6.191295146942139, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8621459007263184, + "num_tokens": 279947661.0, + "step": 7339 + }, + { + "epoch": 0.9337234448543442, + "ewc_loss": 0.05240527167916298, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002188769431086257, + "grad_norm": 6.153835773468018, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8610421419143677, + "num_tokens": 279977507.0, + "step": 7340 + }, + { + "epoch": 0.9338506551329347, + "ewc_loss": 0.05236463248729706, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002184705517720431, + "grad_norm": 6.0512518882751465, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8565378785133362, + "num_tokens": 280016749.0, + "step": 7341 + }, + { + "epoch": 0.9339778654115253, + "ewc_loss": 0.05242373049259186, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021906154870521277, + "grad_norm": 6.131416320800781, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8616442680358887, + "num_tokens": 280056092.0, + "step": 7342 + }, + { + "epoch": 0.9341050756901158, + "ewc_loss": 0.05245015770196915, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021932581148575991, + "grad_norm": 6.089735984802246, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8714545965194702, + "num_tokens": 280089803.0, + "step": 7343 + }, + { + "epoch": 0.9342322859687062, + "ewc_loss": 0.052511200308799744, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021993622067384422, + "grad_norm": 6.1237711906433105, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8480113744735718, + "num_tokens": 280126190.0, + "step": 7344 + }, + { + "epoch": 0.9343594962472968, + "ewc_loss": 0.052492834627628326, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021975259005557746, + "grad_norm": 6.067357063293457, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8705559968948364, + "num_tokens": 280162743.0, + "step": 7345 + }, + { + "epoch": 0.9344867065258873, + "ewc_loss": 0.05257868766784668, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022061110939830542, + "grad_norm": 6.0979838371276855, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8594802021980286, + "num_tokens": 280203676.0, + "step": 7346 + }, + { + "epoch": 0.9346139168044778, + "ewc_loss": 0.05254098027944565, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022023404017090797, + "grad_norm": 6.08229923248291, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.876787543296814, + "num_tokens": 280241765.0, + "step": 7347 + }, + { + "epoch": 0.9347411270830683, + "ewc_loss": 0.05262518301606178, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022107604308985174, + "grad_norm": 6.153250694274902, + "learning_rate": 1e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8390474319458008, + "num_tokens": 280280103.0, + "step": 7348 + }, + { + "epoch": 0.9348683373616589, + "ewc_loss": 0.05255896598100662, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002204138581873849, + "grad_norm": 6.109264850616455, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8432563543319702, + "num_tokens": 280314674.0, + "step": 7349 + }, + { + "epoch": 0.9349955476402493, + "ewc_loss": 0.052556898444890976, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022039319446776062, + "grad_norm": 6.079817295074463, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8618112802505493, + "num_tokens": 280353890.0, + "step": 7350 + }, + { + "epoch": 0.9351227579188398, + "ewc_loss": 0.052602458745241165, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002208488149335608, + "grad_norm": 6.068936824798584, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8528284430503845, + "num_tokens": 280396403.0, + "step": 7351 + }, + { + "epoch": 0.9352499681974303, + "ewc_loss": 0.05259794741868973, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022080368944443762, + "grad_norm": 6.110069751739502, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8598397374153137, + "num_tokens": 280436154.0, + "step": 7352 + }, + { + "epoch": 0.9353771784760209, + "ewc_loss": 0.05261138454079628, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022093806182965636, + "grad_norm": 6.083787441253662, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8662518262863159, + "num_tokens": 280476480.0, + "step": 7353 + }, + { + "epoch": 0.9355043887546114, + "ewc_loss": 0.052558548748493195, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022040972544346005, + "grad_norm": 6.184356689453125, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8639065623283386, + "num_tokens": 280505976.0, + "step": 7354 + }, + { + "epoch": 0.9356315990332019, + "ewc_loss": 0.05254778265953064, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002203020267188549, + "grad_norm": 6.031294822692871, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8562167882919312, + "num_tokens": 280548365.0, + "step": 7355 + }, + { + "epoch": 0.9357588093117923, + "ewc_loss": 0.0525522455573082, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022034667199477553, + "grad_norm": 6.084533214569092, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8780717253684998, + "num_tokens": 280588074.0, + "step": 7356 + }, + { + "epoch": 0.9358860195903829, + "ewc_loss": 0.052565477788448334, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022047897800803185, + "grad_norm": 6.064183235168457, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8696674108505249, + "num_tokens": 280626915.0, + "step": 7357 + }, + { + "epoch": 0.9360132298689734, + "ewc_loss": 0.05257954075932503, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000220619622268714, + "grad_norm": 6.115379333496094, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8563560247421265, + "num_tokens": 280660088.0, + "step": 7358 + }, + { + "epoch": 0.9361404401475639, + "ewc_loss": 0.05256298556923866, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002204540796810761, + "grad_norm": 6.053190231323242, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8582924604415894, + "num_tokens": 280701326.0, + "step": 7359 + }, + { + "epoch": 0.9362676504261545, + "ewc_loss": 0.05252145975828171, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022003882622811943, + "grad_norm": 6.068948745727539, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8700246810913086, + "num_tokens": 280740256.0, + "step": 7360 + }, + { + "epoch": 0.936394860704745, + "ewc_loss": 0.05257409065961838, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000220565139898099, + "grad_norm": 6.05247688293457, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8741037249565125, + "num_tokens": 280773515.0, + "step": 7361 + }, + { + "epoch": 0.9365220709833355, + "ewc_loss": 0.052639953792095184, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022122373047750443, + "grad_norm": 6.123908996582031, + "learning_rate": 1e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8353244066238403, + "num_tokens": 280817207.0, + "step": 7362 + }, + { + "epoch": 0.9366492812619259, + "ewc_loss": 0.05261678621172905, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022099207853898406, + "grad_norm": 6.041905879974365, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8475481271743774, + "num_tokens": 280860253.0, + "step": 7363 + }, + { + "epoch": 0.9367764915405165, + "ewc_loss": 0.05270978808403015, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022192207688931376, + "grad_norm": 6.193585395812988, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8564136028289795, + "num_tokens": 280892634.0, + "step": 7364 + }, + { + "epoch": 0.936903701819107, + "ewc_loss": 0.05263569951057434, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022118119522929192, + "grad_norm": 6.041409969329834, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8689691424369812, + "num_tokens": 280930912.0, + "step": 7365 + }, + { + "epoch": 0.9370309120976975, + "ewc_loss": 0.052687469869852066, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022169892326928675, + "grad_norm": 6.071292400360107, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8628284931182861, + "num_tokens": 280971588.0, + "step": 7366 + }, + { + "epoch": 0.937158122376288, + "ewc_loss": 0.05270387977361679, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002218629961134866, + "grad_norm": 6.17018461227417, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8687101602554321, + "num_tokens": 281009898.0, + "step": 7367 + }, + { + "epoch": 0.9372853326548786, + "ewc_loss": 0.052634336054325104, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022116760374046862, + "grad_norm": 6.092916965484619, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8516142964363098, + "num_tokens": 281046726.0, + "step": 7368 + }, + { + "epoch": 0.937412542933469, + "ewc_loss": 0.05264792591333389, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002213034895248711, + "grad_norm": 6.10752534866333, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8741291165351868, + "num_tokens": 281085163.0, + "step": 7369 + }, + { + "epoch": 0.9375397532120595, + "ewc_loss": 0.05266615003347397, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022148570860736072, + "grad_norm": 6.110100269317627, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8657150268554688, + "num_tokens": 281128660.0, + "step": 7370 + }, + { + "epoch": 0.93766696349065, + "ewc_loss": 0.05265188217163086, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022134302707854658, + "grad_norm": 6.179376125335693, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8414031863212585, + "num_tokens": 281163780.0, + "step": 7371 + }, + { + "epoch": 0.9377941737692406, + "ewc_loss": 0.052672430872917175, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022154852922540158, + "grad_norm": 6.0598039627075195, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8689203858375549, + "num_tokens": 281204247.0, + "step": 7372 + }, + { + "epoch": 0.9379213840478311, + "ewc_loss": 0.05271465331315994, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022197075304575264, + "grad_norm": 6.140765190124512, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8848583102226257, + "num_tokens": 281239518.0, + "step": 7373 + }, + { + "epoch": 0.9380485943264216, + "ewc_loss": 0.05263254791498184, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022114967578090727, + "grad_norm": 6.076378345489502, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.868810772895813, + "num_tokens": 281275916.0, + "step": 7374 + }, + { + "epoch": 0.938175804605012, + "ewc_loss": 0.05266312137246132, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002214554260717705, + "grad_norm": 6.172363758087158, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8702640533447266, + "num_tokens": 281307615.0, + "step": 7375 + }, + { + "epoch": 0.9383030148836026, + "ewc_loss": 0.05264190956950188, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022124331735540181, + "grad_norm": 6.147608757019043, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8654307126998901, + "num_tokens": 281342677.0, + "step": 7376 + }, + { + "epoch": 0.9384302251621931, + "ewc_loss": 0.052579645067453384, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022062067000661045, + "grad_norm": 6.219204425811768, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8557902574539185, + "num_tokens": 281379545.0, + "step": 7377 + }, + { + "epoch": 0.9385574354407836, + "ewc_loss": 0.052562568336725235, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022044990328140557, + "grad_norm": 6.062166690826416, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8420352935791016, + "num_tokens": 281424169.0, + "step": 7378 + }, + { + "epoch": 0.9386846457193742, + "ewc_loss": 0.05252416059374809, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022006582003086805, + "grad_norm": 6.066212177276611, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8805118799209595, + "num_tokens": 281461306.0, + "step": 7379 + }, + { + "epoch": 0.9388118559979647, + "ewc_loss": 0.052610844373703, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002209326485171914, + "grad_norm": 6.12565279006958, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8743960857391357, + "num_tokens": 281498753.0, + "step": 7380 + }, + { + "epoch": 0.9389390662765551, + "ewc_loss": 0.0525345578789711, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022016980801708996, + "grad_norm": 6.106989860534668, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8697468042373657, + "num_tokens": 281537163.0, + "step": 7381 + }, + { + "epoch": 0.9390662765551456, + "ewc_loss": 0.05257934331893921, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022061762865632772, + "grad_norm": 6.139345645904541, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.852136492729187, + "num_tokens": 281577926.0, + "step": 7382 + }, + { + "epoch": 0.9391934868337362, + "ewc_loss": 0.0525958277285099, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022078250185586512, + "grad_norm": 6.075511455535889, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8795024752616882, + "num_tokens": 281621283.0, + "step": 7383 + }, + { + "epoch": 0.9393206971123267, + "ewc_loss": 0.052679643034935, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022162063396535814, + "grad_norm": 6.19968318939209, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8634108304977417, + "num_tokens": 281658336.0, + "step": 7384 + }, + { + "epoch": 0.9394479073909172, + "ewc_loss": 0.052510783076286316, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021993202972225845, + "grad_norm": 6.039172172546387, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8645089864730835, + "num_tokens": 281695137.0, + "step": 7385 + }, + { + "epoch": 0.9395751176695077, + "ewc_loss": 0.052723292261362076, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022205714776646346, + "grad_norm": 6.268100738525391, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8750086426734924, + "num_tokens": 281733570.0, + "step": 7386 + }, + { + "epoch": 0.9397023279480982, + "ewc_loss": 0.052562326192855835, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002204474585596472, + "grad_norm": 6.098948955535889, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8618476390838623, + "num_tokens": 281771163.0, + "step": 7387 + }, + { + "epoch": 0.9398295382266887, + "ewc_loss": 0.05263683199882507, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022119256027508527, + "grad_norm": 6.241265773773193, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8635618686676025, + "num_tokens": 281813715.0, + "step": 7388 + }, + { + "epoch": 0.9399567485052792, + "ewc_loss": 0.05251489207148552, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021997313888277858, + "grad_norm": 6.050343990325928, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8587446808815002, + "num_tokens": 281856921.0, + "step": 7389 + }, + { + "epoch": 0.9400839587838697, + "ewc_loss": 0.05263088643550873, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002211330720456317, + "grad_norm": 6.2771148681640625, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8519459962844849, + "num_tokens": 281893273.0, + "step": 7390 + }, + { + "epoch": 0.9402111690624603, + "ewc_loss": 0.05249699950218201, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00021979423763696104, + "grad_norm": 6.108547210693359, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8651612997055054, + "num_tokens": 281927452.0, + "step": 7391 + }, + { + "epoch": 0.9403383793410508, + "ewc_loss": 0.05262278765439987, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022105207608547062, + "grad_norm": 6.092408180236816, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8860254287719727, + "num_tokens": 281965359.0, + "step": 7392 + }, + { + "epoch": 0.9404655896196412, + "ewc_loss": 0.05259448289871216, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022076904133427888, + "grad_norm": 6.175586700439453, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8577011823654175, + "num_tokens": 281999209.0, + "step": 7393 + }, + { + "epoch": 0.9405927998982317, + "ewc_loss": 0.052615661174058914, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022098082990851253, + "grad_norm": 6.106069087982178, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.850309431552887, + "num_tokens": 282042129.0, + "step": 7394 + }, + { + "epoch": 0.9407200101768223, + "ewc_loss": 0.05289872735738754, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022137009364087135, + "grad_norm": 6.09901762008667, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8520934581756592, + "num_tokens": 282087941.0, + "step": 7395 + }, + { + "epoch": 0.9408472204554128, + "ewc_loss": 0.052615270018577576, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022097694454714656, + "grad_norm": 6.193798542022705, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8656893968582153, + "num_tokens": 282123130.0, + "step": 7396 + }, + { + "epoch": 0.9409744307340033, + "ewc_loss": 0.05257284641265869, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022055268345866352, + "grad_norm": 6.120781898498535, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8571678996086121, + "num_tokens": 282161142.0, + "step": 7397 + }, + { + "epoch": 0.9411016410125939, + "ewc_loss": 0.05263567343354225, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022118096239864826, + "grad_norm": 6.124969482421875, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.884589433670044, + "num_tokens": 282196363.0, + "step": 7398 + }, + { + "epoch": 0.9412288512911843, + "ewc_loss": 0.05262623727321625, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022108657867647707, + "grad_norm": 6.15317964553833, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.841699481010437, + "num_tokens": 282233202.0, + "step": 7399 + }, + { + "epoch": 0.9413560615697748, + "ewc_loss": 0.05286495387554169, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022103235824033618, + "grad_norm": 6.192080974578857, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8624045848846436, + "num_tokens": 282270976.0, + "step": 7400 + }, + { + "epoch": 0.9414832718483653, + "ewc_loss": 0.052604757249355316, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002208718069596216, + "grad_norm": 6.110222339630127, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8704979419708252, + "num_tokens": 282312223.0, + "step": 7401 + }, + { + "epoch": 0.9416104821269559, + "ewc_loss": 0.05261358246207237, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002209600352216512, + "grad_norm": 6.1804375648498535, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.844611644744873, + "num_tokens": 282350418.0, + "step": 7402 + }, + { + "epoch": 0.9417376924055464, + "ewc_loss": 0.052622608840465546, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002210503298556432, + "grad_norm": 6.152971267700195, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8617234230041504, + "num_tokens": 282380751.0, + "step": 7403 + }, + { + "epoch": 0.9418649026841369, + "ewc_loss": 0.05267662554979324, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022159046784508973, + "grad_norm": 6.158840656280518, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8726012110710144, + "num_tokens": 282420248.0, + "step": 7404 + }, + { + "epoch": 0.9419921129627273, + "ewc_loss": 0.05263940244913101, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022121824440546334, + "grad_norm": 6.135243892669678, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.864874005317688, + "num_tokens": 282457951.0, + "step": 7405 + }, + { + "epoch": 0.9421193232413179, + "ewc_loss": 0.052683137357234955, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022165557311382145, + "grad_norm": 6.0879058837890625, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8611263036727905, + "num_tokens": 282507379.0, + "step": 7406 + }, + { + "epoch": 0.9422465335199084, + "ewc_loss": 0.05269666761159897, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002217908768216148, + "grad_norm": 6.146311283111572, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8489010334014893, + "num_tokens": 282546905.0, + "step": 7407 + }, + { + "epoch": 0.9423737437984989, + "ewc_loss": 0.052670352160930634, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022152771998662502, + "grad_norm": 6.090559482574463, + "learning_rate": 1e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8324089050292969, + "num_tokens": 282591368.0, + "step": 7408 + }, + { + "epoch": 0.9425009540770894, + "ewc_loss": 0.05280141532421112, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022283836733549833, + "grad_norm": 6.2677321434021, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8590967655181885, + "num_tokens": 282617331.0, + "step": 7409 + }, + { + "epoch": 0.94262816435568, + "ewc_loss": 0.052742138504981995, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022224558051675558, + "grad_norm": 6.1060686111450195, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8738217949867249, + "num_tokens": 282652346.0, + "step": 7410 + }, + { + "epoch": 0.9427553746342705, + "ewc_loss": 0.05280328914523125, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022285711020231247, + "grad_norm": 6.127701759338379, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8657400608062744, + "num_tokens": 282691564.0, + "step": 7411 + }, + { + "epoch": 0.9428825849128609, + "ewc_loss": 0.05283219367265701, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002231461403425783, + "grad_norm": 6.1566033363342285, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8714512586593628, + "num_tokens": 282728186.0, + "step": 7412 + }, + { + "epoch": 0.9430097951914514, + "ewc_loss": 0.05282832682132721, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022310747590381652, + "grad_norm": 6.160372257232666, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8545436859130859, + "num_tokens": 282770491.0, + "step": 7413 + }, + { + "epoch": 0.943137005470042, + "ewc_loss": 0.052774760872125626, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022257183445617557, + "grad_norm": 6.068332195281982, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8543215990066528, + "num_tokens": 282809824.0, + "step": 7414 + }, + { + "epoch": 0.9432642157486325, + "ewc_loss": 0.0528256930410862, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002230811514891684, + "grad_norm": 6.153544902801514, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8667676448822021, + "num_tokens": 282846735.0, + "step": 7415 + }, + { + "epoch": 0.943391426027223, + "ewc_loss": 0.05281251296401024, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002229493547929451, + "grad_norm": 6.114861011505127, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8806005716323853, + "num_tokens": 282882242.0, + "step": 7416 + }, + { + "epoch": 0.9435186363058136, + "ewc_loss": 0.05275285616517067, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022235278447624296, + "grad_norm": 6.150211811065674, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8489747047424316, + "num_tokens": 282918774.0, + "step": 7417 + }, + { + "epoch": 0.943645846584404, + "ewc_loss": 0.052854668349027634, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022337090922519565, + "grad_norm": 6.144460678100586, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8526226878166199, + "num_tokens": 282957766.0, + "step": 7418 + }, + { + "epoch": 0.9437730568629945, + "ewc_loss": 0.052779197692871094, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022261621779762208, + "grad_norm": 6.158621311187744, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8491272926330566, + "num_tokens": 282995980.0, + "step": 7419 + }, + { + "epoch": 0.943900267141585, + "ewc_loss": 0.05279286950826645, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022275290393736213, + "grad_norm": 6.160756587982178, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8612037897109985, + "num_tokens": 283033148.0, + "step": 7420 + }, + { + "epoch": 0.9440274774201756, + "ewc_loss": 0.05281279981136322, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022295220696832985, + "grad_norm": 6.152403354644775, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8647940158843994, + "num_tokens": 283069781.0, + "step": 7421 + }, + { + "epoch": 0.9441546876987661, + "ewc_loss": 0.05285303294658661, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022335453832056373, + "grad_norm": 6.20396614074707, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8527459502220154, + "num_tokens": 283107270.0, + "step": 7422 + }, + { + "epoch": 0.9442818979773566, + "ewc_loss": 0.052749939262866974, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022232363699004054, + "grad_norm": 6.178895473480225, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8725122213363647, + "num_tokens": 283143571.0, + "step": 7423 + }, + { + "epoch": 0.944409108255947, + "ewc_loss": 0.052776649594306946, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022259069373831153, + "grad_norm": 6.172308921813965, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8653436303138733, + "num_tokens": 283179995.0, + "step": 7424 + }, + { + "epoch": 0.9445363185345376, + "ewc_loss": 0.05276693403720856, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002224935742560774, + "grad_norm": 6.170412540435791, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8610388040542603, + "num_tokens": 283218370.0, + "step": 7425 + }, + { + "epoch": 0.9446635288131281, + "ewc_loss": 0.0527222603559494, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022204683045856655, + "grad_norm": 6.269734859466553, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8545328378677368, + "num_tokens": 283247879.0, + "step": 7426 + }, + { + "epoch": 0.9447907390917186, + "ewc_loss": 0.052672263234853745, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022154685575515032, + "grad_norm": 6.104212760925293, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8767350912094116, + "num_tokens": 283285027.0, + "step": 7427 + }, + { + "epoch": 0.9449179493703092, + "ewc_loss": 0.052709080278873444, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022191500465851277, + "grad_norm": 6.204014301300049, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8641021251678467, + "num_tokens": 283324934.0, + "step": 7428 + }, + { + "epoch": 0.9450451596488997, + "ewc_loss": 0.05267077311873436, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022153195459395647, + "grad_norm": 6.1238250732421875, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8681455850601196, + "num_tokens": 283357540.0, + "step": 7429 + }, + { + "epoch": 0.9451723699274901, + "ewc_loss": 0.05269525572657585, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022177677601575851, + "grad_norm": 6.223931312561035, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.858553946018219, + "num_tokens": 283393470.0, + "step": 7430 + }, + { + "epoch": 0.9452995802060806, + "ewc_loss": 0.05259161815047264, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022074040316510946, + "grad_norm": 6.084449291229248, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8611252903938293, + "num_tokens": 283429916.0, + "step": 7431 + }, + { + "epoch": 0.9454267904846712, + "ewc_loss": 0.05279960855841637, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022282030840869993, + "grad_norm": 6.196103572845459, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8486181497573853, + "num_tokens": 283467997.0, + "step": 7432 + }, + { + "epoch": 0.9455540007632617, + "ewc_loss": 0.05266926437616348, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022151684970594943, + "grad_norm": 6.188347816467285, + "learning_rate": 1e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8255741000175476, + "num_tokens": 283504741.0, + "step": 7433 + }, + { + "epoch": 0.9456812110418522, + "ewc_loss": 0.052937157452106476, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022175439517013729, + "grad_norm": 6.242062091827393, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8693761825561523, + "num_tokens": 283534535.0, + "step": 7434 + }, + { + "epoch": 0.9458084213204427, + "ewc_loss": 0.05265315622091293, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022135577455628663, + "grad_norm": 6.120373725891113, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8723543882369995, + "num_tokens": 283571600.0, + "step": 7435 + }, + { + "epoch": 0.9459356315990332, + "ewc_loss": 0.05264827609062195, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022130698198452592, + "grad_norm": 6.1831135749816895, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8476446270942688, + "num_tokens": 283605662.0, + "step": 7436 + }, + { + "epoch": 0.9460628418776237, + "ewc_loss": 0.05275210365653038, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002223452611360699, + "grad_norm": 6.126378536224365, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8608689308166504, + "num_tokens": 283642868.0, + "step": 7437 + }, + { + "epoch": 0.9461900521562142, + "ewc_loss": 0.05295432358980179, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022192606411408633, + "grad_norm": 6.192876815795898, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.851048469543457, + "num_tokens": 283675604.0, + "step": 7438 + }, + { + "epoch": 0.9463172624348047, + "ewc_loss": 0.053229883313179016, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000222240254515782, + "grad_norm": 6.125519275665283, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.881271481513977, + "num_tokens": 283709459.0, + "step": 7439 + }, + { + "epoch": 0.9464444727133953, + "ewc_loss": 0.053197070956230164, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002219120942754671, + "grad_norm": 6.169441223144531, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8609970211982727, + "num_tokens": 283748020.0, + "step": 7440 + }, + { + "epoch": 0.9465716829919858, + "ewc_loss": 0.053254250437021255, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022248391178436577, + "grad_norm": 6.182078838348389, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.855228841304779, + "num_tokens": 283778514.0, + "step": 7441 + }, + { + "epoch": 0.9466988932705762, + "ewc_loss": 0.05318724364042282, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002218138542957604, + "grad_norm": 6.111086845397949, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.864514172077179, + "num_tokens": 283819764.0, + "step": 7442 + }, + { + "epoch": 0.9468261035491667, + "ewc_loss": 0.05322551727294922, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022219658421818167, + "grad_norm": 6.205623149871826, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8462244272232056, + "num_tokens": 283853897.0, + "step": 7443 + }, + { + "epoch": 0.9469533138277573, + "ewc_loss": 0.053111083805561066, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022105223615653813, + "grad_norm": 6.124383926391602, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8750938177108765, + "num_tokens": 283891826.0, + "step": 7444 + }, + { + "epoch": 0.9470805241063478, + "ewc_loss": 0.05316370353102684, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002215784479631111, + "grad_norm": 6.15074348449707, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8602993488311768, + "num_tokens": 283931690.0, + "step": 7445 + }, + { + "epoch": 0.9472077343849383, + "ewc_loss": 0.053342655301094055, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00022092655126471072, + "grad_norm": 6.122260093688965, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.861587643623352, + "num_tokens": 283971107.0, + "step": 7446 + }, + { + "epoch": 0.9473349446635289, + "ewc_loss": 0.05315377563238144, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002214791893493384, + "grad_norm": 6.174375534057617, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8676104545593262, + "num_tokens": 284005286.0, + "step": 7447 + }, + { + "epoch": 0.9474621549421193, + "ewc_loss": 0.053134314715862274, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000221284557483159, + "grad_norm": 6.146517753601074, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8750326037406921, + "num_tokens": 284044089.0, + "step": 7448 + }, + { + "epoch": 0.9475893652207098, + "ewc_loss": 0.0531591922044754, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022153332247398794, + "grad_norm": 6.200688362121582, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8562102913856506, + "num_tokens": 284081172.0, + "step": 7449 + }, + { + "epoch": 0.9477165754993003, + "ewc_loss": 0.053102388978004456, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002209652739111334, + "grad_norm": 6.2447686195373535, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8689922094345093, + "num_tokens": 284113362.0, + "step": 7450 + }, + { + "epoch": 0.9478437857778909, + "ewc_loss": 0.05310657247900963, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002210071252193302, + "grad_norm": 6.157207489013672, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8703126907348633, + "num_tokens": 284155272.0, + "step": 7451 + }, + { + "epoch": 0.9479709960564814, + "ewc_loss": 0.05309327691793442, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022087417892180383, + "grad_norm": 6.271384239196777, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8681174516677856, + "num_tokens": 284188218.0, + "step": 7452 + }, + { + "epoch": 0.9480982063350719, + "ewc_loss": 0.05304039642214775, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022034536232240498, + "grad_norm": 6.149269104003906, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8636454939842224, + "num_tokens": 284228838.0, + "step": 7453 + }, + { + "epoch": 0.9482254166136623, + "ewc_loss": 0.05305540934205055, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022049550898373127, + "grad_norm": 6.212156772613525, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8579993844032288, + "num_tokens": 284265570.0, + "step": 7454 + }, + { + "epoch": 0.9483526268922529, + "ewc_loss": 0.05297430232167244, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002196844288846478, + "grad_norm": 6.321892738342285, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8516262769699097, + "num_tokens": 284307062.0, + "step": 7455 + }, + { + "epoch": 0.9484798371708434, + "ewc_loss": 0.053031373769044876, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002202551404479891, + "grad_norm": 6.162454605102539, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8589562773704529, + "num_tokens": 284340488.0, + "step": 7456 + }, + { + "epoch": 0.9486070474494339, + "ewc_loss": 0.05298228561878204, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021976424613967538, + "grad_norm": 6.15992546081543, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8600465059280396, + "num_tokens": 284382399.0, + "step": 7457 + }, + { + "epoch": 0.9487342577280244, + "ewc_loss": 0.052963003516197205, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021957143326289952, + "grad_norm": 6.194179058074951, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8477528095245361, + "num_tokens": 284416391.0, + "step": 7458 + }, + { + "epoch": 0.948861468006615, + "ewc_loss": 0.05302193760871887, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022016078582964838, + "grad_norm": 6.109724521636963, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8611431121826172, + "num_tokens": 284457049.0, + "step": 7459 + }, + { + "epoch": 0.9489886782852054, + "ewc_loss": 0.05302749574184418, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022021638869773597, + "grad_norm": 6.1768646240234375, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8601736426353455, + "num_tokens": 284494426.0, + "step": 7460 + }, + { + "epoch": 0.9491158885637959, + "ewc_loss": 0.05302846059203148, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022022600751370192, + "grad_norm": 6.158609867095947, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8791556358337402, + "num_tokens": 284533349.0, + "step": 7461 + }, + { + "epoch": 0.9492430988423864, + "ewc_loss": 0.05305054038763046, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002204467891715467, + "grad_norm": 6.164799213409424, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.867323637008667, + "num_tokens": 284567777.0, + "step": 7462 + }, + { + "epoch": 0.949370309120977, + "ewc_loss": 0.05307409167289734, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022068232647143304, + "grad_norm": 6.144430637359619, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8538614511489868, + "num_tokens": 284609120.0, + "step": 7463 + }, + { + "epoch": 0.9494975193995675, + "ewc_loss": 0.05311380326747894, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022107944823801517, + "grad_norm": 6.1427130699157715, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8534854054450989, + "num_tokens": 284649221.0, + "step": 7464 + }, + { + "epoch": 0.949624729678158, + "ewc_loss": 0.05322353541851044, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022217673540581018, + "grad_norm": 6.2320685386657715, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8671669960021973, + "num_tokens": 284682595.0, + "step": 7465 + }, + { + "epoch": 0.9497519399567486, + "ewc_loss": 0.053041763603687286, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022035905567463487, + "grad_norm": 6.111929416656494, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8567413091659546, + "num_tokens": 284722903.0, + "step": 7466 + }, + { + "epoch": 0.949879150235339, + "ewc_loss": 0.05316407233476639, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002215821441495791, + "grad_norm": 6.23268461227417, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8661246299743652, + "num_tokens": 284759166.0, + "step": 7467 + }, + { + "epoch": 0.9500063605139295, + "ewc_loss": 0.053073398768901825, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022067541431169957, + "grad_norm": 6.063308238983154, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8573517799377441, + "num_tokens": 284803192.0, + "step": 7468 + }, + { + "epoch": 0.95013357079252, + "ewc_loss": 0.05317725986242294, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022171401360537857, + "grad_norm": 6.24252462387085, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8635485172271729, + "num_tokens": 284837088.0, + "step": 7469 + }, + { + "epoch": 0.9502607810711106, + "ewc_loss": 0.05304425209760666, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022038391034584492, + "grad_norm": 6.058028221130371, + "learning_rate": 1e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8421478271484375, + "num_tokens": 284879641.0, + "step": 7470 + }, + { + "epoch": 0.9503879913497011, + "ewc_loss": 0.05324554070830345, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000222396818571724, + "grad_norm": 6.241909503936768, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8504078388214111, + "num_tokens": 284917519.0, + "step": 7471 + }, + { + "epoch": 0.9505152016282916, + "ewc_loss": 0.053108811378479004, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022102950606495142, + "grad_norm": 6.121989727020264, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8511410355567932, + "num_tokens": 284950422.0, + "step": 7472 + }, + { + "epoch": 0.950642411906882, + "ewc_loss": 0.053162313997745514, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000221564550884068, + "grad_norm": 6.2018280029296875, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8680601119995117, + "num_tokens": 284986392.0, + "step": 7473 + }, + { + "epoch": 0.9507696221854726, + "ewc_loss": 0.053200364112854004, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002219450252596289, + "grad_norm": 6.13762903213501, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.862748920917511, + "num_tokens": 285027684.0, + "step": 7474 + }, + { + "epoch": 0.9508968324640631, + "ewc_loss": 0.05317063629627228, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022164775873534381, + "grad_norm": 6.212615013122559, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8513643741607666, + "num_tokens": 285059551.0, + "step": 7475 + }, + { + "epoch": 0.9510240427426536, + "ewc_loss": 0.053166985511779785, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022161126253195107, + "grad_norm": 6.108191967010498, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8616976141929626, + "num_tokens": 285100197.0, + "step": 7476 + }, + { + "epoch": 0.9511512530212441, + "ewc_loss": 0.052749454975128174, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022231876209843904, + "grad_norm": 6.255136013031006, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8620429635047913, + "num_tokens": 285130115.0, + "step": 7477 + }, + { + "epoch": 0.9512784632998347, + "ewc_loss": 0.05316196009516716, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002215610002167523, + "grad_norm": 6.084201812744141, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8617256879806519, + "num_tokens": 285169069.0, + "step": 7478 + }, + { + "epoch": 0.9514056735784251, + "ewc_loss": 0.053259432315826416, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000222535731154494, + "grad_norm": 6.187116622924805, + "learning_rate": 1e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8409937620162964, + "num_tokens": 285209971.0, + "step": 7479 + }, + { + "epoch": 0.9515328838570156, + "ewc_loss": 0.053223736584186554, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022217878722585738, + "grad_norm": 6.113589763641357, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8560449481010437, + "num_tokens": 285250024.0, + "step": 7480 + }, + { + "epoch": 0.9516600941356061, + "ewc_loss": 0.05324152484536171, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002223566552856937, + "grad_norm": 6.165018081665039, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8571839928627014, + "num_tokens": 285284900.0, + "step": 7481 + }, + { + "epoch": 0.9517873044141967, + "ewc_loss": 0.053315579891204834, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022309721680358052, + "grad_norm": 6.142886638641357, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8656753897666931, + "num_tokens": 285325657.0, + "step": 7482 + }, + { + "epoch": 0.9519145146927872, + "ewc_loss": 0.053277816623449326, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022271956549957395, + "grad_norm": 6.231198310852051, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8615192770957947, + "num_tokens": 285356362.0, + "step": 7483 + }, + { + "epoch": 0.9520417249713777, + "ewc_loss": 0.05322378873825073, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022217928199097514, + "grad_norm": 6.154808521270752, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8602875471115112, + "num_tokens": 285394321.0, + "step": 7484 + }, + { + "epoch": 0.9521689352499682, + "ewc_loss": 0.053305450826883316, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022299590636976063, + "grad_norm": 6.161266803741455, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8797447085380554, + "num_tokens": 285435260.0, + "step": 7485 + }, + { + "epoch": 0.9522961455285587, + "ewc_loss": 0.053245119750499725, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000222392613068223, + "grad_norm": 6.124301910400391, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8690825700759888, + "num_tokens": 285479449.0, + "step": 7486 + }, + { + "epoch": 0.9524233558071492, + "ewc_loss": 0.052784502506256104, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022266925952862948, + "grad_norm": 6.23947286605835, + "learning_rate": 1e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8403843641281128, + "num_tokens": 285520492.0, + "step": 7487 + }, + { + "epoch": 0.9525505660857397, + "ewc_loss": 0.052697502076625824, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022179925872478634, + "grad_norm": 6.1689372062683105, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8575890064239502, + "num_tokens": 285558410.0, + "step": 7488 + }, + { + "epoch": 0.9526777763643303, + "ewc_loss": 0.05325014144182205, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002224428317276761, + "grad_norm": 6.189043045043945, + "learning_rate": 1e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8416767120361328, + "num_tokens": 285598871.0, + "step": 7489 + }, + { + "epoch": 0.9528049866429208, + "ewc_loss": 0.05269348993897438, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002217591245425865, + "grad_norm": 6.130621910095215, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8530384302139282, + "num_tokens": 285641374.0, + "step": 7490 + }, + { + "epoch": 0.9529321969215112, + "ewc_loss": 0.052737146615982056, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022219566744752228, + "grad_norm": 6.2044548988342285, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8717117309570312, + "num_tokens": 285675416.0, + "step": 7491 + }, + { + "epoch": 0.9530594072001017, + "ewc_loss": 0.05271239951252937, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002219482121290639, + "grad_norm": 6.20750617980957, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8659173846244812, + "num_tokens": 285707491.0, + "step": 7492 + }, + { + "epoch": 0.9531866174786923, + "ewc_loss": 0.05270775407552719, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022190177696757019, + "grad_norm": 6.143826007843018, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8504937887191772, + "num_tokens": 285747571.0, + "step": 7493 + }, + { + "epoch": 0.9533138277572828, + "ewc_loss": 0.05270922929048538, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022191648895386606, + "grad_norm": 6.208388328552246, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8835123181343079, + "num_tokens": 285786362.0, + "step": 7494 + }, + { + "epoch": 0.9534410380358733, + "ewc_loss": 0.052652888000011444, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022135311155579984, + "grad_norm": 6.1473870277404785, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8685978055000305, + "num_tokens": 285829401.0, + "step": 7495 + }, + { + "epoch": 0.9535682483144639, + "ewc_loss": 0.053160473704338074, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002215461281593889, + "grad_norm": 6.141805171966553, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8592541813850403, + "num_tokens": 285867853.0, + "step": 7496 + }, + { + "epoch": 0.9536954585930543, + "ewc_loss": 0.05315279960632324, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002214693813584745, + "grad_norm": 6.148558616638184, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8726319670677185, + "num_tokens": 285908497.0, + "step": 7497 + }, + { + "epoch": 0.9538226688716448, + "ewc_loss": 0.05265958234667778, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022142005036585033, + "grad_norm": 6.207174301147461, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8470034003257751, + "num_tokens": 285940528.0, + "step": 7498 + }, + { + "epoch": 0.9539498791502353, + "ewc_loss": 0.052634067833423615, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022116491163615137, + "grad_norm": 6.123776912689209, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.861533522605896, + "num_tokens": 285980206.0, + "step": 7499 + }, + { + "epoch": 0.9540770894288259, + "ewc_loss": 0.05275140330195427, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022233824711292982, + "grad_norm": 6.178671836853027, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8559683561325073, + "num_tokens": 286019409.0, + "step": 7500 + }, + { + "epoch": 0.9542042997074164, + "ewc_loss": 0.05266525596380234, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002214767737314105, + "grad_norm": 6.156696796417236, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8695940375328064, + "num_tokens": 286050955.0, + "step": 7501 + }, + { + "epoch": 0.9543315099860069, + "ewc_loss": 0.0527961403131485, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022278560209088027, + "grad_norm": 6.204761028289795, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8672346472740173, + "num_tokens": 286087567.0, + "step": 7502 + }, + { + "epoch": 0.9544587202645973, + "ewc_loss": 0.05270390957593918, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022186333080753684, + "grad_norm": 6.193416118621826, + "learning_rate": 1e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8403704166412354, + "num_tokens": 286126893.0, + "step": 7503 + }, + { + "epoch": 0.9545859305431879, + "ewc_loss": 0.05273081362247467, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022213235206436366, + "grad_norm": 6.1460137367248535, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.855347216129303, + "num_tokens": 286165818.0, + "step": 7504 + }, + { + "epoch": 0.9547131408217784, + "ewc_loss": 0.05280166119337082, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002228408120572567, + "grad_norm": 6.237333297729492, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8628854155540466, + "num_tokens": 286208026.0, + "step": 7505 + }, + { + "epoch": 0.9548403511003689, + "ewc_loss": 0.05273069441318512, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022213117335923016, + "grad_norm": 6.206499099731445, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8603518009185791, + "num_tokens": 286245468.0, + "step": 7506 + }, + { + "epoch": 0.9549675613789594, + "ewc_loss": 0.05277588963508606, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000222583141294308, + "grad_norm": 6.235201358795166, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8641489744186401, + "num_tokens": 286283955.0, + "step": 7507 + }, + { + "epoch": 0.95509477165755, + "ewc_loss": 0.05267557501792908, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002215799904661253, + "grad_norm": 6.122581958770752, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8661960363388062, + "num_tokens": 286326560.0, + "step": 7508 + }, + { + "epoch": 0.9552219819361404, + "ewc_loss": 0.05284298211336136, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022325405734591186, + "grad_norm": 6.207897186279297, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8611191511154175, + "num_tokens": 286365791.0, + "step": 7509 + }, + { + "epoch": 0.9553491922147309, + "ewc_loss": 0.05274525284767151, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022227676527108997, + "grad_norm": 6.137948036193848, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8708758354187012, + "num_tokens": 286404132.0, + "step": 7510 + }, + { + "epoch": 0.9554764024933214, + "ewc_loss": 0.0529090017080307, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022391423408407718, + "grad_norm": 6.2015509605407715, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8508577942848206, + "num_tokens": 286444828.0, + "step": 7511 + }, + { + "epoch": 0.955603612771912, + "ewc_loss": 0.052838727831840515, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022321147844195366, + "grad_norm": 6.175329685211182, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8658716678619385, + "num_tokens": 286481725.0, + "step": 7512 + }, + { + "epoch": 0.9557308230505025, + "ewc_loss": 0.05282191187143326, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002230433456134051, + "grad_norm": 6.170334815979004, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8612406253814697, + "num_tokens": 286520354.0, + "step": 7513 + }, + { + "epoch": 0.955858033329093, + "ewc_loss": 0.052891358733177185, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022373780666384846, + "grad_norm": 6.188190460205078, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8588545322418213, + "num_tokens": 286564314.0, + "step": 7514 + }, + { + "epoch": 0.9559852436076836, + "ewc_loss": 0.052845586091279984, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022328007617034018, + "grad_norm": 6.181418418884277, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8590155243873596, + "num_tokens": 286606955.0, + "step": 7515 + }, + { + "epoch": 0.956112453886274, + "ewc_loss": 0.05308922380208969, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022327504120767117, + "grad_norm": 6.355620861053467, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.866759181022644, + "num_tokens": 286640091.0, + "step": 7516 + }, + { + "epoch": 0.9562396641648645, + "ewc_loss": 0.05280390381813049, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002228632802143693, + "grad_norm": 6.165486812591553, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8605351448059082, + "num_tokens": 286679090.0, + "step": 7517 + }, + { + "epoch": 0.956366874443455, + "ewc_loss": 0.052849117666482925, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022331539366859943, + "grad_norm": 6.199223518371582, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.864822506904602, + "num_tokens": 286720260.0, + "step": 7518 + }, + { + "epoch": 0.9564940847220456, + "ewc_loss": 0.05281056463718414, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002229298697784543, + "grad_norm": 6.135024070739746, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8683474659919739, + "num_tokens": 286763992.0, + "step": 7519 + }, + { + "epoch": 0.9566212950006361, + "ewc_loss": 0.052857961505651474, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002234038256574422, + "grad_norm": 6.159980773925781, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8422171473503113, + "num_tokens": 286807068.0, + "step": 7520 + }, + { + "epoch": 0.9567485052792266, + "ewc_loss": 0.05286398157477379, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002234640414826572, + "grad_norm": 6.129114627838135, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8677850365638733, + "num_tokens": 286853020.0, + "step": 7521 + }, + { + "epoch": 0.956875715557817, + "ewc_loss": 0.052887924015522, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022370343504007906, + "grad_norm": 6.188467979431152, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8511629104614258, + "num_tokens": 286896098.0, + "step": 7522 + }, + { + "epoch": 0.9570029258364076, + "ewc_loss": 0.05284823477268219, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022330654610414058, + "grad_norm": 6.145959377288818, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8452951908111572, + "num_tokens": 286938873.0, + "step": 7523 + }, + { + "epoch": 0.9571301361149981, + "ewc_loss": 0.05294104665517807, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022423469636123627, + "grad_norm": 6.215336799621582, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8653905987739563, + "num_tokens": 286972941.0, + "step": 7524 + }, + { + "epoch": 0.9572573463935886, + "ewc_loss": 0.052901241928339005, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022383664327207953, + "grad_norm": 6.198749542236328, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8500829935073853, + "num_tokens": 287010818.0, + "step": 7525 + }, + { + "epoch": 0.9573845566721791, + "ewc_loss": 0.05286674201488495, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022349163191393018, + "grad_norm": 6.129140853881836, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8648688793182373, + "num_tokens": 287053841.0, + "step": 7526 + }, + { + "epoch": 0.9575117669507697, + "ewc_loss": 0.0529605895280838, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000224430114030838, + "grad_norm": 6.2073798179626465, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8663229942321777, + "num_tokens": 287090900.0, + "step": 7527 + }, + { + "epoch": 0.9576389772293601, + "ewc_loss": 0.05293538048863411, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022417801665142179, + "grad_norm": 6.139710903167725, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8581331372261047, + "num_tokens": 287131203.0, + "step": 7528 + }, + { + "epoch": 0.9577661875079506, + "ewc_loss": 0.053013868629932404, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002249629033030942, + "grad_norm": 6.230352401733398, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8714317679405212, + "num_tokens": 287165949.0, + "step": 7529 + }, + { + "epoch": 0.9578933977865411, + "ewc_loss": 0.05291948467493057, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022401906608138233, + "grad_norm": 6.18239688873291, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8597047328948975, + "num_tokens": 287203376.0, + "step": 7530 + }, + { + "epoch": 0.9580206080651317, + "ewc_loss": 0.053020767867565155, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022503187938127667, + "grad_norm": 6.230107307434082, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.852439284324646, + "num_tokens": 287240391.0, + "step": 7531 + }, + { + "epoch": 0.9581478183437222, + "ewc_loss": 0.052940838038921356, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022423261543735862, + "grad_norm": 6.1434760093688965, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8580958843231201, + "num_tokens": 287278485.0, + "step": 7532 + }, + { + "epoch": 0.9582750286223127, + "ewc_loss": 0.05302250012755394, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022504921071231365, + "grad_norm": 6.268393516540527, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.855724036693573, + "num_tokens": 287313626.0, + "step": 7533 + }, + { + "epoch": 0.9584022389009031, + "ewc_loss": 0.052966661751270294, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022449085372500122, + "grad_norm": 6.1404948234558105, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8763680458068848, + "num_tokens": 287344532.0, + "step": 7534 + }, + { + "epoch": 0.9585294491794937, + "ewc_loss": 0.05304298177361488, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022525404347106814, + "grad_norm": 6.209747791290283, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8424124717712402, + "num_tokens": 287383069.0, + "step": 7535 + }, + { + "epoch": 0.9586566594580842, + "ewc_loss": 0.05324268341064453, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022480962797999382, + "grad_norm": 13.055173873901367, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8602693676948547, + "num_tokens": 287418554.0, + "step": 7536 + }, + { + "epoch": 0.9587838697366747, + "ewc_loss": 0.062156207859516144, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00031638628570362926, + "grad_norm": 7.427807331085205, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8605043888092041, + "num_tokens": 287452449.0, + "step": 7537 + }, + { + "epoch": 0.9589110800152653, + "ewc_loss": 0.05117303505539894, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00020655457046814263, + "grad_norm": 5.6630778312683105, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8461042046546936, + "num_tokens": 287492247.0, + "step": 7538 + }, + { + "epoch": 0.9590382902938558, + "ewc_loss": 0.05511487275362015, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002459729730617255, + "grad_norm": 6.798236846923828, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8658950328826904, + "num_tokens": 287528651.0, + "step": 7539 + }, + { + "epoch": 0.9591655005724462, + "ewc_loss": 0.054122984409332275, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002360540529480204, + "grad_norm": 6.13728666305542, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.851375937461853, + "num_tokens": 287565837.0, + "step": 7540 + }, + { + "epoch": 0.9592927108510367, + "ewc_loss": 0.0539562813937664, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023438702919520438, + "grad_norm": 6.511457920074463, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8604240417480469, + "num_tokens": 287599469.0, + "step": 7541 + }, + { + "epoch": 0.9594199211296273, + "ewc_loss": 0.05394086241722107, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023423286620527506, + "grad_norm": 6.253382682800293, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8597029447555542, + "num_tokens": 287637104.0, + "step": 7542 + }, + { + "epoch": 0.9595471314082178, + "ewc_loss": 0.05363604426383972, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023118466197047383, + "grad_norm": 6.331425189971924, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8612734079360962, + "num_tokens": 287679134.0, + "step": 7543 + }, + { + "epoch": 0.9596743416868083, + "ewc_loss": 0.053679320961236954, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023161743592936546, + "grad_norm": 6.255364418029785, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.868225634098053, + "num_tokens": 287720340.0, + "step": 7544 + }, + { + "epoch": 0.9598015519653988, + "ewc_loss": 0.05334802344441414, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002283044595969841, + "grad_norm": 6.231341361999512, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8652088642120361, + "num_tokens": 287760931.0, + "step": 7545 + }, + { + "epoch": 0.9599287622439893, + "ewc_loss": 0.05369601398706436, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022934297157917172, + "grad_norm": 6.250006675720215, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8473855257034302, + "num_tokens": 287801723.0, + "step": 7546 + }, + { + "epoch": 0.9600559725225798, + "ewc_loss": 0.053236089646816254, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022718514082953334, + "grad_norm": 6.246949672698975, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.859709620475769, + "num_tokens": 287837010.0, + "step": 7547 + }, + { + "epoch": 0.9601831828011703, + "ewc_loss": 0.05324825271964073, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002273067511850968, + "grad_norm": 6.241258144378662, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8627448081970215, + "num_tokens": 287874381.0, + "step": 7548 + }, + { + "epoch": 0.9603103930797608, + "ewc_loss": 0.05323968455195427, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022722105495631695, + "grad_norm": 6.224949359893799, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8531651496887207, + "num_tokens": 287916673.0, + "step": 7549 + }, + { + "epoch": 0.9604376033583514, + "ewc_loss": 0.053204379975795746, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002268679963890463, + "grad_norm": 6.238563060760498, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8562865257263184, + "num_tokens": 287956764.0, + "step": 7550 + }, + { + "epoch": 0.9605648136369419, + "ewc_loss": 0.05336620658636093, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022604486730415374, + "grad_norm": 6.197361469268799, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8690835237503052, + "num_tokens": 287996373.0, + "step": 7551 + }, + { + "epoch": 0.9606920239155323, + "ewc_loss": 0.053179092705249786, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022661517141386867, + "grad_norm": 6.272982120513916, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8562150001525879, + "num_tokens": 288033671.0, + "step": 7552 + }, + { + "epoch": 0.9608192341941229, + "ewc_loss": 0.05308717489242554, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002256959560327232, + "grad_norm": 6.219427585601807, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8700069189071655, + "num_tokens": 288068254.0, + "step": 7553 + }, + { + "epoch": 0.9609464444727134, + "ewc_loss": 0.053140074014663696, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022622496180702, + "grad_norm": 6.225125789642334, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.845600962638855, + "num_tokens": 288102833.0, + "step": 7554 + }, + { + "epoch": 0.9610736547513039, + "ewc_loss": 0.05303546041250229, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022517883917316794, + "grad_norm": 6.23574686050415, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8741533756256104, + "num_tokens": 288135927.0, + "step": 7555 + }, + { + "epoch": 0.9612008650298944, + "ewc_loss": 0.0530509315431118, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022533352603204548, + "grad_norm": 6.1767988204956055, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8806507587432861, + "num_tokens": 288173842.0, + "step": 7556 + }, + { + "epoch": 0.961328075308485, + "ewc_loss": 0.05302165448665619, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022504078515339643, + "grad_norm": 6.202942848205566, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8651881814002991, + "num_tokens": 288215007.0, + "step": 7557 + }, + { + "epoch": 0.9614552855870754, + "ewc_loss": 0.05301613733172417, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022498558973893523, + "grad_norm": 6.232407569885254, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.850042462348938, + "num_tokens": 288251598.0, + "step": 7558 + }, + { + "epoch": 0.9615824958656659, + "ewc_loss": 0.05319548025727272, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.0002243376220576465, + "grad_norm": 6.198779582977295, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8622623085975647, + "num_tokens": 288292811.0, + "step": 7559 + }, + { + "epoch": 0.9617097061442564, + "ewc_loss": 0.053020983934402466, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022503403306473047, + "grad_norm": 6.181947708129883, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8653583526611328, + "num_tokens": 288337790.0, + "step": 7560 + }, + { + "epoch": 0.961836916422847, + "ewc_loss": 0.05302274599671364, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022505168453790247, + "grad_norm": 6.233567714691162, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.857965350151062, + "num_tokens": 288373347.0, + "step": 7561 + }, + { + "epoch": 0.9619641267014375, + "ewc_loss": 0.05296923220157623, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022451653785537928, + "grad_norm": 6.185120582580566, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8812058568000793, + "num_tokens": 288405650.0, + "step": 7562 + }, + { + "epoch": 0.962091336980028, + "ewc_loss": 0.05303077772259712, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022513199655804783, + "grad_norm": 6.2359185218811035, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8671813011169434, + "num_tokens": 288443849.0, + "step": 7563 + }, + { + "epoch": 0.9622185472586186, + "ewc_loss": 0.05296386033296585, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002244628412881866, + "grad_norm": 6.182197093963623, + "learning_rate": 1e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8409554362297058, + "num_tokens": 288487557.0, + "step": 7564 + }, + { + "epoch": 0.962345757537209, + "ewc_loss": 0.052954137325286865, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002243655762868002, + "grad_norm": 6.199584007263184, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8511010408401489, + "num_tokens": 288529932.0, + "step": 7565 + }, + { + "epoch": 0.9624729678157995, + "ewc_loss": 0.05290181189775467, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022384236217476428, + "grad_norm": 6.205485820770264, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8572759628295898, + "num_tokens": 288568276.0, + "step": 7566 + }, + { + "epoch": 0.96260017809439, + "ewc_loss": 0.05320055037736893, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022438833548221737, + "grad_norm": 12.987800598144531, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8629682064056396, + "num_tokens": 288602387.0, + "step": 7567 + }, + { + "epoch": 0.9627273883729806, + "ewc_loss": 0.062219295650720596, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00031701716943643987, + "grad_norm": 7.557741641998291, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8692163228988647, + "num_tokens": 288638157.0, + "step": 7568 + }, + { + "epoch": 0.9628545986515711, + "ewc_loss": 0.05114975571632385, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000206321754376404, + "grad_norm": 5.731993675231934, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.85771644115448, + "num_tokens": 288675054.0, + "step": 7569 + }, + { + "epoch": 0.9629818089301616, + "ewc_loss": 0.05514364689588547, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00024626069352962077, + "grad_norm": 6.863055229187012, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8659594058990479, + "num_tokens": 288714222.0, + "step": 7570 + }, + { + "epoch": 0.963109019208752, + "ewc_loss": 0.05403668433427811, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023519108071923256, + "grad_norm": 6.195247173309326, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8511202335357666, + "num_tokens": 288752772.0, + "step": 7571 + }, + { + "epoch": 0.9632362294873426, + "ewc_loss": 0.05384999141097069, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.000233324128203094, + "grad_norm": 6.59174108505249, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.845242977142334, + "num_tokens": 288792067.0, + "step": 7572 + }, + { + "epoch": 0.9633634397659331, + "ewc_loss": 0.05386582762002945, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023348246759269387, + "grad_norm": 6.3264641761779785, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8728330135345459, + "num_tokens": 288826172.0, + "step": 7573 + }, + { + "epoch": 0.9634906500445236, + "ewc_loss": 0.05345051735639572, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002293293655384332, + "grad_norm": 6.411537170410156, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8716855049133301, + "num_tokens": 288863774.0, + "step": 7574 + }, + { + "epoch": 0.9636178603231141, + "ewc_loss": 0.05356266722083092, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00023045088164508343, + "grad_norm": 6.3358354568481445, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8546695709228516, + "num_tokens": 288901892.0, + "step": 7575 + }, + { + "epoch": 0.9637450706017047, + "ewc_loss": 0.05326075106859207, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022743175213690847, + "grad_norm": 6.246399879455566, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8501957058906555, + "num_tokens": 288941470.0, + "step": 7576 + }, + { + "epoch": 0.9638722808802951, + "ewc_loss": 0.05327976495027542, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022762187290936708, + "grad_norm": 6.273881912231445, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8474574089050293, + "num_tokens": 288988616.0, + "step": 7577 + }, + { + "epoch": 0.9639994911588856, + "ewc_loss": 0.05320992320775986, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022692345373798162, + "grad_norm": 6.273102760314941, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.877210259437561, + "num_tokens": 289024589.0, + "step": 7578 + }, + { + "epoch": 0.9641267014374761, + "ewc_loss": 0.05310828238725662, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022590703156311065, + "grad_norm": 6.214968204498291, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8630293011665344, + "num_tokens": 289064334.0, + "step": 7579 + }, + { + "epoch": 0.9642539117160667, + "ewc_loss": 0.053165748715400696, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022648168669547886, + "grad_norm": 6.277777194976807, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8501850366592407, + "num_tokens": 289099385.0, + "step": 7580 + }, + { + "epoch": 0.9643811219946572, + "ewc_loss": 0.05315573513507843, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002263815695187077, + "grad_norm": 6.244647979736328, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.86479651927948, + "num_tokens": 289139143.0, + "step": 7581 + }, + { + "epoch": 0.9645083322732477, + "ewc_loss": 0.05303601920604706, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022518442710861564, + "grad_norm": 6.222583293914795, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8542601466178894, + "num_tokens": 289179303.0, + "step": 7582 + }, + { + "epoch": 0.9646355425518381, + "ewc_loss": 0.053080350160598755, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022562769299838692, + "grad_norm": 6.29826021194458, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8522448539733887, + "num_tokens": 289214218.0, + "step": 7583 + }, + { + "epoch": 0.9647627528304287, + "ewc_loss": 0.05300527065992355, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022487694513984025, + "grad_norm": 6.275749683380127, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8577073812484741, + "num_tokens": 289240991.0, + "step": 7584 + }, + { + "epoch": 0.9648899631090192, + "ewc_loss": 0.05303072929382324, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002251315163448453, + "grad_norm": 6.234570503234863, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8547983169555664, + "num_tokens": 289273374.0, + "step": 7585 + }, + { + "epoch": 0.9650171733876097, + "ewc_loss": 0.05305016040802002, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022532579896505922, + "grad_norm": 6.222139835357666, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.847277045249939, + "num_tokens": 289314904.0, + "step": 7586 + }, + { + "epoch": 0.9651443836662003, + "ewc_loss": 0.053524233400821686, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022518372861668468, + "grad_norm": 6.174632549285889, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8538644313812256, + "num_tokens": 289358110.0, + "step": 7587 + }, + { + "epoch": 0.9652715939447908, + "ewc_loss": 0.053081341087818146, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022563760285265744, + "grad_norm": 6.240452289581299, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8512990474700928, + "num_tokens": 289391581.0, + "step": 7588 + }, + { + "epoch": 0.9653988042233812, + "ewc_loss": 0.0529940128326416, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022476434241980314, + "grad_norm": 6.144260883331299, + "learning_rate": 1e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8346893191337585, + "num_tokens": 289432280.0, + "step": 7589 + }, + { + "epoch": 0.9655260145019717, + "ewc_loss": 0.05335544794797897, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022593731409870088, + "grad_norm": 6.269571304321289, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8449101448059082, + "num_tokens": 289473222.0, + "step": 7590 + }, + { + "epoch": 0.9656532247805623, + "ewc_loss": 0.05309668555855751, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022579108190257102, + "grad_norm": 6.192045211791992, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8567006587982178, + "num_tokens": 289512898.0, + "step": 7591 + }, + { + "epoch": 0.9657804350591528, + "ewc_loss": 0.05312370881438255, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.0002260613109683618, + "grad_norm": 6.212050914764404, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8641641736030579, + "num_tokens": 289554394.0, + "step": 7592 + }, + { + "epoch": 0.9659076453377433, + "ewc_loss": 0.053151343017816544, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022633765183854848, + "grad_norm": 6.227896690368652, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8569434285163879, + "num_tokens": 289594871.0, + "step": 7593 + }, + { + "epoch": 0.9660348556163338, + "ewc_loss": 0.05352012813091278, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022514269221574068, + "grad_norm": 6.2414422035217285, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.849744439125061, + "num_tokens": 289629165.0, + "step": 7594 + }, + { + "epoch": 0.9661620658949243, + "ewc_loss": 0.05355992913246155, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022554070164915174, + "grad_norm": 6.164697647094727, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8592286109924316, + "num_tokens": 289670131.0, + "step": 7595 + }, + { + "epoch": 0.9662892761735148, + "ewc_loss": 0.05308227613568306, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022564698883797973, + "grad_norm": 6.224544525146484, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8633928298950195, + "num_tokens": 289705238.0, + "step": 7596 + }, + { + "epoch": 0.9664164864521053, + "ewc_loss": 0.053077585995197296, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022560010256711394, + "grad_norm": 6.213593006134033, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.863426148891449, + "num_tokens": 289740146.0, + "step": 7597 + }, + { + "epoch": 0.9665436967306958, + "ewc_loss": 0.05355970561504364, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022553844610229135, + "grad_norm": 6.154791355133057, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8645666241645813, + "num_tokens": 289781145.0, + "step": 7598 + }, + { + "epoch": 0.9666709070092864, + "ewc_loss": 0.053098518401384354, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022580940276384354, + "grad_norm": 6.368585109710693, + "learning_rate": 1e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8433862328529358, + "num_tokens": 289816347.0, + "step": 7599 + }, + { + "epoch": 0.9667981172878769, + "ewc_loss": 0.05353512614965439, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022529269335791469, + "grad_norm": 6.138360023498535, + "learning_rate": 1e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8463035821914673, + "num_tokens": 289852974.0, + "step": 7600 + }, + { + "epoch": 0.9669253275664673, + "ewc_loss": 0.05364196002483368, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022636099311057478, + "grad_norm": 6.224916934967041, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.847895622253418, + "num_tokens": 289894553.0, + "step": 7601 + }, + { + "epoch": 0.9670525378450578, + "ewc_loss": 0.053576670587062836, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022570809233002365, + "grad_norm": 6.204261302947998, + "learning_rate": 1e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8419755697250366, + "num_tokens": 289930641.0, + "step": 7602 + }, + { + "epoch": 0.9671797481236484, + "ewc_loss": 0.05357339233160019, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022567535052075982, + "grad_norm": 6.202609539031982, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.853918194770813, + "num_tokens": 289972690.0, + "step": 7603 + }, + { + "epoch": 0.9673069584022389, + "ewc_loss": 0.053619399666786194, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002261353947687894, + "grad_norm": 6.183341979980469, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8537366986274719, + "num_tokens": 290006518.0, + "step": 7604 + }, + { + "epoch": 0.9674341686808294, + "ewc_loss": 0.053630903363227844, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022625042765866965, + "grad_norm": 6.204195022583008, + "learning_rate": 1e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8469454050064087, + "num_tokens": 290045206.0, + "step": 7605 + }, + { + "epoch": 0.96756137895942, + "ewc_loss": 0.05361216515302658, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022606304264627397, + "grad_norm": 6.147992134094238, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8651812076568604, + "num_tokens": 290086976.0, + "step": 7606 + }, + { + "epoch": 0.9676885892380104, + "ewc_loss": 0.053648777306079865, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002264291833853349, + "grad_norm": 6.1654534339904785, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8731496930122375, + "num_tokens": 290126894.0, + "step": 7607 + }, + { + "epoch": 0.9678157995166009, + "ewc_loss": 0.05366365239024162, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022657793306279927, + "grad_norm": 6.177051544189453, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8742104768753052, + "num_tokens": 290161684.0, + "step": 7608 + }, + { + "epoch": 0.9679430097951914, + "ewc_loss": 0.05373801290988922, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022732153593096882, + "grad_norm": 6.180927753448486, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8621201515197754, + "num_tokens": 290203407.0, + "step": 7609 + }, + { + "epoch": 0.968070220073782, + "ewc_loss": 0.05371864140033722, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022712780628353357, + "grad_norm": 6.1660332679748535, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8672451972961426, + "num_tokens": 290238645.0, + "step": 7610 + }, + { + "epoch": 0.9681974303523725, + "ewc_loss": 0.05370248109102249, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022696623636875302, + "grad_norm": 6.1887431144714355, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8597351312637329, + "num_tokens": 290280279.0, + "step": 7611 + }, + { + "epoch": 0.968324640630963, + "ewc_loss": 0.05383043363690376, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022824574261903763, + "grad_norm": 6.278118133544922, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8595970273017883, + "num_tokens": 290317739.0, + "step": 7612 + }, + { + "epoch": 0.9684518509095535, + "ewc_loss": 0.05361386761069298, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002260800829390064, + "grad_norm": 6.132600784301758, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8581929802894592, + "num_tokens": 290360384.0, + "step": 7613 + }, + { + "epoch": 0.968579061188144, + "ewc_loss": 0.0537324883043766, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002272662823088467, + "grad_norm": 6.236444473266602, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8668721914291382, + "num_tokens": 290398706.0, + "step": 7614 + }, + { + "epoch": 0.9687062714667345, + "ewc_loss": 0.053683627396821976, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022677767265122384, + "grad_norm": 6.176823616027832, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8731980919837952, + "num_tokens": 290439524.0, + "step": 7615 + }, + { + "epoch": 0.968833481745325, + "ewc_loss": 0.053705330938100815, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022699471446685493, + "grad_norm": 6.333960056304932, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8540459275245667, + "num_tokens": 290472125.0, + "step": 7616 + }, + { + "epoch": 0.9689606920239155, + "ewc_loss": 0.053620509803295135, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022614649788010865, + "grad_norm": 6.169620990753174, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8442348837852478, + "num_tokens": 290514348.0, + "step": 7617 + }, + { + "epoch": 0.9690879023025061, + "ewc_loss": 0.053637199103832245, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002263133937958628, + "grad_norm": 6.251199722290039, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8542575836181641, + "num_tokens": 290551699.0, + "step": 7618 + }, + { + "epoch": 0.9692151125810966, + "ewc_loss": 0.053617820143699646, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002261195913888514, + "grad_norm": 6.208548069000244, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8620550632476807, + "num_tokens": 290588644.0, + "step": 7619 + }, + { + "epoch": 0.969342322859687, + "ewc_loss": 0.053682632744312286, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022676773369312286, + "grad_norm": 6.194770336151123, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8620917797088623, + "num_tokens": 290629546.0, + "step": 7620 + }, + { + "epoch": 0.9694695331382776, + "ewc_loss": 0.05365796387195587, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022652106417808682, + "grad_norm": 6.204475402832031, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8685201406478882, + "num_tokens": 290665527.0, + "step": 7621 + }, + { + "epoch": 0.9695967434168681, + "ewc_loss": 0.05361839383840561, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022612535394728184, + "grad_norm": 6.170851230621338, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8507211208343506, + "num_tokens": 290706667.0, + "step": 7622 + }, + { + "epoch": 0.9697239536954586, + "ewc_loss": 0.05368741601705551, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002268155658384785, + "grad_norm": 6.276490688323975, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8551881909370422, + "num_tokens": 290745698.0, + "step": 7623 + }, + { + "epoch": 0.9698511639740491, + "ewc_loss": 0.05358287692070007, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022577015624847263, + "grad_norm": 6.184074878692627, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8655374050140381, + "num_tokens": 290777983.0, + "step": 7624 + }, + { + "epoch": 0.9699783742526397, + "ewc_loss": 0.05369916185736656, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022693302889820188, + "grad_norm": 6.234246730804443, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8541609048843384, + "num_tokens": 290814184.0, + "step": 7625 + }, + { + "epoch": 0.9701055845312301, + "ewc_loss": 0.05358235910534859, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002257650048704818, + "grad_norm": 6.152849197387695, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8519407510757446, + "num_tokens": 290854054.0, + "step": 7626 + }, + { + "epoch": 0.9702327948098206, + "ewc_loss": 0.05365065485239029, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022644794080406427, + "grad_norm": 6.2533979415893555, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8583594560623169, + "num_tokens": 290891325.0, + "step": 7627 + }, + { + "epoch": 0.9703600050884111, + "ewc_loss": 0.053650226444005966, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022644367709290236, + "grad_norm": 6.14581823348999, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8587517738342285, + "num_tokens": 290930101.0, + "step": 7628 + }, + { + "epoch": 0.9704872153670017, + "ewc_loss": 0.05360516905784607, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022599307703785598, + "grad_norm": 6.264736652374268, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8596975803375244, + "num_tokens": 290964691.0, + "step": 7629 + }, + { + "epoch": 0.9706144256455922, + "ewc_loss": 0.05360107496380806, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022595215705223382, + "grad_norm": 6.1508378982543945, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8567755222320557, + "num_tokens": 291008140.0, + "step": 7630 + }, + { + "epoch": 0.9707416359241827, + "ewc_loss": 0.05366198718547821, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022656125656794757, + "grad_norm": 6.213939189910889, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8539758920669556, + "num_tokens": 291041484.0, + "step": 7631 + }, + { + "epoch": 0.9708688462027731, + "ewc_loss": 0.05358476564288139, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022578905918635428, + "grad_norm": 6.199221611022949, + "learning_rate": 1e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8417725563049316, + "num_tokens": 291082337.0, + "step": 7632 + }, + { + "epoch": 0.9709960564813637, + "ewc_loss": 0.053619593381881714, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022613733017351478, + "grad_norm": 6.2462334632873535, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8591879606246948, + "num_tokens": 291115921.0, + "step": 7633 + }, + { + "epoch": 0.9711232667599542, + "ewc_loss": 0.05366778373718262, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022661921684630215, + "grad_norm": 6.235396862030029, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8683418035507202, + "num_tokens": 291154367.0, + "step": 7634 + }, + { + "epoch": 0.9712504770385447, + "ewc_loss": 0.05361691489815712, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002261105546494946, + "grad_norm": 6.156406879425049, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8599798083305359, + "num_tokens": 291198919.0, + "step": 7635 + }, + { + "epoch": 0.9713776873171353, + "ewc_loss": 0.0536905974149704, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022684739087708294, + "grad_norm": 6.233457088470459, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8481711745262146, + "num_tokens": 291237062.0, + "step": 7636 + }, + { + "epoch": 0.9715048975957258, + "ewc_loss": 0.053611427545547485, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022605567937716842, + "grad_norm": 6.161396026611328, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8716343641281128, + "num_tokens": 291275676.0, + "step": 7637 + }, + { + "epoch": 0.9716321078743162, + "ewc_loss": 0.05361708998680115, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022611232998315245, + "grad_norm": 6.1875200271606445, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8519760370254517, + "num_tokens": 291316785.0, + "step": 7638 + }, + { + "epoch": 0.9717593181529067, + "ewc_loss": 0.05367940664291382, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022673547209706157, + "grad_norm": 6.19696569442749, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8571252822875977, + "num_tokens": 291360448.0, + "step": 7639 + }, + { + "epoch": 0.9718865284314973, + "ewc_loss": 0.05367749556899071, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022671636543236673, + "grad_norm": 6.182798862457275, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8569408655166626, + "num_tokens": 291401650.0, + "step": 7640 + }, + { + "epoch": 0.9720137387100878, + "ewc_loss": 0.05367716774344444, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022671307669952512, + "grad_norm": 6.20466423034668, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8604898452758789, + "num_tokens": 291439736.0, + "step": 7641 + }, + { + "epoch": 0.9721409489886783, + "ewc_loss": 0.05369844287633896, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002269258111482486, + "grad_norm": 6.243852138519287, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8649781942367554, + "num_tokens": 291479077.0, + "step": 7642 + }, + { + "epoch": 0.9722681592672688, + "ewc_loss": 0.05358900874853134, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022583150712307543, + "grad_norm": 6.233030319213867, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.870231568813324, + "num_tokens": 291512470.0, + "step": 7643 + }, + { + "epoch": 0.9723953695458593, + "ewc_loss": 0.05363190919160843, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022626048303209245, + "grad_norm": 6.184804916381836, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.870926022529602, + "num_tokens": 291552797.0, + "step": 7644 + }, + { + "epoch": 0.9725225798244498, + "ewc_loss": 0.05360833555459976, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022602475655730814, + "grad_norm": 6.223114967346191, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8632750511169434, + "num_tokens": 291589885.0, + "step": 7645 + }, + { + "epoch": 0.9726497901030403, + "ewc_loss": 0.05362927168607712, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022623412951361388, + "grad_norm": 6.148478031158447, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8593529462814331, + "num_tokens": 291633005.0, + "step": 7646 + }, + { + "epoch": 0.9727770003816308, + "ewc_loss": 0.053671695291996, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022665836149826646, + "grad_norm": 6.326552391052246, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8564021587371826, + "num_tokens": 291666083.0, + "step": 7647 + }, + { + "epoch": 0.9729042106602214, + "ewc_loss": 0.05362214148044586, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002261628396809101, + "grad_norm": 6.183821201324463, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.865210235118866, + "num_tokens": 291706421.0, + "step": 7648 + }, + { + "epoch": 0.9730314209388119, + "ewc_loss": 0.0536830760538578, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002267721574753523, + "grad_norm": 6.222743988037109, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8645461797714233, + "num_tokens": 291744641.0, + "step": 7649 + }, + { + "epoch": 0.9731586312174023, + "ewc_loss": 0.05351670831441879, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022510849521495402, + "grad_norm": 6.107792854309082, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8852478265762329, + "num_tokens": 291780586.0, + "step": 7650 + }, + { + "epoch": 0.9732858414959928, + "ewc_loss": 0.05374503880739212, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002273918071296066, + "grad_norm": 6.251657962799072, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8559413552284241, + "num_tokens": 291820482.0, + "step": 7651 + }, + { + "epoch": 0.9734130517745834, + "ewc_loss": 0.053616344928741455, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022610485029872507, + "grad_norm": 6.177188873291016, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8699765801429749, + "num_tokens": 291858568.0, + "step": 7652 + }, + { + "epoch": 0.9735402620531739, + "ewc_loss": 0.05365060269832611, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022644743148703128, + "grad_norm": 6.281825065612793, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8512399196624756, + "num_tokens": 291889383.0, + "step": 7653 + }, + { + "epoch": 0.9736674723317644, + "ewc_loss": 0.05366165190935135, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022655790962744504, + "grad_norm": 6.223104000091553, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8689649105072021, + "num_tokens": 291927541.0, + "step": 7654 + }, + { + "epoch": 0.973794682610355, + "ewc_loss": 0.05360995978116989, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000226040996494703, + "grad_norm": 6.208578586578369, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8443759679794312, + "num_tokens": 291969074.0, + "step": 7655 + }, + { + "epoch": 0.9739218928889454, + "ewc_loss": 0.05359993502497673, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022594074835069478, + "grad_norm": 6.355595588684082, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8576357364654541, + "num_tokens": 292008311.0, + "step": 7656 + }, + { + "epoch": 0.9740491031675359, + "ewc_loss": 0.053540196269750595, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002253433776786551, + "grad_norm": 6.2931437492370605, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8510864973068237, + "num_tokens": 292051810.0, + "step": 7657 + }, + { + "epoch": 0.9741763134461264, + "ewc_loss": 0.05344558507204056, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022439724125433713, + "grad_norm": 6.143674850463867, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8674190044403076, + "num_tokens": 292090411.0, + "step": 7658 + }, + { + "epoch": 0.974303523724717, + "ewc_loss": 0.0535525307059288, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022546669060830027, + "grad_norm": 6.215096950531006, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8519231081008911, + "num_tokens": 292124411.0, + "step": 7659 + }, + { + "epoch": 0.9744307340033075, + "ewc_loss": 0.053485751152038574, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022479890321847051, + "grad_norm": 6.10704231262207, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8593136072158813, + "num_tokens": 292166428.0, + "step": 7660 + }, + { + "epoch": 0.974557944281898, + "ewc_loss": 0.05364900454878807, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022643145348411053, + "grad_norm": 6.215052604675293, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8618019223213196, + "num_tokens": 292202687.0, + "step": 7661 + }, + { + "epoch": 0.9746851545604885, + "ewc_loss": 0.053133029490709305, + "ewc_loss_diag": 3.0517578125e-05, + "ewc_loss_parallel": 0.00022615451598539948, + "grad_norm": 6.142388820648193, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8689333200454712, + "num_tokens": 292246364.0, + "step": 7662 + }, + { + "epoch": 0.974812364839079, + "ewc_loss": 0.05367886275053024, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022673002968076617, + "grad_norm": 6.262362003326416, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.861741304397583, + "num_tokens": 292285789.0, + "step": 7663 + }, + { + "epoch": 0.9749395751176695, + "ewc_loss": 0.05335867404937744, + "ewc_loss_diag": 3.075599670410156e-05, + "ewc_loss_parallel": 0.00022596953203901649, + "grad_norm": 6.211252689361572, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8541773557662964, + "num_tokens": 292320755.0, + "step": 7664 + }, + { + "epoch": 0.97506678539626, + "ewc_loss": 0.053676456212997437, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022670597536489367, + "grad_norm": 6.1925201416015625, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8687777519226074, + "num_tokens": 292362818.0, + "step": 7665 + }, + { + "epoch": 0.9751939956748505, + "ewc_loss": 0.05365192890167236, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002264606737298891, + "grad_norm": 6.225503444671631, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8522928953170776, + "num_tokens": 292404016.0, + "step": 7666 + }, + { + "epoch": 0.9753212059534411, + "ewc_loss": 0.05369120091199875, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022685340081807226, + "grad_norm": 6.224383354187012, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8471700549125671, + "num_tokens": 292442117.0, + "step": 7667 + }, + { + "epoch": 0.9754484162320316, + "ewc_loss": 0.05366719886660576, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002266133960802108, + "grad_norm": 6.427980899810791, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8671534061431885, + "num_tokens": 292472523.0, + "step": 7668 + }, + { + "epoch": 0.975575626510622, + "ewc_loss": 0.053631119430065155, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022625258134212345, + "grad_norm": 6.171937942504883, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8651353120803833, + "num_tokens": 292508571.0, + "step": 7669 + }, + { + "epoch": 0.9757028367892125, + "ewc_loss": 0.05368244647979736, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022676588559988886, + "grad_norm": 6.263340473175049, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.857505202293396, + "num_tokens": 292545358.0, + "step": 7670 + }, + { + "epoch": 0.9758300470678031, + "ewc_loss": 0.05356839299201965, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022562533558811992, + "grad_norm": 6.162159442901611, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8452426195144653, + "num_tokens": 292583962.0, + "step": 7671 + }, + { + "epoch": 0.9759572573463936, + "ewc_loss": 0.05370841920375824, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022702560818288475, + "grad_norm": 6.25250244140625, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8587098717689514, + "num_tokens": 292616186.0, + "step": 7672 + }, + { + "epoch": 0.9760844676249841, + "ewc_loss": 0.05371157452464104, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022705714218318462, + "grad_norm": 6.200186252593994, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8552179336547852, + "num_tokens": 292657378.0, + "step": 7673 + }, + { + "epoch": 0.9762116779035747, + "ewc_loss": 0.053695324808359146, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022689465549774468, + "grad_norm": 6.256999492645264, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8742895126342773, + "num_tokens": 292692041.0, + "step": 7674 + }, + { + "epoch": 0.9763388881821651, + "ewc_loss": 0.05369477719068527, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002268891839776188, + "grad_norm": 6.249310493469238, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8578158020973206, + "num_tokens": 292725062.0, + "step": 7675 + }, + { + "epoch": 0.9764660984607556, + "ewc_loss": 0.05361471325159073, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002260885521536693, + "grad_norm": 6.118619918823242, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8679503202438354, + "num_tokens": 292766522.0, + "step": 7676 + }, + { + "epoch": 0.9765933087393461, + "ewc_loss": 0.05373833328485489, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022732473735231906, + "grad_norm": 6.230899333953857, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.859005331993103, + "num_tokens": 292811371.0, + "step": 7677 + }, + { + "epoch": 0.9767205190179367, + "ewc_loss": 0.053633131086826324, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022627269208896905, + "grad_norm": 6.183091163635254, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8603075742721558, + "num_tokens": 292850237.0, + "step": 7678 + }, + { + "epoch": 0.9768477292965272, + "ewc_loss": 0.05374421179294586, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002273835416417569, + "grad_norm": 6.255546569824219, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8647598624229431, + "num_tokens": 292887291.0, + "step": 7679 + }, + { + "epoch": 0.9769749395751177, + "ewc_loss": 0.05369115620851517, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022685294970870018, + "grad_norm": 6.140115737915039, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.872978925704956, + "num_tokens": 292928396.0, + "step": 7680 + }, + { + "epoch": 0.9771021498537081, + "ewc_loss": 0.053788382560014725, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002278252359246835, + "grad_norm": 6.2642502784729, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8638825416564941, + "num_tokens": 292962748.0, + "step": 7681 + }, + { + "epoch": 0.9772293601322987, + "ewc_loss": 0.05367998033761978, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022674119099974632, + "grad_norm": 6.175071716308594, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8716887831687927, + "num_tokens": 293002330.0, + "step": 7682 + }, + { + "epoch": 0.9773565704108892, + "ewc_loss": 0.05380886420607567, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022803003957960755, + "grad_norm": 6.277716636657715, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8612544536590576, + "num_tokens": 293038288.0, + "step": 7683 + }, + { + "epoch": 0.9774837806894797, + "ewc_loss": 0.05367250740528107, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022666646691504866, + "grad_norm": 6.159571170806885, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8626803159713745, + "num_tokens": 293078121.0, + "step": 7684 + }, + { + "epoch": 0.9776109909680702, + "ewc_loss": 0.053869158029556274, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022863296908326447, + "grad_norm": 6.252040386199951, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8571457862854004, + "num_tokens": 293116846.0, + "step": 7685 + }, + { + "epoch": 0.9777382012466608, + "ewc_loss": 0.0537673756480217, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002276151644764468, + "grad_norm": 6.15448522567749, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8564544916152954, + "num_tokens": 293162237.0, + "step": 7686 + }, + { + "epoch": 0.9778654115252512, + "ewc_loss": 0.05384790152311325, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002284203947056085, + "grad_norm": 6.299983978271484, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8570699691772461, + "num_tokens": 293196631.0, + "step": 7687 + }, + { + "epoch": 0.9779926218038417, + "ewc_loss": 0.053764283657073975, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022758424165658653, + "grad_norm": 6.17011022567749, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8701496720314026, + "num_tokens": 293238883.0, + "step": 7688 + }, + { + "epoch": 0.9781198320824323, + "ewc_loss": 0.05384164676070213, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022835787967778742, + "grad_norm": 6.309521675109863, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8510658740997314, + "num_tokens": 293270802.0, + "step": 7689 + }, + { + "epoch": 0.9782470423610228, + "ewc_loss": 0.05373778194189072, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022731925128027797, + "grad_norm": 6.208279132843018, + "learning_rate": 1e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8473838567733765, + "num_tokens": 293314910.0, + "step": 7690 + }, + { + "epoch": 0.9783742526396133, + "ewc_loss": 0.05381382629275322, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002280796761624515, + "grad_norm": 6.237789154052734, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8743590116500854, + "num_tokens": 293353083.0, + "step": 7691 + }, + { + "epoch": 0.9785014629182038, + "ewc_loss": 0.05376766622066498, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022761804575566202, + "grad_norm": 6.236522674560547, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8618341684341431, + "num_tokens": 293389473.0, + "step": 7692 + }, + { + "epoch": 0.9786286731967943, + "ewc_loss": 0.05371156334877014, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022705704031977803, + "grad_norm": 6.188930034637451, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8734005093574524, + "num_tokens": 293426893.0, + "step": 7693 + }, + { + "epoch": 0.9787558834753848, + "ewc_loss": 0.053754232823848724, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002274837315781042, + "grad_norm": 6.2028398513793945, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8659089207649231, + "num_tokens": 293468116.0, + "step": 7694 + }, + { + "epoch": 0.9788830937539753, + "ewc_loss": 0.053718239068984985, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000227123819058761, + "grad_norm": 6.249076843261719, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8639277219772339, + "num_tokens": 293503050.0, + "step": 7695 + }, + { + "epoch": 0.9790103040325658, + "ewc_loss": 0.0536326989531517, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002262684138258919, + "grad_norm": 6.189690589904785, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8498980402946472, + "num_tokens": 293541673.0, + "step": 7696 + }, + { + "epoch": 0.9791375143111564, + "ewc_loss": 0.05368853732943535, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022682677081320435, + "grad_norm": 6.232369899749756, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8581242561340332, + "num_tokens": 293578472.0, + "step": 7697 + }, + { + "epoch": 0.9792647245897469, + "ewc_loss": 0.05367588251829147, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002267002419102937, + "grad_norm": 6.216128826141357, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8625010848045349, + "num_tokens": 293611204.0, + "step": 7698 + }, + { + "epoch": 0.9793919348683373, + "ewc_loss": 0.05370054394006729, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022694685321766883, + "grad_norm": 6.228155612945557, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8506797552108765, + "num_tokens": 293651352.0, + "step": 7699 + }, + { + "epoch": 0.9795191451469278, + "ewc_loss": 0.05374162271618843, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002273576392326504, + "grad_norm": 6.23139762878418, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8721494674682617, + "num_tokens": 293687396.0, + "step": 7700 + }, + { + "epoch": 0.9796463554255184, + "ewc_loss": 0.053823329508304596, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002281746856169775, + "grad_norm": 6.24979829788208, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8568549156188965, + "num_tokens": 293725083.0, + "step": 7701 + }, + { + "epoch": 0.9797735657041089, + "ewc_loss": 0.053730979561805725, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022725117742083967, + "grad_norm": 6.22442626953125, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8607585430145264, + "num_tokens": 293766136.0, + "step": 7702 + }, + { + "epoch": 0.9799007759826994, + "ewc_loss": 0.05375014990568161, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022744292800780386, + "grad_norm": 6.260828495025635, + "learning_rate": 1e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8434134721755981, + "num_tokens": 293805810.0, + "step": 7703 + }, + { + "epoch": 0.98002798626129, + "ewc_loss": 0.05373826250433922, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022732402430847287, + "grad_norm": 6.308192729949951, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8528819680213928, + "num_tokens": 293839041.0, + "step": 7704 + }, + { + "epoch": 0.9801551965398804, + "ewc_loss": 0.05369385704398155, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022687997261527926, + "grad_norm": 6.191029071807861, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8754969835281372, + "num_tokens": 293876541.0, + "step": 7705 + }, + { + "epoch": 0.9802824068184709, + "ewc_loss": 0.05370621383190155, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022700356203131378, + "grad_norm": 6.220681667327881, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8753483295440674, + "num_tokens": 293918141.0, + "step": 7706 + }, + { + "epoch": 0.9804096170970614, + "ewc_loss": 0.05367133766412735, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002266547962790355, + "grad_norm": 6.236323356628418, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8476475477218628, + "num_tokens": 293958078.0, + "step": 7707 + }, + { + "epoch": 0.980536827375652, + "ewc_loss": 0.05375554412603378, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022749682830180973, + "grad_norm": 6.309598445892334, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.870037317276001, + "num_tokens": 293990347.0, + "step": 7708 + }, + { + "epoch": 0.9806640376542425, + "ewc_loss": 0.05369637534022331, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022690516198053956, + "grad_norm": 6.229071140289307, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8667029142379761, + "num_tokens": 294032136.0, + "step": 7709 + }, + { + "epoch": 0.980791247932833, + "ewc_loss": 0.053708139806985855, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022702279966324568, + "grad_norm": 6.344923496246338, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.847233772277832, + "num_tokens": 294065136.0, + "step": 7710 + }, + { + "epoch": 0.9809184582114235, + "ewc_loss": 0.05368362367153168, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002267776580993086, + "grad_norm": 6.248900413513184, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8738290071487427, + "num_tokens": 294104539.0, + "step": 7711 + }, + { + "epoch": 0.981045668490014, + "ewc_loss": 0.053637176752090454, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022631317551713437, + "grad_norm": 6.245602607727051, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8585001230239868, + "num_tokens": 294148962.0, + "step": 7712 + }, + { + "epoch": 0.9811728787686045, + "ewc_loss": 0.053643181920051575, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002263732167193666, + "grad_norm": 6.273194789886475, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8654280304908752, + "num_tokens": 294181866.0, + "step": 7713 + }, + { + "epoch": 0.981300089047195, + "ewc_loss": 0.05363722890615463, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022631367028225213, + "grad_norm": 6.193045139312744, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8692553043365479, + "num_tokens": 294223558.0, + "step": 7714 + }, + { + "epoch": 0.9814272993257855, + "ewc_loss": 0.05364854261279106, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022642684052698314, + "grad_norm": 6.243871688842773, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8656584620475769, + "num_tokens": 294260487.0, + "step": 7715 + }, + { + "epoch": 0.9815545096043761, + "ewc_loss": 0.05362996459007263, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002262410707771778, + "grad_norm": 6.2666497230529785, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8613541126251221, + "num_tokens": 294302642.0, + "step": 7716 + }, + { + "epoch": 0.9816817198829666, + "ewc_loss": 0.05365260690450668, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022646746947430074, + "grad_norm": 6.251917839050293, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8507957458496094, + "num_tokens": 294338376.0, + "step": 7717 + }, + { + "epoch": 0.981808930161557, + "ewc_loss": 0.053612418472766876, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002260655746795237, + "grad_norm": 6.27357292175293, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.855270266532898, + "num_tokens": 294372089.0, + "step": 7718 + }, + { + "epoch": 0.9819361404401475, + "ewc_loss": 0.053631268441677094, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022625410929322243, + "grad_norm": 6.33419132232666, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8652468323707581, + "num_tokens": 294403801.0, + "step": 7719 + }, + { + "epoch": 0.9820633507187381, + "ewc_loss": 0.053632594645023346, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022626735153608024, + "grad_norm": 6.1884260177612305, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8637428283691406, + "num_tokens": 294440301.0, + "step": 7720 + }, + { + "epoch": 0.9821905609973286, + "ewc_loss": 0.053701162338256836, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002269530377816409, + "grad_norm": 6.295997619628906, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8599079847335815, + "num_tokens": 294472967.0, + "step": 7721 + }, + { + "epoch": 0.9823177712759191, + "ewc_loss": 0.053666144609451294, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002266028750455007, + "grad_norm": 6.229053020477295, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8500406742095947, + "num_tokens": 294518248.0, + "step": 7722 + }, + { + "epoch": 0.9824449815545097, + "ewc_loss": 0.05372209846973419, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022716236708220094, + "grad_norm": 6.260018348693848, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8592337965965271, + "num_tokens": 294553899.0, + "step": 7723 + }, + { + "epoch": 0.9825721918331001, + "ewc_loss": 0.05375414714217186, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022748287301510572, + "grad_norm": 6.329730987548828, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.83803391456604, + "num_tokens": 294591382.0, + "step": 7724 + }, + { + "epoch": 0.9826994021116906, + "ewc_loss": 0.05372181907296181, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022715960221830755, + "grad_norm": 6.244970321655273, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8614304065704346, + "num_tokens": 294629606.0, + "step": 7725 + }, + { + "epoch": 0.9828266123902811, + "ewc_loss": 0.053740181028842926, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022734318918082863, + "grad_norm": 6.241018772125244, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8636483550071716, + "num_tokens": 294669417.0, + "step": 7726 + }, + { + "epoch": 0.9829538226688717, + "ewc_loss": 0.05372956395149231, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022723701840732247, + "grad_norm": 6.224822521209717, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8698879480361938, + "num_tokens": 294709876.0, + "step": 7727 + }, + { + "epoch": 0.9830810329474622, + "ewc_loss": 0.05374089628458023, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022735039237886667, + "grad_norm": 6.323673248291016, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8516431450843811, + "num_tokens": 294750565.0, + "step": 7728 + }, + { + "epoch": 0.9832082432260527, + "ewc_loss": 0.05364370718598366, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022637848451267928, + "grad_norm": 6.210486888885498, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.861595094203949, + "num_tokens": 294790586.0, + "step": 7729 + }, + { + "epoch": 0.9833354535046431, + "ewc_loss": 0.053776323795318604, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022770465875510126, + "grad_norm": 6.377394676208496, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8588014841079712, + "num_tokens": 294830176.0, + "step": 7730 + }, + { + "epoch": 0.9834626637832337, + "ewc_loss": 0.05366746336221695, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002266160590806976, + "grad_norm": 6.231563568115234, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8485758900642395, + "num_tokens": 294874199.0, + "step": 7731 + }, + { + "epoch": 0.9835898740618242, + "ewc_loss": 0.05370745807886124, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002270159893669188, + "grad_norm": 6.240722179412842, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8672472238540649, + "num_tokens": 294914219.0, + "step": 7732 + }, + { + "epoch": 0.9837170843404147, + "ewc_loss": 0.053667232394218445, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022661371622234583, + "grad_norm": 6.311983108520508, + "learning_rate": 1e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8380661606788635, + "num_tokens": 294950726.0, + "step": 7733 + }, + { + "epoch": 0.9838442946190052, + "ewc_loss": 0.05360353738069534, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022597676434088498, + "grad_norm": 6.191773891448975, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8608399033546448, + "num_tokens": 294993815.0, + "step": 7734 + }, + { + "epoch": 0.9839715048975958, + "ewc_loss": 0.05375143140554428, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022745573369320482, + "grad_norm": 6.285004138946533, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8699325323104858, + "num_tokens": 295029275.0, + "step": 7735 + }, + { + "epoch": 0.9840987151761862, + "ewc_loss": 0.05373719334602356, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022731334320269525, + "grad_norm": 6.220250129699707, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8600649833679199, + "num_tokens": 295066166.0, + "step": 7736 + }, + { + "epoch": 0.9842259254547767, + "ewc_loss": 0.053714677691459656, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002270882105221972, + "grad_norm": 6.337826728820801, + "learning_rate": 1e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8386378288269043, + "num_tokens": 295100436.0, + "step": 7737 + }, + { + "epoch": 0.9843531357333672, + "ewc_loss": 0.05367366969585419, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022667810844723135, + "grad_norm": 6.178590297698975, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8607187271118164, + "num_tokens": 295141756.0, + "step": 7738 + }, + { + "epoch": 0.9844803460119578, + "ewc_loss": 0.053781963884830475, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022776104742661119, + "grad_norm": 6.412909507751465, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8776382207870483, + "num_tokens": 295178894.0, + "step": 7739 + }, + { + "epoch": 0.9846075562905483, + "ewc_loss": 0.053678885102272034, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022673027706332505, + "grad_norm": 6.173154354095459, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8496605157852173, + "num_tokens": 295219133.0, + "step": 7740 + }, + { + "epoch": 0.9847347665691388, + "ewc_loss": 0.05380943417549133, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022803577303420752, + "grad_norm": 6.27480936050415, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8600618243217468, + "num_tokens": 295260322.0, + "step": 7741 + }, + { + "epoch": 0.9848619768477292, + "ewc_loss": 0.05372713506221771, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022721274581272155, + "grad_norm": 6.219242095947266, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8539287447929382, + "num_tokens": 295299675.0, + "step": 7742 + }, + { + "epoch": 0.9849891871263198, + "ewc_loss": 0.05375497043132782, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022749110939912498, + "grad_norm": 6.263737201690674, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8666245341300964, + "num_tokens": 295336281.0, + "step": 7743 + }, + { + "epoch": 0.9851163974049103, + "ewc_loss": 0.05371206998825073, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022706208983436227, + "grad_norm": 6.222995758056641, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8583561778068542, + "num_tokens": 295376932.0, + "step": 7744 + }, + { + "epoch": 0.9852436076835008, + "ewc_loss": 0.05373920872807503, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022733349760528654, + "grad_norm": 6.231987953186035, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.85687255859375, + "num_tokens": 295421904.0, + "step": 7745 + }, + { + "epoch": 0.9853708179620914, + "ewc_loss": 0.05370306968688965, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022697211534250528, + "grad_norm": 6.26637601852417, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8608840703964233, + "num_tokens": 295456688.0, + "step": 7746 + }, + { + "epoch": 0.9854980282406819, + "ewc_loss": 0.05372187867760658, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002271601842949167, + "grad_norm": 6.2154221534729, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8769908547401428, + "num_tokens": 295500022.0, + "step": 7747 + }, + { + "epoch": 0.9856252385192723, + "ewc_loss": 0.0537877082824707, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022781848383601755, + "grad_norm": 6.220800399780273, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8809330463409424, + "num_tokens": 295538106.0, + "step": 7748 + }, + { + "epoch": 0.9857524487978628, + "ewc_loss": 0.05372890084981918, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022723044094163924, + "grad_norm": 6.274568557739258, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8559467196464539, + "num_tokens": 295572465.0, + "step": 7749 + }, + { + "epoch": 0.9858796590764534, + "ewc_loss": 0.05380699038505554, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022801131126470864, + "grad_norm": 6.218564510345459, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8657300472259521, + "num_tokens": 295614577.0, + "step": 7750 + }, + { + "epoch": 0.9860068693550439, + "ewc_loss": 0.05372527241706848, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022719411936122924, + "grad_norm": 6.2082390785217285, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8633536100387573, + "num_tokens": 295654780.0, + "step": 7751 + }, + { + "epoch": 0.9861340796336344, + "ewc_loss": 0.053847089409828186, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002284122892888263, + "grad_norm": 6.257167339324951, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.859870195388794, + "num_tokens": 295695880.0, + "step": 7752 + }, + { + "epoch": 0.986261289912225, + "ewc_loss": 0.0537395179271698, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022733656805939972, + "grad_norm": 6.193728446960449, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8606058359146118, + "num_tokens": 295738176.0, + "step": 7753 + }, + { + "epoch": 0.9863885001908154, + "ewc_loss": 0.05377879738807678, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022772936790715903, + "grad_norm": 6.277309417724609, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8499526977539062, + "num_tokens": 295777562.0, + "step": 7754 + }, + { + "epoch": 0.9865157104694059, + "ewc_loss": 0.053758613765239716, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022752756194677204, + "grad_norm": 6.21212100982666, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8771766424179077, + "num_tokens": 295815013.0, + "step": 7755 + }, + { + "epoch": 0.9866429207479964, + "ewc_loss": 0.05382639542222023, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002282053610542789, + "grad_norm": 6.248979568481445, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8489658236503601, + "num_tokens": 295857751.0, + "step": 7756 + }, + { + "epoch": 0.986770131026587, + "ewc_loss": 0.053847894072532654, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022842033649794757, + "grad_norm": 6.2769646644592285, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8664273619651794, + "num_tokens": 295896033.0, + "step": 7757 + }, + { + "epoch": 0.9868973413051775, + "ewc_loss": 0.05374658480286598, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022740726126357913, + "grad_norm": 6.271556854248047, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.863425612449646, + "num_tokens": 295936838.0, + "step": 7758 + }, + { + "epoch": 0.987024551583768, + "ewc_loss": 0.053843237459659576, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022837378492113203, + "grad_norm": 6.231564998626709, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8663026094436646, + "num_tokens": 295972307.0, + "step": 7759 + }, + { + "epoch": 0.9871517618623584, + "ewc_loss": 0.05381183326244354, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022805975459050387, + "grad_norm": 6.371485233306885, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.858069896697998, + "num_tokens": 296009966.0, + "step": 7760 + }, + { + "epoch": 0.987278972140949, + "ewc_loss": 0.05374644696712494, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002274058642797172, + "grad_norm": 6.300476551055908, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8575657606124878, + "num_tokens": 296047874.0, + "step": 7761 + }, + { + "epoch": 0.9874061824195395, + "ewc_loss": 0.05380333960056305, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002279748150613159, + "grad_norm": 6.212976455688477, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8682501316070557, + "num_tokens": 296090591.0, + "step": 7762 + }, + { + "epoch": 0.98753339269813, + "ewc_loss": 0.05383208766579628, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022826227359473705, + "grad_norm": 6.3879852294921875, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8556870222091675, + "num_tokens": 296133024.0, + "step": 7763 + }, + { + "epoch": 0.9876606029767205, + "ewc_loss": 0.05370479077100754, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002269893157063052, + "grad_norm": 6.249103546142578, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8494428396224976, + "num_tokens": 296177347.0, + "step": 7764 + }, + { + "epoch": 0.9877878132553111, + "ewc_loss": 0.05379961431026459, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002279375767102465, + "grad_norm": 6.285559177398682, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8507878184318542, + "num_tokens": 296215532.0, + "step": 7765 + }, + { + "epoch": 0.9879150235339016, + "ewc_loss": 0.053698550909757614, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022692691709380597, + "grad_norm": 6.401391983032227, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8489603996276855, + "num_tokens": 296249494.0, + "step": 7766 + }, + { + "epoch": 0.988042233812492, + "ewc_loss": 0.053662363439798355, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022656504006590694, + "grad_norm": 6.223966598510742, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8520054221153259, + "num_tokens": 296289370.0, + "step": 7767 + }, + { + "epoch": 0.9881694440910825, + "ewc_loss": 0.05373191833496094, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022726060706190765, + "grad_norm": 6.268945217132568, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8619846105575562, + "num_tokens": 296329376.0, + "step": 7768 + }, + { + "epoch": 0.9882966543696731, + "ewc_loss": 0.05371033400297165, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002270447730552405, + "grad_norm": 6.278486251831055, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8667120337486267, + "num_tokens": 296368875.0, + "step": 7769 + }, + { + "epoch": 0.9884238646482636, + "ewc_loss": 0.05379023402929306, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022784373140893877, + "grad_norm": 6.255166530609131, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8588947057723999, + "num_tokens": 296404766.0, + "step": 7770 + }, + { + "epoch": 0.9885510749268541, + "ewc_loss": 0.0537923127412796, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022786451154388487, + "grad_norm": 6.254495620727539, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8584694266319275, + "num_tokens": 296441705.0, + "step": 7771 + }, + { + "epoch": 0.9886782852054447, + "ewc_loss": 0.053742315620183945, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002273645659442991, + "grad_norm": 6.216392993927002, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8669688701629639, + "num_tokens": 296481075.0, + "step": 7772 + }, + { + "epoch": 0.9888054954840351, + "ewc_loss": 0.05384451150894165, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022838650329504162, + "grad_norm": 6.319133758544922, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8570683002471924, + "num_tokens": 296511404.0, + "step": 7773 + }, + { + "epoch": 0.9889327057626256, + "ewc_loss": 0.053867973387241364, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022862115292809904, + "grad_norm": 6.270321369171143, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8504984378814697, + "num_tokens": 296546080.0, + "step": 7774 + }, + { + "epoch": 0.9890599160412161, + "ewc_loss": 0.05387984961271286, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022873988200444728, + "grad_norm": 6.244470596313477, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8621898889541626, + "num_tokens": 296586119.0, + "step": 7775 + }, + { + "epoch": 0.9891871263198067, + "ewc_loss": 0.05389345437288284, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002288759424118325, + "grad_norm": 6.206220626831055, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8587011098861694, + "num_tokens": 296629602.0, + "step": 7776 + }, + { + "epoch": 0.9893143365983972, + "ewc_loss": 0.053890205919742584, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022884344798512757, + "grad_norm": 6.307681083679199, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8536165952682495, + "num_tokens": 296666709.0, + "step": 7777 + }, + { + "epoch": 0.9894415468769877, + "ewc_loss": 0.05389854684472084, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002289268741151318, + "grad_norm": 6.286493301391602, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8598483800888062, + "num_tokens": 296702611.0, + "step": 7778 + }, + { + "epoch": 0.9895687571555781, + "ewc_loss": 0.05385756492614746, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022851707763038576, + "grad_norm": 6.271360397338867, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8662065267562866, + "num_tokens": 296737649.0, + "step": 7779 + }, + { + "epoch": 0.9896959674341687, + "ewc_loss": 0.053859978914260864, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022854121925774962, + "grad_norm": 6.2481255531311035, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8622376918792725, + "num_tokens": 296776915.0, + "step": 7780 + }, + { + "epoch": 0.9898231777127592, + "ewc_loss": 0.053893089294433594, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022887227532919496, + "grad_norm": 6.298525333404541, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8673874735832214, + "num_tokens": 296811491.0, + "step": 7781 + }, + { + "epoch": 0.9899503879913497, + "ewc_loss": 0.05381156876683235, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022805709159001708, + "grad_norm": 6.288109302520752, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8651286959648132, + "num_tokens": 296847602.0, + "step": 7782 + }, + { + "epoch": 0.9900775982699402, + "ewc_loss": 0.053862571716308594, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022856710711494088, + "grad_norm": 6.329073429107666, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8435041308403015, + "num_tokens": 296884737.0, + "step": 7783 + }, + { + "epoch": 0.9902048085485308, + "ewc_loss": 0.053829725831747055, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002282386558363214, + "grad_norm": 6.203357696533203, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8481792211532593, + "num_tokens": 296927780.0, + "step": 7784 + }, + { + "epoch": 0.9903320188271212, + "ewc_loss": 0.05389668792486191, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022890827676746994, + "grad_norm": 6.295331001281738, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8745883107185364, + "num_tokens": 296964200.0, + "step": 7785 + }, + { + "epoch": 0.9904592291057117, + "ewc_loss": 0.053911857306957245, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022905996593181044, + "grad_norm": 6.217863082885742, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.852059006690979, + "num_tokens": 297009927.0, + "step": 7786 + }, + { + "epoch": 0.9905864393843022, + "ewc_loss": 0.05395982041954994, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022953961160965264, + "grad_norm": 6.31089973449707, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.847845733165741, + "num_tokens": 297050788.0, + "step": 7787 + }, + { + "epoch": 0.9907136496628928, + "ewc_loss": 0.053862541913986206, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022856680152472109, + "grad_norm": 6.2138190269470215, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8658050298690796, + "num_tokens": 297092398.0, + "step": 7788 + }, + { + "epoch": 0.9908408599414833, + "ewc_loss": 0.05398895964026451, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022983099916018546, + "grad_norm": 6.336402416229248, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8579330444335938, + "num_tokens": 297130728.0, + "step": 7789 + }, + { + "epoch": 0.9909680702200738, + "ewc_loss": 0.05385669320821762, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002285083319293335, + "grad_norm": 6.205671787261963, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8834215402603149, + "num_tokens": 297169124.0, + "step": 7790 + }, + { + "epoch": 0.9910952804986642, + "ewc_loss": 0.05391047149896622, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002290460979565978, + "grad_norm": 6.311321258544922, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8703943490982056, + "num_tokens": 297204914.0, + "step": 7791 + }, + { + "epoch": 0.9912224907772548, + "ewc_loss": 0.053821638226509094, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022815779084339738, + "grad_norm": 6.22379207611084, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.857138991355896, + "num_tokens": 297240551.0, + "step": 7792 + }, + { + "epoch": 0.9913497010558453, + "ewc_loss": 0.05397545546293259, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022969598649069667, + "grad_norm": 6.242388725280762, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8517321348190308, + "num_tokens": 297287160.0, + "step": 7793 + }, + { + "epoch": 0.9914769113344358, + "ewc_loss": 0.05391101539134979, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022905156947672367, + "grad_norm": 6.239381313323975, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.862317681312561, + "num_tokens": 297323394.0, + "step": 7794 + }, + { + "epoch": 0.9916041216130264, + "ewc_loss": 0.05393153429031372, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022925676603335887, + "grad_norm": 6.341548919677734, + "learning_rate": 1e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8384049534797668, + "num_tokens": 297357808.0, + "step": 7795 + }, + { + "epoch": 0.9917313318916169, + "ewc_loss": 0.05399491637945175, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002298905747011304, + "grad_norm": 6.286245346069336, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8561626672744751, + "num_tokens": 297397300.0, + "step": 7796 + }, + { + "epoch": 0.9918585421702073, + "ewc_loss": 0.05388762056827545, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022881763288751245, + "grad_norm": 6.22752571105957, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8689240217208862, + "num_tokens": 297434699.0, + "step": 7797 + }, + { + "epoch": 0.9919857524487978, + "ewc_loss": 0.053946368396282196, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002294050791533664, + "grad_norm": 6.270772457122803, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8602675795555115, + "num_tokens": 297473939.0, + "step": 7798 + }, + { + "epoch": 0.9921129627273884, + "ewc_loss": 0.053897425532341, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022891565458849072, + "grad_norm": 6.3136796951293945, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8529654145240784, + "num_tokens": 297511154.0, + "step": 7799 + }, + { + "epoch": 0.9922401730059789, + "ewc_loss": 0.05396468937397003, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022958831686992198, + "grad_norm": 6.242799758911133, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.851768970489502, + "num_tokens": 297556092.0, + "step": 7800 + }, + { + "epoch": 0.9923673832845694, + "ewc_loss": 0.053967542946338654, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022961680951993912, + "grad_norm": 6.298675537109375, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8655483722686768, + "num_tokens": 297598160.0, + "step": 7801 + }, + { + "epoch": 0.9924945935631599, + "ewc_loss": 0.0539185106754303, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000229126526392065, + "grad_norm": 6.2574639320373535, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8768079280853271, + "num_tokens": 297638706.0, + "step": 7802 + }, + { + "epoch": 0.9926218038417504, + "ewc_loss": 0.0539647713303566, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022958914632909, + "grad_norm": 6.292490005493164, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8651331663131714, + "num_tokens": 297675955.0, + "step": 7803 + }, + { + "epoch": 0.9927490141203409, + "ewc_loss": 0.05393961817026138, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000229337572818622, + "grad_norm": 6.238497734069824, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8558971881866455, + "num_tokens": 297715362.0, + "step": 7804 + }, + { + "epoch": 0.9928762243989314, + "ewc_loss": 0.05402253568172455, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023016678460408002, + "grad_norm": 6.382193565368652, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8455865979194641, + "num_tokens": 297750931.0, + "step": 7805 + }, + { + "epoch": 0.993003434677522, + "ewc_loss": 0.05396474897861481, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002295888843946159, + "grad_norm": 6.263402462005615, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8516684174537659, + "num_tokens": 297786107.0, + "step": 7806 + }, + { + "epoch": 0.9931306449561125, + "ewc_loss": 0.05393879860639572, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022932938009034842, + "grad_norm": 6.289580821990967, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8595976829528809, + "num_tokens": 297824782.0, + "step": 7807 + }, + { + "epoch": 0.993257855234703, + "ewc_loss": 0.05388597771525383, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002288011892233044, + "grad_norm": 6.291015625, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8566727638244629, + "num_tokens": 297861748.0, + "step": 7808 + }, + { + "epoch": 0.9933850655132934, + "ewc_loss": 0.0538967102766037, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022890852415002882, + "grad_norm": 6.250866413116455, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8608022332191467, + "num_tokens": 297902722.0, + "step": 7809 + }, + { + "epoch": 0.993512275791884, + "ewc_loss": 0.053890421986579895, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022884560166858137, + "grad_norm": 6.189809799194336, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8616898059844971, + "num_tokens": 297948709.0, + "step": 7810 + }, + { + "epoch": 0.9936394860704745, + "ewc_loss": 0.05394691228866577, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022941053612157702, + "grad_norm": 6.310041904449463, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8626819849014282, + "num_tokens": 297986446.0, + "step": 7811 + }, + { + "epoch": 0.993766696349065, + "ewc_loss": 0.05392374098300934, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022917882597539574, + "grad_norm": 6.300677299499512, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8531990051269531, + "num_tokens": 298024004.0, + "step": 7812 + }, + { + "epoch": 0.9938939066276555, + "ewc_loss": 0.05388650298118591, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002288064279127866, + "grad_norm": 6.227848052978516, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8510031700134277, + "num_tokens": 298065518.0, + "step": 7813 + }, + { + "epoch": 0.9940211169062461, + "ewc_loss": 0.053988017141819, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002298215840710327, + "grad_norm": 6.269594669342041, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8536050915718079, + "num_tokens": 298108138.0, + "step": 7814 + }, + { + "epoch": 0.9941483271848366, + "ewc_loss": 0.0539424791932106, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022936622553970665, + "grad_norm": 6.244719505310059, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8594452142715454, + "num_tokens": 298149348.0, + "step": 7815 + }, + { + "epoch": 0.994275537463427, + "ewc_loss": 0.054025813937187195, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023019952641334385, + "grad_norm": 6.28212308883667, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8566890954971313, + "num_tokens": 298186347.0, + "step": 7816 + }, + { + "epoch": 0.9944027477420175, + "ewc_loss": 0.053952496498823166, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022946637182030827, + "grad_norm": 6.300992965698242, + "learning_rate": 1e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8389662504196167, + "num_tokens": 298223322.0, + "step": 7817 + }, + { + "epoch": 0.9945299580206081, + "ewc_loss": 0.05399971082806587, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022993850870989263, + "grad_norm": 6.307724952697754, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8547847867012024, + "num_tokens": 298258794.0, + "step": 7818 + }, + { + "epoch": 0.9946571682991986, + "ewc_loss": 0.05401027575135231, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023004415561445057, + "grad_norm": 6.325485706329346, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8536070585250854, + "num_tokens": 298297510.0, + "step": 7819 + }, + { + "epoch": 0.9947843785777891, + "ewc_loss": 0.05393190681934357, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002292604767717421, + "grad_norm": 6.278179168701172, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8637821674346924, + "num_tokens": 298333784.0, + "step": 7820 + }, + { + "epoch": 0.9949115888563796, + "ewc_loss": 0.053967833518981934, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022961973445490003, + "grad_norm": 6.3017425537109375, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8656841516494751, + "num_tokens": 298371241.0, + "step": 7821 + }, + { + "epoch": 0.9950387991349701, + "ewc_loss": 0.05389583110809326, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022889972024131566, + "grad_norm": 6.300915241241455, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8455647230148315, + "num_tokens": 298407902.0, + "step": 7822 + }, + { + "epoch": 0.9951660094135606, + "ewc_loss": 0.05397358164191246, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022967721452005208, + "grad_norm": 6.303130626678467, + "learning_rate": 1e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8347796201705933, + "num_tokens": 298452339.0, + "step": 7823 + }, + { + "epoch": 0.9952932196921511, + "ewc_loss": 0.053887054324150085, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002288119721924886, + "grad_norm": 6.209099769592285, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8504079580307007, + "num_tokens": 298495854.0, + "step": 7824 + }, + { + "epoch": 0.9954204299707416, + "ewc_loss": 0.05396842211484909, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022962562798056751, + "grad_norm": 6.320180416107178, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8550459146499634, + "num_tokens": 298529642.0, + "step": 7825 + }, + { + "epoch": 0.9955476402493322, + "ewc_loss": 0.05391884595155716, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022912985878065228, + "grad_norm": 6.233633041381836, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8619354367256165, + "num_tokens": 298574396.0, + "step": 7826 + }, + { + "epoch": 0.9956748505279227, + "ewc_loss": 0.053917296230793, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002291143755428493, + "grad_norm": 6.313978672027588, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8462843298912048, + "num_tokens": 298612457.0, + "step": 7827 + }, + { + "epoch": 0.9958020608065131, + "ewc_loss": 0.0539211705327034, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022915311274118721, + "grad_norm": 6.270384311676025, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8600294589996338, + "num_tokens": 298645061.0, + "step": 7828 + }, + { + "epoch": 0.9959292710851037, + "ewc_loss": 0.05393906310200691, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022933204309083521, + "grad_norm": 6.243000507354736, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8569703698158264, + "num_tokens": 298688014.0, + "step": 7829 + }, + { + "epoch": 0.9960564813636942, + "ewc_loss": 0.0539754256606102, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002296956372447312, + "grad_norm": 6.269041061401367, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8633378744125366, + "num_tokens": 298727815.0, + "step": 7830 + }, + { + "epoch": 0.9961836916422847, + "ewc_loss": 0.05394058674573898, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022934727894607931, + "grad_norm": 6.2246012687683105, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8792944550514221, + "num_tokens": 298765663.0, + "step": 7831 + }, + { + "epoch": 0.9963109019208752, + "ewc_loss": 0.05402219668030739, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023016337945591658, + "grad_norm": 6.3114800453186035, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.863114595413208, + "num_tokens": 298799360.0, + "step": 7832 + }, + { + "epoch": 0.9964381121994658, + "ewc_loss": 0.053885020315647125, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022879162861499935, + "grad_norm": 6.218384265899658, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8587689995765686, + "num_tokens": 298835669.0, + "step": 7833 + }, + { + "epoch": 0.9965653224780562, + "ewc_loss": 0.05404597893357277, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023040120140649378, + "grad_norm": 6.381375789642334, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8574389815330505, + "num_tokens": 298866871.0, + "step": 7834 + }, + { + "epoch": 0.9966925327566467, + "ewc_loss": 0.053923994302749634, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022918134345673025, + "grad_norm": 6.236172676086426, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8514127731323242, + "num_tokens": 298904136.0, + "step": 7835 + }, + { + "epoch": 0.9968197430352372, + "ewc_loss": 0.05394995957612991, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022944102238398045, + "grad_norm": 6.276761531829834, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8721988201141357, + "num_tokens": 298937069.0, + "step": 7836 + }, + { + "epoch": 0.9969469533138278, + "ewc_loss": 0.054048579186201096, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023042720567900687, + "grad_norm": 6.303422451019287, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8512871265411377, + "num_tokens": 298976778.0, + "step": 7837 + }, + { + "epoch": 0.9970741635924183, + "ewc_loss": 0.05388692393898964, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022881064796820283, + "grad_norm": 6.2688212394714355, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8693842887878418, + "num_tokens": 299010808.0, + "step": 7838 + }, + { + "epoch": 0.9972013738710088, + "ewc_loss": 0.05424763634800911, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002299763582414016, + "grad_norm": 6.310209274291992, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8479106426239014, + "num_tokens": 299047611.0, + "step": 7839 + }, + { + "epoch": 0.9973285841495992, + "ewc_loss": 0.05394300818443298, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022937146422918886, + "grad_norm": 6.197674751281738, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8655712604522705, + "num_tokens": 299088668.0, + "step": 7840 + }, + { + "epoch": 0.9974557944281898, + "ewc_loss": 0.05422027409076691, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002297027676831931, + "grad_norm": 6.336764812469482, + "learning_rate": 1e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8390626907348633, + "num_tokens": 299126572.0, + "step": 7841 + }, + { + "epoch": 0.9975830047067803, + "ewc_loss": 0.054235681891441345, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00022985684336163104, + "grad_norm": 6.195442199707031, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.87047278881073, + "num_tokens": 299164806.0, + "step": 7842 + }, + { + "epoch": 0.9977102149853708, + "ewc_loss": 0.054250508546829224, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023000506917014718, + "grad_norm": 6.30180025100708, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8566478490829468, + "num_tokens": 299207039.0, + "step": 7843 + }, + { + "epoch": 0.9978374252639614, + "ewc_loss": 0.05420583486557007, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00022955832537263632, + "grad_norm": 6.235093116760254, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8689080476760864, + "num_tokens": 299243242.0, + "step": 7844 + }, + { + "epoch": 0.9979646355425519, + "ewc_loss": 0.05424485355615616, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002299485495314002, + "grad_norm": 6.289770603179932, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8653572797775269, + "num_tokens": 299282326.0, + "step": 7845 + }, + { + "epoch": 0.9980918458211423, + "ewc_loss": 0.05392719432711601, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022921335767023265, + "grad_norm": 6.183388710021973, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.88847416639328, + "num_tokens": 299316645.0, + "step": 7846 + }, + { + "epoch": 0.9982190560997328, + "ewc_loss": 0.054002001881599426, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022996139887254685, + "grad_norm": 6.383915901184082, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8616386651992798, + "num_tokens": 299349932.0, + "step": 7847 + }, + { + "epoch": 0.9983462663783234, + "ewc_loss": 0.0539727658033371, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022966905089560896, + "grad_norm": 6.206663608551025, + "learning_rate": 1e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8387835621833801, + "num_tokens": 299392452.0, + "step": 7848 + }, + { + "epoch": 0.9984734766569139, + "ewc_loss": 0.053984612226486206, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022978754714131355, + "grad_norm": 6.287961483001709, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8658542037010193, + "num_tokens": 299431479.0, + "step": 7849 + }, + { + "epoch": 0.9986006869355044, + "ewc_loss": 0.05396366864442825, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022957808687351644, + "grad_norm": 6.27783203125, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.85737144947052, + "num_tokens": 299475585.0, + "step": 7850 + }, + { + "epoch": 0.9987278972140949, + "ewc_loss": 0.05400747433304787, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023001614317763597, + "grad_norm": 6.264660358428955, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8749749660491943, + "num_tokens": 299514026.0, + "step": 7851 + }, + { + "epoch": 0.9988551074926854, + "ewc_loss": 0.0539148673415184, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022909010294824839, + "grad_norm": 6.283868312835693, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.849302351474762, + "num_tokens": 299550858.0, + "step": 7852 + }, + { + "epoch": 0.9989823177712759, + "ewc_loss": 0.05392342060804367, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022917563910596073, + "grad_norm": 6.390380382537842, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8538163900375366, + "num_tokens": 299586831.0, + "step": 7853 + }, + { + "epoch": 0.9991095280498664, + "ewc_loss": 0.053955547511577606, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022949690173845738, + "grad_norm": 6.277166843414307, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.869377851486206, + "num_tokens": 299623116.0, + "step": 7854 + }, + { + "epoch": 0.9992367383284569, + "ewc_loss": 0.053858865052461624, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022853005793876946, + "grad_norm": 6.244131088256836, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8629359602928162, + "num_tokens": 299662165.0, + "step": 7855 + }, + { + "epoch": 0.9993639486070475, + "ewc_loss": 0.053970396518707275, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022964536037761718, + "grad_norm": 6.266458511352539, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8568793535232544, + "num_tokens": 299702581.0, + "step": 7856 + }, + { + "epoch": 0.999491158885638, + "ewc_loss": 0.053924836218357086, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022918976901564747, + "grad_norm": 6.30111837387085, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8466393351554871, + "num_tokens": 299742559.0, + "step": 7857 + }, + { + "epoch": 0.9996183691642284, + "ewc_loss": 0.053903184831142426, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022897328017279506, + "grad_norm": 6.282350540161133, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.874801754951477, + "num_tokens": 299778637.0, + "step": 7858 + }, + { + "epoch": 0.9997455794428189, + "ewc_loss": 0.053884357213974, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022878496383782476, + "grad_norm": 6.290252208709717, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8694576025009155, + "num_tokens": 299812808.0, + "step": 7859 + }, + { + "epoch": 0.9998727897214095, + "ewc_loss": 0.053934596478939056, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022928739781491458, + "grad_norm": 6.290585517883301, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8568646907806396, + "num_tokens": 299848987.0, + "step": 7860 + }, + { + "epoch": 1.0, + "ewc_loss": 0.0539863184094429, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022980461653787643, + "grad_norm": 6.506913661956787, + "learning_rate": 1e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8429362177848816, + "num_tokens": 299886286.0, + "step": 7861 + }, + { + "epoch": 1.0001272102785905, + "ewc_loss": 0.053793683648109436, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022787823399994522, + "grad_norm": 6.183501243591309, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8565269708633423, + "num_tokens": 299925456.0, + "step": 7862 + }, + { + "epoch": 1.000254420557181, + "ewc_loss": 0.05400557443499565, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022999715292826295, + "grad_norm": 6.362533092498779, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8565572500228882, + "num_tokens": 299965936.0, + "step": 7863 + }, + { + "epoch": 1.0003816308357716, + "ewc_loss": 0.05388666316866875, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022880802862346172, + "grad_norm": 6.239896774291992, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8726485967636108, + "num_tokens": 300003181.0, + "step": 7864 + }, + { + "epoch": 1.0005088411143621, + "ewc_loss": 0.05396807938814163, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022962220828048885, + "grad_norm": 6.297019958496094, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8582534790039062, + "num_tokens": 300040502.0, + "step": 7865 + }, + { + "epoch": 1.0006360513929526, + "ewc_loss": 0.0539926141500473, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022986752446740866, + "grad_norm": 6.30237340927124, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.875725269317627, + "num_tokens": 300078795.0, + "step": 7866 + }, + { + "epoch": 1.0007632616715432, + "ewc_loss": 0.053941357880830765, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022935497690923512, + "grad_norm": 6.339794158935547, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8706326484680176, + "num_tokens": 300115188.0, + "step": 7867 + }, + { + "epoch": 1.0008904719501335, + "ewc_loss": 0.05396825075149536, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022962391085457057, + "grad_norm": 6.297451019287109, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8626636266708374, + "num_tokens": 300158583.0, + "step": 7868 + }, + { + "epoch": 1.001017682228724, + "ewc_loss": 0.05394556373357773, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022939704649616033, + "grad_norm": 6.343690872192383, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8649666905403137, + "num_tokens": 300194415.0, + "step": 7869 + }, + { + "epoch": 1.0011448925073145, + "ewc_loss": 0.05391347035765648, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002290761040057987, + "grad_norm": 6.274204730987549, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8563843369483948, + "num_tokens": 300235066.0, + "step": 7870 + }, + { + "epoch": 1.001272102785905, + "ewc_loss": 0.053990643471479416, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002298478502780199, + "grad_norm": 6.346619129180908, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8634786605834961, + "num_tokens": 300272167.0, + "step": 7871 + }, + { + "epoch": 1.0013993130644956, + "ewc_loss": 0.05389903485774994, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022893176355864853, + "grad_norm": 6.337020397186279, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8522205352783203, + "num_tokens": 300306032.0, + "step": 7872 + }, + { + "epoch": 1.0015265233430861, + "ewc_loss": 0.05396292358636856, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022957065084483474, + "grad_norm": 6.308394908905029, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8579168319702148, + "num_tokens": 300338433.0, + "step": 7873 + }, + { + "epoch": 1.0016537336216766, + "ewc_loss": 0.05400830879807472, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023002449597697705, + "grad_norm": 6.364811897277832, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8515315055847168, + "num_tokens": 300376527.0, + "step": 7874 + }, + { + "epoch": 1.0017809439002672, + "ewc_loss": 0.05397563427686691, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022969777637626976, + "grad_norm": 6.287125110626221, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.878354012966156, + "num_tokens": 300417309.0, + "step": 7875 + }, + { + "epoch": 1.0019081541788577, + "ewc_loss": 0.05395632982254028, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022950471611693501, + "grad_norm": 6.248100280761719, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8624086380004883, + "num_tokens": 300459438.0, + "step": 7876 + }, + { + "epoch": 1.0020353644574482, + "ewc_loss": 0.054053086787462234, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023047227296046913, + "grad_norm": 6.398712158203125, + "learning_rate": 1e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8382822275161743, + "num_tokens": 300501443.0, + "step": 7877 + }, + { + "epoch": 1.0021625747360388, + "ewc_loss": 0.05397739261388779, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00022971535508986562, + "grad_norm": 6.341063976287842, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8694043159484863, + "num_tokens": 300531918.0, + "step": 7878 + }, + { + "epoch": 1.0022897850146293, + "ewc_loss": 0.05402591452002525, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023020054504740983, + "grad_norm": 6.29170036315918, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8549564480781555, + "num_tokens": 300575084.0, + "step": 7879 + }, + { + "epoch": 1.0024169952932196, + "ewc_loss": 0.054120130836963654, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023114270879887044, + "grad_norm": 6.351834774017334, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8617229461669922, + "num_tokens": 300608737.0, + "step": 7880 + }, + { + "epoch": 1.0025442055718101, + "ewc_loss": 0.05400627851486206, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023000419605523348, + "grad_norm": 6.264653205871582, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8484156727790833, + "num_tokens": 300647855.0, + "step": 7881 + }, + { + "epoch": 1.0026714158504006, + "ewc_loss": 0.054139718413352966, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023133857757784426, + "grad_norm": 6.311435222625732, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8615593314170837, + "num_tokens": 300689763.0, + "step": 7882 + }, + { + "epoch": 1.0027986261289912, + "ewc_loss": 0.05404495447874069, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023039097141008824, + "grad_norm": 12.621953010559082, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8719550371170044, + "num_tokens": 300725838.0, + "step": 7883 + }, + { + "epoch": 1.0029258364075817, + "ewc_loss": 0.06265056878328323, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00031644708360545337, + "grad_norm": 7.48220157623291, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8664122819900513, + "num_tokens": 300760423.0, + "step": 7884 + }, + { + "epoch": 1.0030530466861722, + "ewc_loss": 0.052417099475860596, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002141123841283843, + "grad_norm": 5.859328269958496, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8665090799331665, + "num_tokens": 300796236.0, + "step": 7885 + }, + { + "epoch": 1.0031802569647628, + "ewc_loss": 0.05598071217536926, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002497485256753862, + "grad_norm": 6.811868667602539, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8660063147544861, + "num_tokens": 300835213.0, + "step": 7886 + }, + { + "epoch": 1.0033074672433533, + "ewc_loss": 0.05490771681070328, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023901855456642807, + "grad_norm": 6.250843048095703, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8690669536590576, + "num_tokens": 300873363.0, + "step": 7887 + }, + { + "epoch": 1.0034346775219438, + "ewc_loss": 0.054912153631448746, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023906293790787458, + "grad_norm": 6.653052806854248, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8575350046157837, + "num_tokens": 300910233.0, + "step": 7888 + }, + { + "epoch": 1.0035618878005343, + "ewc_loss": 0.054852552711963654, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023846692056395113, + "grad_norm": 6.385796546936035, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.869258463382721, + "num_tokens": 300946986.0, + "step": 7889 + }, + { + "epoch": 1.0036890980791249, + "ewc_loss": 0.05456971004605293, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023563850845675915, + "grad_norm": 6.435400009155273, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8626837134361267, + "num_tokens": 300985750.0, + "step": 7890 + }, + { + "epoch": 1.0038163083577154, + "ewc_loss": 0.05467091500759125, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023665053595323116, + "grad_norm": 13.021382331848145, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8609856367111206, + "num_tokens": 301032820.0, + "step": 7891 + }, + { + "epoch": 1.0039435186363057, + "ewc_loss": 0.06342393904924393, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00032418081536889076, + "grad_norm": 7.524461269378662, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8518332242965698, + "num_tokens": 301070825.0, + "step": 7892 + }, + { + "epoch": 1.0040707289148962, + "ewc_loss": 0.05249364674091339, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021487788762897253, + "grad_norm": 5.917092800140381, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8589446544647217, + "num_tokens": 301110764.0, + "step": 7893 + }, + { + "epoch": 1.0041979391934868, + "ewc_loss": 0.05618492513895035, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002517906541470438, + "grad_norm": 6.820634841918945, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.864254355430603, + "num_tokens": 301152421.0, + "step": 7894 + }, + { + "epoch": 1.0043251494720773, + "ewc_loss": 0.05519745871424675, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00024191600095946342, + "grad_norm": 6.265830993652344, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8622056245803833, + "num_tokens": 301192780.0, + "step": 7895 + }, + { + "epoch": 1.0044523597506678, + "ewc_loss": 0.05490414425730705, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023898284416645765, + "grad_norm": 6.606211185455322, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.858165979385376, + "num_tokens": 301228147.0, + "step": 7896 + }, + { + "epoch": 1.0045795700292584, + "ewc_loss": 0.05498766526579857, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023981805134098977, + "grad_norm": 6.394938945770264, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8708740472793579, + "num_tokens": 301259568.0, + "step": 7897 + }, + { + "epoch": 1.0047067803078489, + "ewc_loss": 0.054765842854976654, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002375998446950689, + "grad_norm": 6.446751594543457, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8619527220726013, + "num_tokens": 301296483.0, + "step": 7898 + }, + { + "epoch": 1.0048339905864394, + "ewc_loss": 0.05462802201509476, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023622160369995981, + "grad_norm": 6.293829917907715, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.863171398639679, + "num_tokens": 301337437.0, + "step": 7899 + }, + { + "epoch": 1.00496120086503, + "ewc_loss": 0.05467737466096878, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002367151464568451, + "grad_norm": 6.4972825050354, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8610497713088989, + "num_tokens": 301368777.0, + "step": 7900 + }, + { + "epoch": 1.0050884111436205, + "ewc_loss": 0.05447005480527878, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002346419496461749, + "grad_norm": 6.3045501708984375, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8590807914733887, + "num_tokens": 301404885.0, + "step": 7901 + }, + { + "epoch": 1.005215621422211, + "ewc_loss": 0.05445978045463562, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023453918402083218, + "grad_norm": 6.3665971755981445, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8511021137237549, + "num_tokens": 301445308.0, + "step": 7902 + }, + { + "epoch": 1.0053428317008015, + "ewc_loss": 0.05446483939886093, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023458981013391167, + "grad_norm": 6.40200138092041, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8599003553390503, + "num_tokens": 301484845.0, + "step": 7903 + }, + { + "epoch": 1.0054700419793918, + "ewc_loss": 0.054274000227451324, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002326813992112875, + "grad_norm": 6.280980587005615, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8601222038269043, + "num_tokens": 301524498.0, + "step": 7904 + }, + { + "epoch": 1.0055972522579824, + "ewc_loss": 0.05442265421152115, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023416795011144131, + "grad_norm": 6.379932403564453, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8668330311775208, + "num_tokens": 301565522.0, + "step": 7905 + }, + { + "epoch": 1.0057244625365729, + "ewc_loss": 0.054292500019073486, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023286641226150095, + "grad_norm": 6.337721347808838, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8572109341621399, + "num_tokens": 301603724.0, + "step": 7906 + }, + { + "epoch": 1.0058516728151634, + "ewc_loss": 0.05426386743783951, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023258005967363715, + "grad_norm": 6.380524158477783, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8590900897979736, + "num_tokens": 301636650.0, + "step": 7907 + }, + { + "epoch": 1.005978883093754, + "ewc_loss": 0.05430176854133606, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002329590788576752, + "grad_norm": 6.340710639953613, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8511948585510254, + "num_tokens": 301675328.0, + "step": 7908 + }, + { + "epoch": 1.0061060933723445, + "ewc_loss": 0.05425969511270523, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023253835388459265, + "grad_norm": 6.501033782958984, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8550795912742615, + "num_tokens": 301710058.0, + "step": 7909 + }, + { + "epoch": 1.006233303650935, + "ewc_loss": 0.05410540848970413, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023099548707250506, + "grad_norm": 6.242734909057617, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8548423051834106, + "num_tokens": 301753567.0, + "step": 7910 + }, + { + "epoch": 1.0063605139295255, + "ewc_loss": 0.0542202964425087, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002321443462278694, + "grad_norm": 6.329929351806641, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8688448667526245, + "num_tokens": 301792895.0, + "step": 7911 + }, + { + "epoch": 1.006487724208116, + "ewc_loss": 0.054173536598682404, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023167677863966674, + "grad_norm": 6.334715843200684, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8732563853263855, + "num_tokens": 301833001.0, + "step": 7912 + }, + { + "epoch": 1.0066149344867066, + "ewc_loss": 0.054138410836458206, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002313255099579692, + "grad_norm": 6.286123752593994, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8640381097793579, + "num_tokens": 301871623.0, + "step": 7913 + }, + { + "epoch": 1.006742144765297, + "ewc_loss": 0.054268836975097656, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023262978356797248, + "grad_norm": 6.355788230895996, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8728369474411011, + "num_tokens": 301903750.0, + "step": 7914 + }, + { + "epoch": 1.0068693550438876, + "ewc_loss": 0.05417467653751373, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023168817278929055, + "grad_norm": 6.279362678527832, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.862358808517456, + "num_tokens": 301943079.0, + "step": 7915 + }, + { + "epoch": 1.0069965653224782, + "ewc_loss": 0.05435379594564438, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023347933893091977, + "grad_norm": 6.363182544708252, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8669902682304382, + "num_tokens": 301980031.0, + "step": 7916 + }, + { + "epoch": 1.0071237756010685, + "ewc_loss": 0.05422551929950714, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023219661670736969, + "grad_norm": 6.341818809509277, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.860801100730896, + "num_tokens": 302020454.0, + "step": 7917 + }, + { + "epoch": 1.007250985879659, + "ewc_loss": 0.05425529181957245, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023249431978911161, + "grad_norm": 6.328134536743164, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.856499433517456, + "num_tokens": 302064391.0, + "step": 7918 + }, + { + "epoch": 1.0073781961582495, + "ewc_loss": 0.05427287518978119, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023267015058081597, + "grad_norm": 6.336272716522217, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8607747554779053, + "num_tokens": 302101598.0, + "step": 7919 + }, + { + "epoch": 1.00750540643684, + "ewc_loss": 0.05428243428468704, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002327657421119511, + "grad_norm": 6.3502068519592285, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.865247368812561, + "num_tokens": 302138803.0, + "step": 7920 + }, + { + "epoch": 1.0076326167154306, + "ewc_loss": 0.05425379425287247, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023247937497217208, + "grad_norm": 6.360497951507568, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8594372868537903, + "num_tokens": 302173682.0, + "step": 7921 + }, + { + "epoch": 1.0077598269940211, + "ewc_loss": 0.05426885187625885, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023262991453520954, + "grad_norm": 6.353416442871094, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8580697774887085, + "num_tokens": 302207390.0, + "step": 7922 + }, + { + "epoch": 1.0078870372726116, + "ewc_loss": 0.054289910942316055, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002328405244043097, + "grad_norm": 6.356391429901123, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8660644292831421, + "num_tokens": 302244440.0, + "step": 7923 + }, + { + "epoch": 1.0080142475512022, + "ewc_loss": 0.05423668771982193, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002323083026567474, + "grad_norm": 6.440166473388672, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8557091951370239, + "num_tokens": 302281058.0, + "step": 7924 + }, + { + "epoch": 1.0081414578297927, + "ewc_loss": 0.05418761074542999, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023181751021184027, + "grad_norm": 6.271329879760742, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.877213180065155, + "num_tokens": 302319586.0, + "step": 7925 + }, + { + "epoch": 1.0082686681083832, + "ewc_loss": 0.05426158010959625, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023255721316672862, + "grad_norm": 6.416438102722168, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8564523458480835, + "num_tokens": 302361356.0, + "step": 7926 + }, + { + "epoch": 1.0083958783869738, + "ewc_loss": 0.05413287132978439, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023127012536861002, + "grad_norm": 6.249924182891846, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8710498213768005, + "num_tokens": 302399974.0, + "step": 7927 + }, + { + "epoch": 1.0085230886655643, + "ewc_loss": 0.05431021377444267, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023304353817366064, + "grad_norm": 6.4324517250061035, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8650752305984497, + "num_tokens": 302435709.0, + "step": 7928 + }, + { + "epoch": 1.0086502989441546, + "ewc_loss": 0.05421559885144234, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023209740174934268, + "grad_norm": 6.306856155395508, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8574727773666382, + "num_tokens": 302474151.0, + "step": 7929 + }, + { + "epoch": 1.0087775092227451, + "ewc_loss": 0.054299041628837585, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023293182312045246, + "grad_norm": 6.434507369995117, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8631076812744141, + "num_tokens": 302509877.0, + "step": 7930 + }, + { + "epoch": 1.0089047195013356, + "ewc_loss": 0.05422569811344147, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023219839204102755, + "grad_norm": 6.275032043457031, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8745372295379639, + "num_tokens": 302547689.0, + "step": 7931 + }, + { + "epoch": 1.0090319297799262, + "ewc_loss": 0.054282642900943756, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023276780848391354, + "grad_norm": 6.344053745269775, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8597229719161987, + "num_tokens": 302590728.0, + "step": 7932 + }, + { + "epoch": 1.0091591400585167, + "ewc_loss": 0.05417201668024063, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023166158644016832, + "grad_norm": 6.315043926239014, + "learning_rate": 1e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8379456400871277, + "num_tokens": 302625709.0, + "step": 7933 + }, + { + "epoch": 1.0092863503371072, + "ewc_loss": 0.054245658218860626, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023239800066221505, + "grad_norm": 6.31273889541626, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8692926168441772, + "num_tokens": 302662086.0, + "step": 7934 + }, + { + "epoch": 1.0094135606156978, + "ewc_loss": 0.05422089993953705, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002321503998246044, + "grad_norm": 6.276815414428711, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8798133134841919, + "num_tokens": 302703680.0, + "step": 7935 + }, + { + "epoch": 1.0095407708942883, + "ewc_loss": 0.054241083562374115, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002323522203369066, + "grad_norm": 6.3232269287109375, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8456177711486816, + "num_tokens": 302740426.0, + "step": 7936 + }, + { + "epoch": 1.0096679811728788, + "ewc_loss": 0.05425648391246796, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023250625235959888, + "grad_norm": 6.280002593994141, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8739171028137207, + "num_tokens": 302780778.0, + "step": 7937 + }, + { + "epoch": 1.0097951914514693, + "ewc_loss": 0.054242320358753204, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023236461856868118, + "grad_norm": 6.28943395614624, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8662323951721191, + "num_tokens": 302816012.0, + "step": 7938 + }, + { + "epoch": 1.0099224017300599, + "ewc_loss": 0.05437098443508148, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023365124070551246, + "grad_norm": 6.4413909912109375, + "learning_rate": 1e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8397036790847778, + "num_tokens": 302858325.0, + "step": 7939 + }, + { + "epoch": 1.0100496120086504, + "ewc_loss": 0.0542086660861969, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023202809097710997, + "grad_norm": 6.296483993530273, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8632491827011108, + "num_tokens": 302898210.0, + "step": 7940 + }, + { + "epoch": 1.0101768222872407, + "ewc_loss": 0.054342858493328094, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023336999583989382, + "grad_norm": 6.337255477905273, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8628331422805786, + "num_tokens": 302936072.0, + "step": 7941 + }, + { + "epoch": 1.0103040325658312, + "ewc_loss": 0.05428031086921692, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023274451086763293, + "grad_norm": 6.363775730133057, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8530438542366028, + "num_tokens": 302978941.0, + "step": 7942 + }, + { + "epoch": 1.0104312428444218, + "ewc_loss": 0.0542810894548893, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002327522961422801, + "grad_norm": 6.327401161193848, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8542351722717285, + "num_tokens": 303016664.0, + "step": 7943 + }, + { + "epoch": 1.0105584531230123, + "ewc_loss": 0.05427908897399902, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023273231636267155, + "grad_norm": 6.285451412200928, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8569761514663696, + "num_tokens": 303063313.0, + "step": 7944 + }, + { + "epoch": 1.0106856634016028, + "ewc_loss": 0.05422262102365494, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023216762929223478, + "grad_norm": 6.348325729370117, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8506388664245605, + "num_tokens": 303103786.0, + "step": 7945 + }, + { + "epoch": 1.0108128736801933, + "ewc_loss": 0.05430884286761284, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023302983026951551, + "grad_norm": 6.362133026123047, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8693050146102905, + "num_tokens": 303136928.0, + "step": 7946 + }, + { + "epoch": 1.0109400839587839, + "ewc_loss": 0.054206572473049164, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023200710711535066, + "grad_norm": 6.321108341217041, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8693881034851074, + "num_tokens": 303174854.0, + "step": 7947 + }, + { + "epoch": 1.0110672942373744, + "ewc_loss": 0.05427879840135574, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023272939142771065, + "grad_norm": 6.3134026527404785, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8590699434280396, + "num_tokens": 303214234.0, + "step": 7948 + }, + { + "epoch": 1.011194504515965, + "ewc_loss": 0.054275233298540115, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023269373923540115, + "grad_norm": 6.278472900390625, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.875040590763092, + "num_tokens": 303251830.0, + "step": 7949 + }, + { + "epoch": 1.0113217147945555, + "ewc_loss": 0.05430196225643158, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023296104336623102, + "grad_norm": 6.313610553741455, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8514930605888367, + "num_tokens": 303291564.0, + "step": 7950 + }, + { + "epoch": 1.011448925073146, + "ewc_loss": 0.054272204637527466, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002326634421478957, + "grad_norm": 6.311013221740723, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8579064607620239, + "num_tokens": 303328642.0, + "step": 7951 + }, + { + "epoch": 1.0115761353517365, + "ewc_loss": 0.0543266162276268, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023320755281019956, + "grad_norm": 6.317074775695801, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8598443269729614, + "num_tokens": 303373693.0, + "step": 7952 + }, + { + "epoch": 1.0117033456303268, + "ewc_loss": 0.05431841313838959, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023312553821597248, + "grad_norm": 6.351775169372559, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8657130002975464, + "num_tokens": 303407124.0, + "step": 7953 + }, + { + "epoch": 1.0118305559089174, + "ewc_loss": 0.054252997040748596, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023247135686688125, + "grad_norm": 6.301333904266357, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8727798461914062, + "num_tokens": 303441586.0, + "step": 7954 + }, + { + "epoch": 1.0119577661875079, + "ewc_loss": 0.054395463317632675, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023389603302348405, + "grad_norm": 6.370591640472412, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8631035685539246, + "num_tokens": 303473593.0, + "step": 7955 + }, + { + "epoch": 1.0120849764660984, + "ewc_loss": 0.054307762533426285, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023301903274841607, + "grad_norm": 6.306540012359619, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.85826176404953, + "num_tokens": 303510239.0, + "step": 7956 + }, + { + "epoch": 1.012212186744689, + "ewc_loss": 0.05434276908636093, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002333691081730649, + "grad_norm": 14.563253402709961, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8591268062591553, + "num_tokens": 303544709.0, + "step": 7957 + }, + { + "epoch": 1.0123393970232795, + "ewc_loss": 0.06597774475812912, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00034971884451806545, + "grad_norm": 7.942681312561035, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8709224462509155, + "num_tokens": 303581990.0, + "step": 7958 + }, + { + "epoch": 1.01246660730187, + "ewc_loss": 0.05221328139305115, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00021207419922575355, + "grad_norm": 5.5243353843688965, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8644813299179077, + "num_tokens": 303625167.0, + "step": 7959 + }, + { + "epoch": 1.0125938175804605, + "ewc_loss": 0.05797900632023811, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00026973147760145366, + "grad_norm": 7.19186544418335, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8627090454101562, + "num_tokens": 303664751.0, + "step": 7960 + }, + { + "epoch": 1.012721027859051, + "ewc_loss": 0.05670684948563576, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002570099022705108, + "grad_norm": 6.356761455535889, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8587934970855713, + "num_tokens": 303702190.0, + "step": 7961 + }, + { + "epoch": 1.0128482381376416, + "ewc_loss": 0.055985935032367706, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00024980073794722557, + "grad_norm": 6.6349663734436035, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8754626512527466, + "num_tokens": 303742872.0, + "step": 7962 + }, + { + "epoch": 1.012975448416232, + "ewc_loss": 0.05618581175804138, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00025179953081533313, + "grad_norm": 6.456187725067139, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8549745082855225, + "num_tokens": 303781265.0, + "step": 7963 + }, + { + "epoch": 1.0131026586948226, + "ewc_loss": 0.05544823035597801, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00024442371795885265, + "grad_norm": 6.454559803009033, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8614286184310913, + "num_tokens": 303820686.0, + "step": 7964 + }, + { + "epoch": 1.0132298689734132, + "ewc_loss": 0.05564180389046669, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00024635944282636046, + "grad_norm": 6.421056270599365, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8641075491905212, + "num_tokens": 303863428.0, + "step": 7965 + }, + { + "epoch": 1.0133570792520035, + "ewc_loss": 0.05528039112687111, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00024274531460832804, + "grad_norm": 6.385821342468262, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.857776403427124, + "num_tokens": 303903847.0, + "step": 7966 + }, + { + "epoch": 1.013484289530594, + "ewc_loss": 0.05519033223390579, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00024184472567867488, + "grad_norm": 6.392910957336426, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8531407713890076, + "num_tokens": 303943083.0, + "step": 7967 + }, + { + "epoch": 1.0136114998091845, + "ewc_loss": 0.0553949736058712, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002414497284917161, + "grad_norm": 6.357711315155029, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8712382316589355, + "num_tokens": 303978773.0, + "step": 7968 + }, + { + "epoch": 1.013738710087775, + "ewc_loss": 0.05525237321853638, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024002374266274273, + "grad_norm": 6.372599124908447, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.868942141532898, + "num_tokens": 304017058.0, + "step": 7969 + }, + { + "epoch": 1.0138659203663656, + "ewc_loss": 0.05493242293596268, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023926564608700573, + "grad_norm": 6.372786521911621, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.852756142616272, + "num_tokens": 304054104.0, + "step": 7970 + }, + { + "epoch": 1.013993130644956, + "ewc_loss": 0.05489785224199295, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023891995078884065, + "grad_norm": 6.356428623199463, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8760172128677368, + "num_tokens": 304091100.0, + "step": 7971 + }, + { + "epoch": 1.0141203409235466, + "ewc_loss": 0.054749440401792526, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023743580095469952, + "grad_norm": 6.361630439758301, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8624453544616699, + "num_tokens": 304127747.0, + "step": 7972 + }, + { + "epoch": 1.0142475512021372, + "ewc_loss": 0.054651856422424316, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023645999317523092, + "grad_norm": 6.288094520568848, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8618859052658081, + "num_tokens": 304171089.0, + "step": 7973 + }, + { + "epoch": 1.0143747614807277, + "ewc_loss": 0.05468681827187538, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002368095883866772, + "grad_norm": 6.363733768463135, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8519544005393982, + "num_tokens": 304210207.0, + "step": 7974 + }, + { + "epoch": 1.0145019717593182, + "ewc_loss": 0.054626770317554474, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002362091327086091, + "grad_norm": 6.291159629821777, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8779948353767395, + "num_tokens": 304249423.0, + "step": 7975 + }, + { + "epoch": 1.0146291820379088, + "ewc_loss": 0.05466659739613533, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023660737497266382, + "grad_norm": 6.351208209991455, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8662984371185303, + "num_tokens": 304286872.0, + "step": 7976 + }, + { + "epoch": 1.0147563923164993, + "ewc_loss": 0.054616957902908325, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000236110994592309, + "grad_norm": 6.361128330230713, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.866176962852478, + "num_tokens": 304323033.0, + "step": 7977 + }, + { + "epoch": 1.0148836025950896, + "ewc_loss": 0.054527632892131805, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002352177252760157, + "grad_norm": 6.324017524719238, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8647661209106445, + "num_tokens": 304360096.0, + "step": 7978 + }, + { + "epoch": 1.0150108128736801, + "ewc_loss": 0.054682742804288864, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023676882847212255, + "grad_norm": 6.43245792388916, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8820021748542786, + "num_tokens": 304396364.0, + "step": 7979 + }, + { + "epoch": 1.0151380231522706, + "ewc_loss": 0.054455943405628204, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023450085427612066, + "grad_norm": 6.281766414642334, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8639129400253296, + "num_tokens": 304431583.0, + "step": 7980 + }, + { + "epoch": 1.0152652334308612, + "ewc_loss": 0.05465773493051529, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023651873925700784, + "grad_norm": 6.346978187561035, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8566920757293701, + "num_tokens": 304470531.0, + "step": 7981 + }, + { + "epoch": 1.0153924437094517, + "ewc_loss": 0.05456313490867615, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023557274835184216, + "grad_norm": 6.3094162940979, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8665769696235657, + "num_tokens": 304505187.0, + "step": 7982 + }, + { + "epoch": 1.0155196539880422, + "ewc_loss": 0.054622821509838104, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023616960970684886, + "grad_norm": 6.408109188079834, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8398846983909607, + "num_tokens": 304542685.0, + "step": 7983 + }, + { + "epoch": 1.0156468642666328, + "ewc_loss": 0.054545409977436066, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023539549147244543, + "grad_norm": 6.29440450668335, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8616785407066345, + "num_tokens": 304579204.0, + "step": 7984 + }, + { + "epoch": 1.0157740745452233, + "ewc_loss": 0.05472709238529205, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002372123271925375, + "grad_norm": 6.361518859863281, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.868240237236023, + "num_tokens": 304615224.0, + "step": 7985 + }, + { + "epoch": 1.0159012848238138, + "ewc_loss": 0.054489701986312866, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023483844415750355, + "grad_norm": 7.177881717681885, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8576302528381348, + "num_tokens": 304653513.0, + "step": 7986 + }, + { + "epoch": 1.0160284951024043, + "ewc_loss": 0.05464540049433708, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023639541177544743, + "grad_norm": 6.278388977050781, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8673859238624573, + "num_tokens": 304686744.0, + "step": 7987 + }, + { + "epoch": 1.0161557053809949, + "ewc_loss": 0.05483078211545944, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023580783454235643, + "grad_norm": 6.415839672088623, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.861117422580719, + "num_tokens": 304727262.0, + "step": 7988 + }, + { + "epoch": 1.0162829156595854, + "ewc_loss": 0.05432375147938728, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023317892919294536, + "grad_norm": 6.265637397766113, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8599153757095337, + "num_tokens": 304763418.0, + "step": 7989 + }, + { + "epoch": 1.0164101259381757, + "ewc_loss": 0.054645948112010956, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002364008832955733, + "grad_norm": 6.402588367462158, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.865128219127655, + "num_tokens": 304800763.0, + "step": 7990 + }, + { + "epoch": 1.0165373362167662, + "ewc_loss": 0.05449362099170685, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023487758880946785, + "grad_norm": 6.3284525871276855, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8805631399154663, + "num_tokens": 304833136.0, + "step": 7991 + }, + { + "epoch": 1.0166645464953568, + "ewc_loss": 0.05451822280883789, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002351236471440643, + "grad_norm": 6.3559794425964355, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8708866834640503, + "num_tokens": 304873208.0, + "step": 7992 + }, + { + "epoch": 1.0167917567739473, + "ewc_loss": 0.054429568350315094, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023423708626069129, + "grad_norm": 6.306850433349609, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8629871010780334, + "num_tokens": 304914237.0, + "step": 7993 + }, + { + "epoch": 1.0169189670525378, + "ewc_loss": 0.05461140722036362, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023605547903571278, + "grad_norm": 6.349688529968262, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8773686289787292, + "num_tokens": 304955453.0, + "step": 7994 + }, + { + "epoch": 1.0170461773311283, + "ewc_loss": 0.05440295860171318, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023397098993882537, + "grad_norm": 6.355840682983398, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8614410161972046, + "num_tokens": 304990533.0, + "step": 7995 + }, + { + "epoch": 1.0171733876097189, + "ewc_loss": 0.054537199437618256, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023531341867055744, + "grad_norm": 6.376094818115234, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8615977764129639, + "num_tokens": 305024849.0, + "step": 7996 + }, + { + "epoch": 1.0173005978883094, + "ewc_loss": 0.05446332320570946, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002345746470382437, + "grad_norm": 6.342099666595459, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.858165442943573, + "num_tokens": 305061615.0, + "step": 7997 + }, + { + "epoch": 1.0174278081669, + "ewc_loss": 0.0545065775513649, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023500715906266123, + "grad_norm": 6.339099407196045, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8427977561950684, + "num_tokens": 305102251.0, + "step": 7998 + }, + { + "epoch": 1.0175550184454905, + "ewc_loss": 0.05475309118628502, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023503090778831393, + "grad_norm": 6.375155925750732, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8720533847808838, + "num_tokens": 305135405.0, + "step": 7999 + }, + { + "epoch": 1.017682228724081, + "ewc_loss": 0.05447465926408768, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023468797735404223, + "grad_norm": 6.345605850219727, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8616732358932495, + "num_tokens": 305169005.0, + "step": 8000 + }, + { + "epoch": 1.0178094390026715, + "ewc_loss": 0.05452593415975571, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023520072863902897, + "grad_norm": 6.340043067932129, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8582199215888977, + "num_tokens": 305209050.0, + "step": 8001 + }, + { + "epoch": 1.0179366492812618, + "ewc_loss": 0.054527804255485535, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023521942785009742, + "grad_norm": 6.367950916290283, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8628814220428467, + "num_tokens": 305238250.0, + "step": 8002 + }, + { + "epoch": 1.0180638595598523, + "ewc_loss": 0.05455462634563446, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002354876633035019, + "grad_norm": 6.316372394561768, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8658236265182495, + "num_tokens": 305283544.0, + "step": 8003 + }, + { + "epoch": 1.0181910698384429, + "ewc_loss": 0.05481792241334915, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023567922471556813, + "grad_norm": 6.368300437927246, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8445665240287781, + "num_tokens": 305320548.0, + "step": 8004 + }, + { + "epoch": 1.0183182801170334, + "ewc_loss": 0.05477195978164673, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002352195733692497, + "grad_norm": 6.326946258544922, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8698060512542725, + "num_tokens": 305359560.0, + "step": 8005 + }, + { + "epoch": 1.018445490395624, + "ewc_loss": 0.05479367449879646, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023543674615211785, + "grad_norm": 6.351098537445068, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.858126163482666, + "num_tokens": 305396680.0, + "step": 8006 + }, + { + "epoch": 1.0185727006742145, + "ewc_loss": 0.05477125942707062, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023521261755377054, + "grad_norm": 6.29442024230957, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8732826709747314, + "num_tokens": 305432856.0, + "step": 8007 + }, + { + "epoch": 1.018699910952805, + "ewc_loss": 0.054809071123600006, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023559069086331874, + "grad_norm": 6.367228031158447, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8645939826965332, + "num_tokens": 305469973.0, + "step": 8008 + }, + { + "epoch": 1.0188271212313955, + "ewc_loss": 0.05455610528588295, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023550246260128915, + "grad_norm": 6.361477375030518, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8678932785987854, + "num_tokens": 305505576.0, + "step": 8009 + }, + { + "epoch": 1.018954331509986, + "ewc_loss": 0.054518215358257294, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023512357438448817, + "grad_norm": 6.405478000640869, + "learning_rate": 1e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8421491980552673, + "num_tokens": 305542037.0, + "step": 8010 + }, + { + "epoch": 1.0190815417885766, + "ewc_loss": 0.05446593463420868, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023460073862224817, + "grad_norm": 6.297861099243164, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8701345920562744, + "num_tokens": 305587784.0, + "step": 8011 + }, + { + "epoch": 1.019208752067167, + "ewc_loss": 0.054586391896009445, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023580533161293715, + "grad_norm": 6.385764122009277, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8546566963195801, + "num_tokens": 305621888.0, + "step": 8012 + }, + { + "epoch": 1.0193359623457576, + "ewc_loss": 0.054412804543972015, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023406946274917573, + "grad_norm": 6.265651226043701, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8812378644943237, + "num_tokens": 305663844.0, + "step": 8013 + }, + { + "epoch": 1.0194631726243482, + "ewc_loss": 0.0548684298992157, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002361843071412295, + "grad_norm": 6.424071311950684, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8765468597412109, + "num_tokens": 305695913.0, + "step": 8014 + }, + { + "epoch": 1.0195903829029385, + "ewc_loss": 0.05446760356426239, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023461745877284557, + "grad_norm": 6.298621654510498, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.859337568283081, + "num_tokens": 305733632.0, + "step": 8015 + }, + { + "epoch": 1.019717593181529, + "ewc_loss": 0.05458436906337738, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023578511900268495, + "grad_norm": 6.441924571990967, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8474129438400269, + "num_tokens": 305766917.0, + "step": 8016 + }, + { + "epoch": 1.0198448034601195, + "ewc_loss": 0.05450138449668884, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023495523782912642, + "grad_norm": 6.332422256469727, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.857746958732605, + "num_tokens": 305805768.0, + "step": 8017 + }, + { + "epoch": 1.01997201373871, + "ewc_loss": 0.054529160261154175, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023523299023509026, + "grad_norm": 6.361052989959717, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8647665977478027, + "num_tokens": 305843412.0, + "step": 8018 + }, + { + "epoch": 1.0200992240173006, + "ewc_loss": 0.054520584642887115, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002351472357986495, + "grad_norm": 6.330571174621582, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8779278993606567, + "num_tokens": 305880523.0, + "step": 8019 + }, + { + "epoch": 1.020226434295891, + "ewc_loss": 0.054830994457006454, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023580994457006454, + "grad_norm": 6.429399490356445, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8630959987640381, + "num_tokens": 305915379.0, + "step": 8020 + }, + { + "epoch": 1.0203536445744816, + "ewc_loss": 0.05451986566185951, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002351400617044419, + "grad_norm": 6.356773376464844, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8774334192276001, + "num_tokens": 305952471.0, + "step": 8021 + }, + { + "epoch": 1.0204808548530722, + "ewc_loss": 0.054480478167533875, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023474621411878616, + "grad_norm": 6.337831497192383, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8619219064712524, + "num_tokens": 305994023.0, + "step": 8022 + }, + { + "epoch": 1.0206080651316627, + "ewc_loss": 0.05453522503376007, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023529367172159255, + "grad_norm": 6.424417972564697, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8677953481674194, + "num_tokens": 306026563.0, + "step": 8023 + }, + { + "epoch": 1.0207352754102532, + "ewc_loss": 0.05437647923827171, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023370620328933, + "grad_norm": 6.270783424377441, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.858017086982727, + "num_tokens": 306067913.0, + "step": 8024 + }, + { + "epoch": 1.0208624856888437, + "ewc_loss": 0.05453186109662056, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023526001314166933, + "grad_norm": 6.385722637176514, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8610609769821167, + "num_tokens": 306108692.0, + "step": 8025 + }, + { + "epoch": 1.0209896959674343, + "ewc_loss": 0.054445087909698486, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023439231154043227, + "grad_norm": 6.31939697265625, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8622674942016602, + "num_tokens": 306148082.0, + "step": 8026 + }, + { + "epoch": 1.0211169062460246, + "ewc_loss": 0.05448193475604057, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002347607514820993, + "grad_norm": 6.354772090911865, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.869667649269104, + "num_tokens": 306182800.0, + "step": 8027 + }, + { + "epoch": 1.021244116524615, + "ewc_loss": 0.05450388789176941, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023498029622714967, + "grad_norm": 6.296171188354492, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8608229160308838, + "num_tokens": 306226741.0, + "step": 8028 + }, + { + "epoch": 1.0213713268032056, + "ewc_loss": 0.05452559515833855, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023519735259469599, + "grad_norm": 6.419381618499756, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8615546226501465, + "num_tokens": 306265993.0, + "step": 8029 + }, + { + "epoch": 1.0214985370817962, + "ewc_loss": 0.05443331599235535, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002342745865462348, + "grad_norm": 6.278038501739502, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8657457232475281, + "num_tokens": 306306511.0, + "step": 8030 + }, + { + "epoch": 1.0216257473603867, + "ewc_loss": 0.05456206947565079, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023556208179797977, + "grad_norm": 6.4599409103393555, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8783698081970215, + "num_tokens": 306337687.0, + "step": 8031 + }, + { + "epoch": 1.0217529576389772, + "ewc_loss": 0.054469216614961624, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023463356774300337, + "grad_norm": 6.313663482666016, + "learning_rate": 1e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8280814290046692, + "num_tokens": 306379646.0, + "step": 8032 + }, + { + "epoch": 1.0218801679175677, + "ewc_loss": 0.05466734617948532, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023661486920900643, + "grad_norm": 6.398857116699219, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8632229566574097, + "num_tokens": 306415389.0, + "step": 8033 + }, + { + "epoch": 1.0220073781961583, + "ewc_loss": 0.054504022002220154, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002349816495552659, + "grad_norm": 6.3470048904418945, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8585260510444641, + "num_tokens": 306452035.0, + "step": 8034 + }, + { + "epoch": 1.0221345884747488, + "ewc_loss": 0.054609715938568115, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023603854060638696, + "grad_norm": 6.316596984863281, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8617234230041504, + "num_tokens": 306496277.0, + "step": 8035 + }, + { + "epoch": 1.0222617987533393, + "ewc_loss": 0.05457057058811188, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023564710863865912, + "grad_norm": 6.357988357543945, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8683004379272461, + "num_tokens": 306528212.0, + "step": 8036 + }, + { + "epoch": 1.0223890090319299, + "ewc_loss": 0.054596252739429474, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002359039062866941, + "grad_norm": 6.376499652862549, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8668981790542603, + "num_tokens": 306562047.0, + "step": 8037 + }, + { + "epoch": 1.0225162193105204, + "ewc_loss": 0.054556168615818024, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023550308833364397, + "grad_norm": 6.381964683532715, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8520261645317078, + "num_tokens": 306601298.0, + "step": 8038 + }, + { + "epoch": 1.0226434295891107, + "ewc_loss": 0.0545281320810318, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023522274568676949, + "grad_norm": 6.303845405578613, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8624745607376099, + "num_tokens": 306640961.0, + "step": 8039 + }, + { + "epoch": 1.0227706398677012, + "ewc_loss": 0.05467374622821808, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023667885398026556, + "grad_norm": 6.330672740936279, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.880144476890564, + "num_tokens": 306679507.0, + "step": 8040 + }, + { + "epoch": 1.0228978501462918, + "ewc_loss": 0.054506778717041016, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023500918177887797, + "grad_norm": 6.335025310516357, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8534330129623413, + "num_tokens": 306717337.0, + "step": 8041 + }, + { + "epoch": 1.0230250604248823, + "ewc_loss": 0.05462009832262993, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023614238307345659, + "grad_norm": 6.392508506774902, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.83571457862854, + "num_tokens": 306754152.0, + "step": 8042 + }, + { + "epoch": 1.0231522707034728, + "ewc_loss": 0.05462999641895294, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023624136520083994, + "grad_norm": 6.352016925811768, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.85528564453125, + "num_tokens": 306794887.0, + "step": 8043 + }, + { + "epoch": 1.0232794809820633, + "ewc_loss": 0.05463230982422829, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023626450274605304, + "grad_norm": 6.615212917327881, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8652915954589844, + "num_tokens": 306828374.0, + "step": 8044 + }, + { + "epoch": 1.0234066912606539, + "ewc_loss": 0.054446905851364136, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002344104868825525, + "grad_norm": 6.243854522705078, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8656142354011536, + "num_tokens": 306865285.0, + "step": 8045 + }, + { + "epoch": 1.0235339015392444, + "ewc_loss": 0.05464641749858856, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023640559811610729, + "grad_norm": 6.412851810455322, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.867893397808075, + "num_tokens": 306897195.0, + "step": 8046 + }, + { + "epoch": 1.023661111817835, + "ewc_loss": 0.05451527237892151, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002350941504118964, + "grad_norm": 6.321382999420166, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.861355185508728, + "num_tokens": 306935352.0, + "step": 8047 + }, + { + "epoch": 1.0237883220964255, + "ewc_loss": 0.05456419289112091, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002355833275942132, + "grad_norm": 6.428960800170898, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8680063486099243, + "num_tokens": 306964621.0, + "step": 8048 + }, + { + "epoch": 1.023915532375016, + "ewc_loss": 0.05453367903828621, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023527818848378956, + "grad_norm": 6.306870937347412, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.861400842666626, + "num_tokens": 307003451.0, + "step": 8049 + }, + { + "epoch": 1.0240427426536065, + "ewc_loss": 0.05461142212152481, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023605561000294983, + "grad_norm": 6.374331474304199, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8632373809814453, + "num_tokens": 307044695.0, + "step": 8050 + }, + { + "epoch": 1.0241699529321968, + "ewc_loss": 0.05454941466450691, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023543555289506912, + "grad_norm": 6.303737640380859, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.868954062461853, + "num_tokens": 307080603.0, + "step": 8051 + }, + { + "epoch": 1.0242971632107873, + "ewc_loss": 0.054580219089984894, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002357435878366232, + "grad_norm": 6.353048324584961, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8551826477050781, + "num_tokens": 307122300.0, + "step": 8052 + }, + { + "epoch": 1.0244243734893779, + "ewc_loss": 0.05455431342124939, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023548456374555826, + "grad_norm": 6.359022617340088, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.869098961353302, + "num_tokens": 307158685.0, + "step": 8053 + }, + { + "epoch": 1.0245515837679684, + "ewc_loss": 0.05453024059534073, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002352437877561897, + "grad_norm": 6.283926010131836, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8675433397293091, + "num_tokens": 307199104.0, + "step": 8054 + }, + { + "epoch": 1.024678794046559, + "ewc_loss": 0.054628461599349976, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023622602748218924, + "grad_norm": 6.380959987640381, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8555122017860413, + "num_tokens": 307235840.0, + "step": 8055 + }, + { + "epoch": 1.0248060043251495, + "ewc_loss": 0.05457044392824173, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023564585717394948, + "grad_norm": 6.372326374053955, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8510861396789551, + "num_tokens": 307273089.0, + "step": 8056 + }, + { + "epoch": 1.02493321460374, + "ewc_loss": 0.054545968770980835, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023540110851172358, + "grad_norm": 6.3214545249938965, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8631081581115723, + "num_tokens": 307318825.0, + "step": 8057 + }, + { + "epoch": 1.0250604248823305, + "ewc_loss": 0.054574012756347656, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002356815239181742, + "grad_norm": 6.353217601776123, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8688535094261169, + "num_tokens": 307357006.0, + "step": 8058 + }, + { + "epoch": 1.025187635160921, + "ewc_loss": 0.054518282413482666, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023512422922067344, + "grad_norm": 6.356326580047607, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8591732978820801, + "num_tokens": 307390274.0, + "step": 8059 + }, + { + "epoch": 1.0253148454395116, + "ewc_loss": 0.054560914635658264, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023555054212920368, + "grad_norm": 6.334916591644287, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.871105432510376, + "num_tokens": 307427157.0, + "step": 8060 + }, + { + "epoch": 1.025442055718102, + "ewc_loss": 0.05456027388572693, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023554415383841842, + "grad_norm": 6.381258964538574, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8578088283538818, + "num_tokens": 307466555.0, + "step": 8061 + }, + { + "epoch": 1.0255692659966926, + "ewc_loss": 0.054527658969163895, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023521800176240504, + "grad_norm": 6.356296062469482, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8544188141822815, + "num_tokens": 307503375.0, + "step": 8062 + }, + { + "epoch": 1.0256964762752832, + "ewc_loss": 0.0546637661755085, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023657907149754465, + "grad_norm": 6.449594497680664, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8717273473739624, + "num_tokens": 307543170.0, + "step": 8063 + }, + { + "epoch": 1.0258236865538735, + "ewc_loss": 0.054474323987960815, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002346846304135397, + "grad_norm": 6.342870712280273, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8587403297424316, + "num_tokens": 307582890.0, + "step": 8064 + }, + { + "epoch": 1.025950896832464, + "ewc_loss": 0.05460105836391449, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023595198581460863, + "grad_norm": 6.446962833404541, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8545675277709961, + "num_tokens": 307617643.0, + "step": 8065 + }, + { + "epoch": 1.0260781071110545, + "ewc_loss": 0.05447850376367569, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002347264380659908, + "grad_norm": 6.357452392578125, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8603683710098267, + "num_tokens": 307652258.0, + "step": 8066 + }, + { + "epoch": 1.026205317389645, + "ewc_loss": 0.0545383021235466, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002353244344703853, + "grad_norm": 6.360730171203613, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8579460382461548, + "num_tokens": 307693245.0, + "step": 8067 + }, + { + "epoch": 1.0263325276682356, + "ewc_loss": 0.05446052551269531, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023454666370525956, + "grad_norm": 6.393678188323975, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8643744587898254, + "num_tokens": 307728751.0, + "step": 8068 + }, + { + "epoch": 1.026459737946826, + "ewc_loss": 0.05448984354734421, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002348398556932807, + "grad_norm": 6.359259128570557, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8513325452804565, + "num_tokens": 307767869.0, + "step": 8069 + }, + { + "epoch": 1.0265869482254166, + "ewc_loss": 0.054533012211322784, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023527155281044543, + "grad_norm": 6.33419132232666, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8717386722564697, + "num_tokens": 307809438.0, + "step": 8070 + }, + { + "epoch": 1.0267141585040072, + "ewc_loss": 0.054783694446086884, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023533693456556648, + "grad_norm": 6.446045398712158, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8420339822769165, + "num_tokens": 307848752.0, + "step": 8071 + }, + { + "epoch": 1.0268413687825977, + "ewc_loss": 0.05452807992696762, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023522220726590604, + "grad_norm": 6.363258361816406, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8565829992294312, + "num_tokens": 307886080.0, + "step": 8072 + }, + { + "epoch": 1.0269685790611882, + "ewc_loss": 0.054551273584365845, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023545413569081575, + "grad_norm": 6.386910915374756, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8750828504562378, + "num_tokens": 307922854.0, + "step": 8073 + }, + { + "epoch": 1.0270957893397787, + "ewc_loss": 0.05455484241247177, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002354898169869557, + "grad_norm": 6.347339153289795, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8595539927482605, + "num_tokens": 307960032.0, + "step": 8074 + }, + { + "epoch": 1.0272229996183693, + "ewc_loss": 0.05458511412143707, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023579254047945142, + "grad_norm": 6.422684669494629, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8611794710159302, + "num_tokens": 307996074.0, + "step": 8075 + }, + { + "epoch": 1.0273502098969596, + "ewc_loss": 0.05455990135669708, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002355403994442895, + "grad_norm": 6.3724799156188965, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8758552074432373, + "num_tokens": 308030833.0, + "step": 8076 + }, + { + "epoch": 1.02747742017555, + "ewc_loss": 0.05461088567972183, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023605026945006102, + "grad_norm": 6.348435401916504, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.871795117855072, + "num_tokens": 308079620.0, + "step": 8077 + }, + { + "epoch": 1.0276046304541406, + "ewc_loss": 0.054573506116867065, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002356764452997595, + "grad_norm": 6.401138782501221, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.855133593082428, + "num_tokens": 308118555.0, + "step": 8078 + }, + { + "epoch": 1.0277318407327312, + "ewc_loss": 0.054591864347457886, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023586006136611104, + "grad_norm": 6.40825891494751, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8461894989013672, + "num_tokens": 308154410.0, + "step": 8079 + }, + { + "epoch": 1.0278590510113217, + "ewc_loss": 0.05461796373128891, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002361210499657318, + "grad_norm": 6.427894115447998, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8635381460189819, + "num_tokens": 308191990.0, + "step": 8080 + }, + { + "epoch": 1.0279862612899122, + "ewc_loss": 0.05486918240785599, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002361918450333178, + "grad_norm": 6.369485855102539, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8431028127670288, + "num_tokens": 308231047.0, + "step": 8081 + }, + { + "epoch": 1.0281134715685027, + "ewc_loss": 0.05469711869955063, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023691257229074836, + "grad_norm": 6.375057220458984, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8774383664131165, + "num_tokens": 308269487.0, + "step": 8082 + }, + { + "epoch": 1.0282406818470933, + "ewc_loss": 0.05464715510606766, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023641293228138238, + "grad_norm": 6.387653350830078, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.861754298210144, + "num_tokens": 308304974.0, + "step": 8083 + }, + { + "epoch": 1.0283678921256838, + "ewc_loss": 0.05471532791852951, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023709467495791614, + "grad_norm": 6.409458637237549, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8830381631851196, + "num_tokens": 308342072.0, + "step": 8084 + }, + { + "epoch": 1.0284951024042743, + "ewc_loss": 0.054633818566799164, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023627959308214486, + "grad_norm": 6.3780741691589355, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8591642379760742, + "num_tokens": 308374293.0, + "step": 8085 + }, + { + "epoch": 1.0286223126828649, + "ewc_loss": 0.05470482259988785, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002369896392337978, + "grad_norm": 6.34368896484375, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8662019968032837, + "num_tokens": 308415872.0, + "step": 8086 + }, + { + "epoch": 1.0287495229614554, + "ewc_loss": 0.05466822534799576, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002366236731177196, + "grad_norm": 6.379537582397461, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8561372756958008, + "num_tokens": 308451198.0, + "step": 8087 + }, + { + "epoch": 1.0288767332400457, + "ewc_loss": 0.0546858012676239, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023679941659793258, + "grad_norm": 6.347451210021973, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.870222806930542, + "num_tokens": 308490554.0, + "step": 8088 + }, + { + "epoch": 1.0290039435186362, + "ewc_loss": 0.054736919701099396, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023731061082798988, + "grad_norm": 6.341700553894043, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8567282557487488, + "num_tokens": 308530221.0, + "step": 8089 + }, + { + "epoch": 1.0291311537972267, + "ewc_loss": 0.0547175370156765, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023711677931714803, + "grad_norm": 6.4183197021484375, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.846502959728241, + "num_tokens": 308563634.0, + "step": 8090 + }, + { + "epoch": 1.0292583640758173, + "ewc_loss": 0.05492974445223808, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023679743753746152, + "grad_norm": 6.468164920806885, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8614799380302429, + "num_tokens": 308605033.0, + "step": 8091 + }, + { + "epoch": 1.0293855743544078, + "ewc_loss": 0.054701946675777435, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023696085554547608, + "grad_norm": 6.360142230987549, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8633131980895996, + "num_tokens": 308642603.0, + "step": 8092 + }, + { + "epoch": 1.0295127846329983, + "ewc_loss": 0.054695531725883484, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002368966961512342, + "grad_norm": 6.307191848754883, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8598001003265381, + "num_tokens": 308685189.0, + "step": 8093 + }, + { + "epoch": 1.0296399949115889, + "ewc_loss": 0.05471406877040863, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002370820875512436, + "grad_norm": 6.371752738952637, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8624953031539917, + "num_tokens": 308726052.0, + "step": 8094 + }, + { + "epoch": 1.0297672051901794, + "ewc_loss": 0.05478018522262573, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023774325381964445, + "grad_norm": 6.386165142059326, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8485363125801086, + "num_tokens": 308764828.0, + "step": 8095 + }, + { + "epoch": 1.02989441546877, + "ewc_loss": 0.05470535159111023, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023699493613094091, + "grad_norm": 6.370089054107666, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8726317882537842, + "num_tokens": 308799712.0, + "step": 8096 + }, + { + "epoch": 1.0300216257473604, + "ewc_loss": 0.05476023629307747, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000237543776165694, + "grad_norm": 6.35405158996582, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8621034622192383, + "num_tokens": 308839879.0, + "step": 8097 + }, + { + "epoch": 1.030148836025951, + "ewc_loss": 0.05480474233627319, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002379888464929536, + "grad_norm": 6.452480792999268, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8543144464492798, + "num_tokens": 308880362.0, + "step": 8098 + }, + { + "epoch": 1.0302760463045415, + "ewc_loss": 0.05466502904891968, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002365917171118781, + "grad_norm": 6.34076452255249, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8715620040893555, + "num_tokens": 308921395.0, + "step": 8099 + }, + { + "epoch": 1.0304032565831318, + "ewc_loss": 0.05477021634578705, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023764354409649968, + "grad_norm": 6.521238803863525, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8704676628112793, + "num_tokens": 308959976.0, + "step": 8100 + }, + { + "epoch": 1.0305304668617223, + "ewc_loss": 0.05457072705030441, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023564868024550378, + "grad_norm": 6.373917579650879, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8742611408233643, + "num_tokens": 309000702.0, + "step": 8101 + }, + { + "epoch": 1.0306576771403129, + "ewc_loss": 0.05471349507570267, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023707635409664363, + "grad_norm": 6.350543975830078, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8813943862915039, + "num_tokens": 309036540.0, + "step": 8102 + }, + { + "epoch": 1.0307848874189034, + "ewc_loss": 0.05465491861104965, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002364906104048714, + "grad_norm": 6.382751941680908, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8601180911064148, + "num_tokens": 309072408.0, + "step": 8103 + }, + { + "epoch": 1.030912097697494, + "ewc_loss": 0.054605357348918915, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023599498672410846, + "grad_norm": 6.412233352661133, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8478085994720459, + "num_tokens": 309110366.0, + "step": 8104 + }, + { + "epoch": 1.0310393079760845, + "ewc_loss": 0.05464264750480652, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023636790865566581, + "grad_norm": 6.371982097625732, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8649479150772095, + "num_tokens": 309155194.0, + "step": 8105 + }, + { + "epoch": 1.031166518254675, + "ewc_loss": 0.05457273870706558, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023566879099234939, + "grad_norm": 6.3676652908325195, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8714038133621216, + "num_tokens": 309193747.0, + "step": 8106 + }, + { + "epoch": 1.0312937285332655, + "ewc_loss": 0.05465368553996086, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023647825582884252, + "grad_norm": 6.462477207183838, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8511104583740234, + "num_tokens": 309234161.0, + "step": 8107 + }, + { + "epoch": 1.031420938811856, + "ewc_loss": 0.05444790795445442, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002344204840483144, + "grad_norm": 6.31652307510376, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8511717319488525, + "num_tokens": 309272574.0, + "step": 8108 + }, + { + "epoch": 1.0315481490904466, + "ewc_loss": 0.0546945258975029, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002368866844335571, + "grad_norm": 6.483138084411621, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8652083277702332, + "num_tokens": 309306567.0, + "step": 8109 + }, + { + "epoch": 1.031675359369037, + "ewc_loss": 0.054528430104255676, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002352257288293913, + "grad_norm": 6.3119964599609375, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8682218790054321, + "num_tokens": 309347218.0, + "step": 8110 + }, + { + "epoch": 1.0318025696476276, + "ewc_loss": 0.054636597633361816, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023630738724023104, + "grad_norm": 6.3668670654296875, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8763279318809509, + "num_tokens": 309385701.0, + "step": 8111 + }, + { + "epoch": 1.0319297799262181, + "ewc_loss": 0.054600633680820465, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023594773665536195, + "grad_norm": 6.339911460876465, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8606513738632202, + "num_tokens": 309424600.0, + "step": 8112 + }, + { + "epoch": 1.0320569902048085, + "ewc_loss": 0.054651014506816864, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000236451523960568, + "grad_norm": 6.437591552734375, + "learning_rate": 1e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8416016101837158, + "num_tokens": 309459929.0, + "step": 8113 + }, + { + "epoch": 1.032184200483399, + "ewc_loss": 0.05464492738246918, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023639065329916775, + "grad_norm": 6.324421405792236, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8532766699790955, + "num_tokens": 309505085.0, + "step": 8114 + }, + { + "epoch": 1.0323114107619895, + "ewc_loss": 0.05471447482705116, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023708614753559232, + "grad_norm": 6.383511543273926, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8652461171150208, + "num_tokens": 309544746.0, + "step": 8115 + }, + { + "epoch": 1.03243862104058, + "ewc_loss": 0.05471963435411453, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023713776317890733, + "grad_norm": 6.347146034240723, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8490803837776184, + "num_tokens": 309584301.0, + "step": 8116 + }, + { + "epoch": 1.0325658313191706, + "ewc_loss": 0.05472073704004288, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023714874987490475, + "grad_norm": 6.363673210144043, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8646782636642456, + "num_tokens": 309622570.0, + "step": 8117 + }, + { + "epoch": 1.032693041597761, + "ewc_loss": 0.05482487753033638, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023819018679205328, + "grad_norm": 6.371024131774902, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8464629650115967, + "num_tokens": 309666417.0, + "step": 8118 + }, + { + "epoch": 1.0328202518763516, + "ewc_loss": 0.05495668575167656, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002370668516959995, + "grad_norm": 6.483285903930664, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8518047332763672, + "num_tokens": 309705706.0, + "step": 8119 + }, + { + "epoch": 1.0329474621549422, + "ewc_loss": 0.054957494139671326, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023707492800895125, + "grad_norm": 6.50444221496582, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8703275322914124, + "num_tokens": 309744288.0, + "step": 8120 + }, + { + "epoch": 1.0330746724335327, + "ewc_loss": 0.05488365888595581, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023633657838217914, + "grad_norm": 6.343021869659424, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8497769236564636, + "num_tokens": 309780406.0, + "step": 8121 + }, + { + "epoch": 1.0332018827121232, + "ewc_loss": 0.05494469404220581, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002369469148106873, + "grad_norm": 6.467091083526611, + "learning_rate": 1e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8307124972343445, + "num_tokens": 309811135.0, + "step": 8122 + }, + { + "epoch": 1.0333290929907137, + "ewc_loss": 0.054634712636470795, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023628851340617985, + "grad_norm": 6.360851287841797, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.865841269493103, + "num_tokens": 309847037.0, + "step": 8123 + }, + { + "epoch": 1.0334563032693043, + "ewc_loss": 0.05479584261775017, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023789983242750168, + "grad_norm": 6.37709379196167, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8625481128692627, + "num_tokens": 309884274.0, + "step": 8124 + }, + { + "epoch": 1.0335835135478946, + "ewc_loss": 0.054934658110141754, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023684656480327249, + "grad_norm": 6.388664722442627, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8747291564941406, + "num_tokens": 309915572.0, + "step": 8125 + }, + { + "epoch": 1.033710723826485, + "ewc_loss": 0.05496295541524887, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023712957045063376, + "grad_norm": 6.2917704582214355, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8669671416282654, + "num_tokens": 309954502.0, + "step": 8126 + }, + { + "epoch": 1.0338379341050756, + "ewc_loss": 0.055072370916604996, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023822371440473944, + "grad_norm": 6.374004364013672, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8835773468017578, + "num_tokens": 309993015.0, + "step": 8127 + }, + { + "epoch": 1.0339651443836662, + "ewc_loss": 0.05478714033961296, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023781281197443604, + "grad_norm": 6.372000694274902, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.85189288854599, + "num_tokens": 310030840.0, + "step": 8128 + }, + { + "epoch": 1.0340923546622567, + "ewc_loss": 0.05511820316314697, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023868202697485685, + "grad_norm": 6.339437484741211, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8625192642211914, + "num_tokens": 310067586.0, + "step": 8129 + }, + { + "epoch": 1.0342195649408472, + "ewc_loss": 0.055103473365306854, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023853473248891532, + "grad_norm": 6.3572869300842285, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8710659742355347, + "num_tokens": 310104496.0, + "step": 8130 + }, + { + "epoch": 1.0343467752194377, + "ewc_loss": 0.054884299635887146, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00023878438514657319, + "grad_norm": 6.332286357879639, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8447223901748657, + "num_tokens": 310141162.0, + "step": 8131 + }, + { + "epoch": 1.0344739854980283, + "ewc_loss": 0.0552082285284996, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023958231031429023, + "grad_norm": 6.400866985321045, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8604245185852051, + "num_tokens": 310182965.0, + "step": 8132 + }, + { + "epoch": 1.0346011957766188, + "ewc_loss": 0.05512886494398117, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023878863430581987, + "grad_norm": 6.355432033538818, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8447263240814209, + "num_tokens": 310219572.0, + "step": 8133 + }, + { + "epoch": 1.0347284060552093, + "ewc_loss": 0.055238593369722366, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002398859360255301, + "grad_norm": 6.493755340576172, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8494207859039307, + "num_tokens": 310248711.0, + "step": 8134 + }, + { + "epoch": 1.0348556163337999, + "ewc_loss": 0.05511043220758438, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023860430519562215, + "grad_norm": 6.312237739562988, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8460996747016907, + "num_tokens": 310287915.0, + "step": 8135 + }, + { + "epoch": 1.0349828266123904, + "ewc_loss": 0.055441923439502716, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023947781301103532, + "grad_norm": 6.401412010192871, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8596183657646179, + "num_tokens": 310325619.0, + "step": 8136 + }, + { + "epoch": 1.0351100368909807, + "ewc_loss": 0.05512174218893051, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023871743178460747, + "grad_norm": 6.331746578216553, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8689543008804321, + "num_tokens": 310362456.0, + "step": 8137 + }, + { + "epoch": 1.0352372471695712, + "ewc_loss": 0.055136073380708694, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023886073904577643, + "grad_norm": 6.426846027374268, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8571442365646362, + "num_tokens": 310399584.0, + "step": 8138 + }, + { + "epoch": 1.0353644574481617, + "ewc_loss": 0.055315930396318436, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002382178936386481, + "grad_norm": 6.3683247566223145, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8557605743408203, + "num_tokens": 310437435.0, + "step": 8139 + }, + { + "epoch": 1.0354916677267523, + "ewc_loss": 0.05535636097192764, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023862221860326827, + "grad_norm": 6.348483562469482, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8678490519523621, + "num_tokens": 310476320.0, + "step": 8140 + }, + { + "epoch": 1.0356188780053428, + "ewc_loss": 0.055139437317848206, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023889436852186918, + "grad_norm": 6.355016231536865, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8767452239990234, + "num_tokens": 310512956.0, + "step": 8141 + }, + { + "epoch": 1.0357460882839333, + "ewc_loss": 0.055325947701931, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023831805447116494, + "grad_norm": 6.406161785125732, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8605939149856567, + "num_tokens": 310547605.0, + "step": 8142 + }, + { + "epoch": 1.0358732985625239, + "ewc_loss": 0.0554078072309494, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023913668701425195, + "grad_norm": 6.35244083404541, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8596215844154358, + "num_tokens": 310586991.0, + "step": 8143 + }, + { + "epoch": 1.0360005088411144, + "ewc_loss": 0.0553918182849884, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002389767614658922, + "grad_norm": 6.354959487915039, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8607606887817383, + "num_tokens": 310621271.0, + "step": 8144 + }, + { + "epoch": 1.036127719119705, + "ewc_loss": 0.05537810176610947, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002388395951129496, + "grad_norm": 6.338494777679443, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8646408319473267, + "num_tokens": 310662154.0, + "step": 8145 + }, + { + "epoch": 1.0362549293982954, + "ewc_loss": 0.05541170388460159, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023917561338748783, + "grad_norm": 6.407139301300049, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8711641430854797, + "num_tokens": 310701924.0, + "step": 8146 + }, + { + "epoch": 1.036382139676886, + "ewc_loss": 0.05541011691093445, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023915973724797368, + "grad_norm": 6.315146446228027, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8675435185432434, + "num_tokens": 310739688.0, + "step": 8147 + }, + { + "epoch": 1.0365093499554765, + "ewc_loss": 0.055440597236156464, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023946458532009274, + "grad_norm": 6.409167766571045, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8567107319831848, + "num_tokens": 310773317.0, + "step": 8148 + }, + { + "epoch": 1.0366365602340668, + "ewc_loss": 0.05534140020608902, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002384725958108902, + "grad_norm": 6.342621803283691, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8590966463088989, + "num_tokens": 310810854.0, + "step": 8149 + }, + { + "epoch": 1.0367637705126573, + "ewc_loss": 0.0554334819316864, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002393934119027108, + "grad_norm": 6.430513381958008, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.854566752910614, + "num_tokens": 310846080.0, + "step": 8150 + }, + { + "epoch": 1.0368909807912479, + "ewc_loss": 0.0554315522313118, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023937410151120275, + "grad_norm": 6.388256072998047, + "learning_rate": 1e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8387707471847534, + "num_tokens": 310885032.0, + "step": 8151 + }, + { + "epoch": 1.0370181910698384, + "ewc_loss": 0.05538177490234375, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023887633869890124, + "grad_norm": 6.374795436859131, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8672820329666138, + "num_tokens": 310924518.0, + "step": 8152 + }, + { + "epoch": 1.037145401348429, + "ewc_loss": 0.055431075394153595, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002393693575868383, + "grad_norm": 6.435086727142334, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8632234334945679, + "num_tokens": 310961734.0, + "step": 8153 + }, + { + "epoch": 1.0372726116270194, + "ewc_loss": 0.055365003645420074, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023870864242780954, + "grad_norm": 6.3507304191589355, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8647436499595642, + "num_tokens": 311004541.0, + "step": 8154 + }, + { + "epoch": 1.03739982190561, + "ewc_loss": 0.055407606065273285, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023913463519420475, + "grad_norm": 6.399961948394775, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8710881471633911, + "num_tokens": 311043947.0, + "step": 8155 + }, + { + "epoch": 1.0375270321842005, + "ewc_loss": 0.05531511828303337, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023820977366995066, + "grad_norm": 6.395669460296631, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.858081579208374, + "num_tokens": 311083154.0, + "step": 8156 + }, + { + "epoch": 1.037654242462791, + "ewc_loss": 0.0553668737411499, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023872731253504753, + "grad_norm": 6.418118953704834, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8737447261810303, + "num_tokens": 311121429.0, + "step": 8157 + }, + { + "epoch": 1.0377814527413816, + "ewc_loss": 0.055288806557655334, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023794664593879133, + "grad_norm": 6.321518898010254, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8456001877784729, + "num_tokens": 311166578.0, + "step": 8158 + }, + { + "epoch": 1.037908663019972, + "ewc_loss": 0.055333562195301056, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023839423374738544, + "grad_norm": 6.412055015563965, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8639870882034302, + "num_tokens": 311209645.0, + "step": 8159 + }, + { + "epoch": 1.0380358732985626, + "ewc_loss": 0.05527246743440628, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023778325703460723, + "grad_norm": 6.370999336242676, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8658460378646851, + "num_tokens": 311248428.0, + "step": 8160 + }, + { + "epoch": 1.0381630835771531, + "ewc_loss": 0.055306918919086456, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002381277590757236, + "grad_norm": 6.374526023864746, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8616356253623962, + "num_tokens": 311282040.0, + "step": 8161 + }, + { + "epoch": 1.0382902938557435, + "ewc_loss": 0.05533294752240181, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023838807828724384, + "grad_norm": 6.409576416015625, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8686308860778809, + "num_tokens": 311320233.0, + "step": 8162 + }, + { + "epoch": 1.038417504134334, + "ewc_loss": 0.055263951420784, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002376980846747756, + "grad_norm": 6.3231329917907715, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8472926616668701, + "num_tokens": 311357586.0, + "step": 8163 + }, + { + "epoch": 1.0385447144129245, + "ewc_loss": 0.05539241433143616, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023898272775113583, + "grad_norm": 6.425643444061279, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8607807755470276, + "num_tokens": 311395545.0, + "step": 8164 + }, + { + "epoch": 1.038671924691515, + "ewc_loss": 0.05525509640574455, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023760955082252622, + "grad_norm": 6.414604663848877, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8619471192359924, + "num_tokens": 311428785.0, + "step": 8165 + }, + { + "epoch": 1.0387991349701056, + "ewc_loss": 0.0554024800658226, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023908338334877044, + "grad_norm": 6.376753807067871, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8498737215995789, + "num_tokens": 311469546.0, + "step": 8166 + }, + { + "epoch": 1.038926345248696, + "ewc_loss": 0.05529308319091797, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002379894140176475, + "grad_norm": 6.3930583000183105, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8536359071731567, + "num_tokens": 311504064.0, + "step": 8167 + }, + { + "epoch": 1.0390535555272866, + "ewc_loss": 0.05535878986120224, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023864649119786918, + "grad_norm": 6.446012496948242, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8443533182144165, + "num_tokens": 311536600.0, + "step": 8168 + }, + { + "epoch": 1.0391807658058771, + "ewc_loss": 0.0553554967045784, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002386135602137074, + "grad_norm": 6.31470251083374, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8782927989959717, + "num_tokens": 311573638.0, + "step": 8169 + }, + { + "epoch": 1.0393079760844677, + "ewc_loss": 0.05533698946237564, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023842850350774825, + "grad_norm": 6.382561683654785, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8541370034217834, + "num_tokens": 311611085.0, + "step": 8170 + }, + { + "epoch": 1.0394351863630582, + "ewc_loss": 0.05538511276245117, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023890970624051988, + "grad_norm": 6.381600856781006, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8548840284347534, + "num_tokens": 311655497.0, + "step": 8171 + }, + { + "epoch": 1.0395623966416487, + "ewc_loss": 0.055402614176273346, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023908472212497145, + "grad_norm": 6.447478771209717, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8470373153686523, + "num_tokens": 311688167.0, + "step": 8172 + }, + { + "epoch": 1.0396896069202393, + "ewc_loss": 0.055325526744127274, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023831386351957917, + "grad_norm": 6.353672027587891, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8611577749252319, + "num_tokens": 311724926.0, + "step": 8173 + }, + { + "epoch": 1.0398168171988296, + "ewc_loss": 0.05541855841875076, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023924419656395912, + "grad_norm": 6.460712432861328, + "learning_rate": 1e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8434892892837524, + "num_tokens": 311760729.0, + "step": 8174 + }, + { + "epoch": 1.03994402747742, + "ewc_loss": 0.05525413155555725, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023759993200656027, + "grad_norm": 6.317322731018066, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8620176315307617, + "num_tokens": 311802138.0, + "step": 8175 + }, + { + "epoch": 1.0400712377560106, + "ewc_loss": 0.05545230209827423, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023958161182235926, + "grad_norm": 6.413639545440674, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8683841228485107, + "num_tokens": 311838510.0, + "step": 8176 + }, + { + "epoch": 1.0401984480346012, + "ewc_loss": 0.055266011506319046, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002377187047386542, + "grad_norm": 6.410709857940674, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8544161915779114, + "num_tokens": 311870875.0, + "step": 8177 + }, + { + "epoch": 1.0403256583131917, + "ewc_loss": 0.05537226051092148, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002387812128290534, + "grad_norm": 6.40924596786499, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8748505711555481, + "num_tokens": 311905854.0, + "step": 8178 + }, + { + "epoch": 1.0404528685917822, + "ewc_loss": 0.05537406727671623, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023879925720393658, + "grad_norm": 6.526486873626709, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8588192462921143, + "num_tokens": 311936381.0, + "step": 8179 + }, + { + "epoch": 1.0405800788703727, + "ewc_loss": 0.0552578866481781, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000237637446844019, + "grad_norm": 6.357143878936768, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.855323314666748, + "num_tokens": 311972529.0, + "step": 8180 + }, + { + "epoch": 1.0407072891489633, + "ewc_loss": 0.05535050481557846, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023856363259255886, + "grad_norm": 6.39611291885376, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8794175386428833, + "num_tokens": 312007883.0, + "step": 8181 + }, + { + "epoch": 1.0408344994275538, + "ewc_loss": 0.055296286940574646, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023802144278306514, + "grad_norm": 6.344986438751221, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8567416667938232, + "num_tokens": 312041000.0, + "step": 8182 + }, + { + "epoch": 1.0409617097061443, + "ewc_loss": 0.055339373648166656, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023845235409680754, + "grad_norm": 6.3622636795043945, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8641330003738403, + "num_tokens": 312076011.0, + "step": 8183 + }, + { + "epoch": 1.0410889199847349, + "ewc_loss": 0.05532124638557434, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023827108088880777, + "grad_norm": 6.334415912628174, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8552739024162292, + "num_tokens": 312115614.0, + "step": 8184 + }, + { + "epoch": 1.0412161302633254, + "ewc_loss": 0.05536828562617302, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002387414569966495, + "grad_norm": 6.334553241729736, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8554331064224243, + "num_tokens": 312154111.0, + "step": 8185 + }, + { + "epoch": 1.0413433405419157, + "ewc_loss": 0.055385708808898926, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002389156725257635, + "grad_norm": 6.368728160858154, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8558390736579895, + "num_tokens": 312191571.0, + "step": 8186 + }, + { + "epoch": 1.0414705508205062, + "ewc_loss": 0.05537061393260956, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023876475461293012, + "grad_norm": 6.359828472137451, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8550665378570557, + "num_tokens": 312229205.0, + "step": 8187 + }, + { + "epoch": 1.0415977610990967, + "ewc_loss": 0.055396802723407745, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023902661632746458, + "grad_norm": 6.367356777191162, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8762126564979553, + "num_tokens": 312266604.0, + "step": 8188 + }, + { + "epoch": 1.0417249713776873, + "ewc_loss": 0.05536356568336487, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023869423603173345, + "grad_norm": 6.399948596954346, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8493364453315735, + "num_tokens": 312305222.0, + "step": 8189 + }, + { + "epoch": 1.0418521816562778, + "ewc_loss": 0.05533553659915924, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023841393704060465, + "grad_norm": 6.342507839202881, + "learning_rate": 1e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8385394811630249, + "num_tokens": 312345823.0, + "step": 8190 + }, + { + "epoch": 1.0419793919348683, + "ewc_loss": 0.05537291616201401, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002387877320870757, + "grad_norm": 6.426599979400635, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8515512943267822, + "num_tokens": 312383569.0, + "step": 8191 + }, + { + "epoch": 1.0421066022134589, + "ewc_loss": 0.055246755480766296, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023752616834826767, + "grad_norm": 6.329453945159912, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8640262484550476, + "num_tokens": 312423068.0, + "step": 8192 + }, + { + "epoch": 1.0422338124920494, + "ewc_loss": 0.05538049340248108, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023886353301350027, + "grad_norm": 6.408761978149414, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8560597896575928, + "num_tokens": 312460569.0, + "step": 8193 + }, + { + "epoch": 1.04236102277064, + "ewc_loss": 0.05534903332591057, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002385489351581782, + "grad_norm": 6.391566753387451, + "learning_rate": 1e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8400218486785889, + "num_tokens": 312501654.0, + "step": 8194 + }, + { + "epoch": 1.0424882330492304, + "ewc_loss": 0.055289436131715775, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023795294691808522, + "grad_norm": 6.557682514190674, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8526246547698975, + "num_tokens": 312543630.0, + "step": 8195 + }, + { + "epoch": 1.042615443327821, + "ewc_loss": 0.05525767803192139, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023763538047205657, + "grad_norm": 6.367198467254639, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8581995964050293, + "num_tokens": 312579428.0, + "step": 8196 + }, + { + "epoch": 1.0427426536064115, + "ewc_loss": 0.05518730729818344, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023693164985161275, + "grad_norm": 6.322869777679443, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8742821216583252, + "num_tokens": 312621388.0, + "step": 8197 + }, + { + "epoch": 1.0428698638850018, + "ewc_loss": 0.05529762804508209, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023803487420082092, + "grad_norm": 6.357511043548584, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8613513708114624, + "num_tokens": 312665303.0, + "step": 8198 + }, + { + "epoch": 1.0429970741635923, + "ewc_loss": 0.0552874356508255, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002379329380346462, + "grad_norm": 6.446445941925049, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8515593409538269, + "num_tokens": 312702354.0, + "step": 8199 + }, + { + "epoch": 1.0431242844421829, + "ewc_loss": 0.055302295833826065, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023808155674487352, + "grad_norm": 6.377472877502441, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8654166460037231, + "num_tokens": 312737223.0, + "step": 8200 + }, + { + "epoch": 1.0432514947207734, + "ewc_loss": 0.055297501385211945, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023803359363228083, + "grad_norm": 6.4104719161987305, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8620061874389648, + "num_tokens": 312773353.0, + "step": 8201 + }, + { + "epoch": 1.043378704999364, + "ewc_loss": 0.05524156242609024, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002374742180109024, + "grad_norm": 6.415666103363037, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8425230979919434, + "num_tokens": 312809230.0, + "step": 8202 + }, + { + "epoch": 1.0435059152779544, + "ewc_loss": 0.05502959340810776, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023779593175277114, + "grad_norm": 6.38171911239624, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.878311276435852, + "num_tokens": 312841278.0, + "step": 8203 + }, + { + "epoch": 1.043633125556545, + "ewc_loss": 0.055272020399570465, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002377787750447169, + "grad_norm": 6.405098915100098, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8657580614089966, + "num_tokens": 312888469.0, + "step": 8204 + }, + { + "epoch": 1.0437603358351355, + "ewc_loss": 0.05504410341382027, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023794102889951319, + "grad_norm": 6.364029884338379, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8681210875511169, + "num_tokens": 312925446.0, + "step": 8205 + }, + { + "epoch": 1.043887546113726, + "ewc_loss": 0.05532987415790558, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023835731553845108, + "grad_norm": 6.443751811981201, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8590518832206726, + "num_tokens": 312965425.0, + "step": 8206 + }, + { + "epoch": 1.0440147563923166, + "ewc_loss": 0.05529112368822098, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023796985624358058, + "grad_norm": 6.459583759307861, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8546831607818604, + "num_tokens": 313000637.0, + "step": 8207 + }, + { + "epoch": 1.044141966670907, + "ewc_loss": 0.0552993081510067, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023805166711099446, + "grad_norm": 6.424625396728516, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8616663813591003, + "num_tokens": 313035986.0, + "step": 8208 + }, + { + "epoch": 1.0442691769494976, + "ewc_loss": 0.05527958273887634, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023785443045198917, + "grad_norm": 6.361041069030762, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8494744896888733, + "num_tokens": 313077027.0, + "step": 8209 + }, + { + "epoch": 1.0443963872280881, + "ewc_loss": 0.05538202077150345, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023887879797257483, + "grad_norm": 6.420795440673828, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8712718486785889, + "num_tokens": 313121733.0, + "step": 8210 + }, + { + "epoch": 1.0445235975066784, + "ewc_loss": 0.055365703999996185, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023871564189903438, + "grad_norm": 6.398227214813232, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8547185063362122, + "num_tokens": 313165542.0, + "step": 8211 + }, + { + "epoch": 1.044650807785269, + "ewc_loss": 0.05531659722328186, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023822455841582268, + "grad_norm": 6.463597774505615, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8491065502166748, + "num_tokens": 313201082.0, + "step": 8212 + }, + { + "epoch": 1.0447780180638595, + "ewc_loss": 0.05534890294075012, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023854764003772289, + "grad_norm": 6.450921535491943, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8710564970970154, + "num_tokens": 313241765.0, + "step": 8213 + }, + { + "epoch": 1.04490522834245, + "ewc_loss": 0.055329546332359314, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023835407046135515, + "grad_norm": 6.381239414215088, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8588913679122925, + "num_tokens": 313282471.0, + "step": 8214 + }, + { + "epoch": 1.0450324386210406, + "ewc_loss": 0.05541066452860832, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023916523787193, + "grad_norm": 6.373648166656494, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8563451766967773, + "num_tokens": 313331505.0, + "step": 8215 + }, + { + "epoch": 1.045159648899631, + "ewc_loss": 0.05533870309591293, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023844564566388726, + "grad_norm": 6.3724684715271, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8611105680465698, + "num_tokens": 313369823.0, + "step": 8216 + }, + { + "epoch": 1.0452868591782216, + "ewc_loss": 0.05544193089008331, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023947788577061146, + "grad_norm": 6.450937271118164, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8612509965896606, + "num_tokens": 313404255.0, + "step": 8217 + }, + { + "epoch": 1.0454140694568121, + "ewc_loss": 0.05540184676647186, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023907703871373087, + "grad_norm": 6.4070563316345215, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8723576068878174, + "num_tokens": 313439368.0, + "step": 8218 + }, + { + "epoch": 1.0455412797354027, + "ewc_loss": 0.05540001019835472, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002390586887486279, + "grad_norm": 6.418643951416016, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8530130386352539, + "num_tokens": 313479475.0, + "step": 8219 + }, + { + "epoch": 1.0456684900139932, + "ewc_loss": 0.0554375946521759, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002394345501670614, + "grad_norm": 6.450133323669434, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8512625694274902, + "num_tokens": 313514604.0, + "step": 8220 + }, + { + "epoch": 1.0457957002925837, + "ewc_loss": 0.05540753901004791, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023913400946184993, + "grad_norm": 6.433411598205566, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8570491075515747, + "num_tokens": 313548907.0, + "step": 8221 + }, + { + "epoch": 1.0459229105711743, + "ewc_loss": 0.055428147315979004, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023934005002956837, + "grad_norm": 6.44766902923584, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8592607975006104, + "num_tokens": 313586615.0, + "step": 8222 + }, + { + "epoch": 1.0460501208497646, + "ewc_loss": 0.05536063760519028, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023866494302637875, + "grad_norm": 6.409107685089111, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8507565259933472, + "num_tokens": 313625202.0, + "step": 8223 + }, + { + "epoch": 1.046177331128355, + "ewc_loss": 0.055392295122146606, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002389815344940871, + "grad_norm": 6.424271583557129, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8551111221313477, + "num_tokens": 313663257.0, + "step": 8224 + }, + { + "epoch": 1.0463045414069456, + "ewc_loss": 0.05535127967596054, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023857137421146035, + "grad_norm": 6.399573802947998, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8656854033470154, + "num_tokens": 313701430.0, + "step": 8225 + }, + { + "epoch": 1.0464317516855361, + "ewc_loss": 0.05541326850652695, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023919128580018878, + "grad_norm": 6.413230895996094, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.871654748916626, + "num_tokens": 313741286.0, + "step": 8226 + }, + { + "epoch": 1.0465589619641267, + "ewc_loss": 0.055380817502737045, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023886676353868097, + "grad_norm": 6.432260990142822, + "learning_rate": 1e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.8455948829650879, + "num_tokens": 313779115.0, + "step": 8227 + }, + { + "epoch": 1.0466861722427172, + "ewc_loss": 0.05511903017759323, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023869032156653702, + "grad_norm": 6.466620445251465, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8654454946517944, + "num_tokens": 313822331.0, + "step": 8228 + }, + { + "epoch": 1.0468133825213077, + "ewc_loss": 0.05512870103120804, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002387870044913143, + "grad_norm": 6.359917640686035, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8552030920982361, + "num_tokens": 313863545.0, + "step": 8229 + }, + { + "epoch": 1.0469405927998983, + "ewc_loss": 0.055101290345191956, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023851289006415755, + "grad_norm": 6.444068908691406, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8802305459976196, + "num_tokens": 313894956.0, + "step": 8230 + }, + { + "epoch": 1.0470678030784888, + "ewc_loss": 0.055118903517723083, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023868905554991215, + "grad_norm": 6.433041572570801, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8699702024459839, + "num_tokens": 313927920.0, + "step": 8231 + }, + { + "epoch": 1.0471950133570793, + "ewc_loss": 0.05510317534208298, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023853173479437828, + "grad_norm": 6.4197773933410645, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8698623180389404, + "num_tokens": 313963230.0, + "step": 8232 + }, + { + "epoch": 1.0473222236356698, + "ewc_loss": 0.05511099845170975, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00023860996589064598, + "grad_norm": 6.4549431800842285, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8518919944763184, + "num_tokens": 313995951.0, + "step": 8233 + }, + { + "epoch": 1.0474494339142604, + "ewc_loss": 0.05535481497645378, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002386067499173805, + "grad_norm": 6.329561233520508, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.862144947052002, + "num_tokens": 314040928.0, + "step": 8234 + }, + { + "epoch": 1.0475766441928507, + "ewc_loss": 0.05545184016227722, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023957696976140141, + "grad_norm": 6.48399019241333, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8519468307495117, + "num_tokens": 314080329.0, + "step": 8235 + }, + { + "epoch": 1.0477038544714412, + "ewc_loss": 0.05530952662229538, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023815385065972805, + "grad_norm": 6.357616424560547, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8616417050361633, + "num_tokens": 314117107.0, + "step": 8236 + }, + { + "epoch": 1.0478310647500317, + "ewc_loss": 0.05550515651702881, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024011013738345355, + "grad_norm": 6.523373126983643, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.858074963092804, + "num_tokens": 314148077.0, + "step": 8237 + }, + { + "epoch": 1.0479582750286223, + "ewc_loss": 0.05533112958073616, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023836988839320838, + "grad_norm": 6.382967472076416, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8526593446731567, + "num_tokens": 314182064.0, + "step": 8238 + }, + { + "epoch": 1.0480854853072128, + "ewc_loss": 0.055408164858818054, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023914022312965244, + "grad_norm": 6.442590713500977, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8517485857009888, + "num_tokens": 314221667.0, + "step": 8239 + }, + { + "epoch": 1.0482126955858033, + "ewc_loss": 0.055411942303180695, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002391780144535005, + "grad_norm": 6.456089019775391, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8575020432472229, + "num_tokens": 314255225.0, + "step": 8240 + }, + { + "epoch": 1.0483399058643939, + "ewc_loss": 0.055441681295633316, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023947541194502264, + "grad_norm": 6.442480087280273, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8613879680633545, + "num_tokens": 314288011.0, + "step": 8241 + }, + { + "epoch": 1.0484671161429844, + "ewc_loss": 0.05534978583455086, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023855644394643605, + "grad_norm": 6.350466251373291, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8564741015434265, + "num_tokens": 314332461.0, + "step": 8242 + }, + { + "epoch": 1.048594326421575, + "ewc_loss": 0.05551403760910034, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024019894772209227, + "grad_norm": 6.424849987030029, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8555874824523926, + "num_tokens": 314367967.0, + "step": 8243 + }, + { + "epoch": 1.0487215367001654, + "ewc_loss": 0.055493853986263275, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023999714176170528, + "grad_norm": 6.36802339553833, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8694649338722229, + "num_tokens": 314404487.0, + "step": 8244 + }, + { + "epoch": 1.048848746978756, + "ewc_loss": 0.055526457726955414, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024032314831856638, + "grad_norm": 6.401771068572998, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8590378761291504, + "num_tokens": 314444731.0, + "step": 8245 + }, + { + "epoch": 1.0489759572573465, + "ewc_loss": 0.055549994111061096, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024055851099547, + "grad_norm": 6.416692733764648, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8621224164962769, + "num_tokens": 314479720.0, + "step": 8246 + }, + { + "epoch": 1.0491031675359368, + "ewc_loss": 0.05553630739450455, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024042167933657765, + "grad_norm": 6.412656784057617, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8776768445968628, + "num_tokens": 314515613.0, + "step": 8247 + }, + { + "epoch": 1.0492303778145273, + "ewc_loss": 0.05555511638522148, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002406097628409043, + "grad_norm": 6.415264129638672, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8587170243263245, + "num_tokens": 314556519.0, + "step": 8248 + }, + { + "epoch": 1.0493575880931179, + "ewc_loss": 0.0555521696805954, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024058028066065162, + "grad_norm": 6.464743614196777, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8717883825302124, + "num_tokens": 314590803.0, + "step": 8249 + }, + { + "epoch": 1.0494847983717084, + "ewc_loss": 0.05550014227628708, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024006002058740705, + "grad_norm": 6.438120365142822, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8616786003112793, + "num_tokens": 314631347.0, + "step": 8250 + }, + { + "epoch": 1.049612008650299, + "ewc_loss": 0.0555090568959713, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024014916562009603, + "grad_norm": 6.421310901641846, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8594995737075806, + "num_tokens": 314673739.0, + "step": 8251 + }, + { + "epoch": 1.0497392189288894, + "ewc_loss": 0.055441245436668396, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023947104637045413, + "grad_norm": 6.424391269683838, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8613263964653015, + "num_tokens": 314718498.0, + "step": 8252 + }, + { + "epoch": 1.04986642920748, + "ewc_loss": 0.05554904043674469, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024054900859482586, + "grad_norm": 6.529468536376953, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8671268224716187, + "num_tokens": 314758755.0, + "step": 8253 + }, + { + "epoch": 1.0499936394860705, + "ewc_loss": 0.05540064722299576, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023906509159132838, + "grad_norm": 6.396581649780273, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8651512861251831, + "num_tokens": 314793644.0, + "step": 8254 + }, + { + "epoch": 1.050120849764661, + "ewc_loss": 0.05553913116455078, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024044988094829023, + "grad_norm": 6.512432098388672, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8573817014694214, + "num_tokens": 314828045.0, + "step": 8255 + }, + { + "epoch": 1.0502480600432516, + "ewc_loss": 0.05537994205951691, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023885798873379827, + "grad_norm": 6.328839302062988, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8633720874786377, + "num_tokens": 314866088.0, + "step": 8256 + }, + { + "epoch": 1.050375270321842, + "ewc_loss": 0.055611077696084976, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024116937129292637, + "grad_norm": 6.497859477996826, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8524343371391296, + "num_tokens": 314904757.0, + "step": 8257 + }, + { + "epoch": 1.0505024806004326, + "ewc_loss": 0.05541412532329559, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023919987143017352, + "grad_norm": 6.445200443267822, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.852303147315979, + "num_tokens": 314941410.0, + "step": 8258 + }, + { + "epoch": 1.0506296908790231, + "ewc_loss": 0.055487439036369324, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002399329823674634, + "grad_norm": 6.429780006408691, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8690446615219116, + "num_tokens": 314982060.0, + "step": 8259 + }, + { + "epoch": 1.0507569011576134, + "ewc_loss": 0.055474285036325455, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002398014476057142, + "grad_norm": 6.463833808898926, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8515987396240234, + "num_tokens": 315021812.0, + "step": 8260 + }, + { + "epoch": 1.050884111436204, + "ewc_loss": 0.05540776625275612, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002391362504567951, + "grad_norm": 6.441421985626221, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8565823435783386, + "num_tokens": 315059520.0, + "step": 8261 + }, + { + "epoch": 1.0510113217147945, + "ewc_loss": 0.05551346391439438, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024019325792323798, + "grad_norm": 6.469663143157959, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8553004860877991, + "num_tokens": 315098392.0, + "step": 8262 + }, + { + "epoch": 1.051138531993385, + "ewc_loss": 0.055390872061252594, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023896728816907853, + "grad_norm": 6.409441947937012, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8503500819206238, + "num_tokens": 315137375.0, + "step": 8263 + }, + { + "epoch": 1.0512657422719756, + "ewc_loss": 0.05541566014289856, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000239215194596909, + "grad_norm": 6.516708850860596, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.864748477935791, + "num_tokens": 315171308.0, + "step": 8264 + }, + { + "epoch": 1.051392952550566, + "ewc_loss": 0.055371589958667755, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023877451894804835, + "grad_norm": 6.360270977020264, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8613662123680115, + "num_tokens": 315212176.0, + "step": 8265 + }, + { + "epoch": 1.0515201628291566, + "ewc_loss": 0.05546785145998001, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023973712814040482, + "grad_norm": 6.461312294006348, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8485733270645142, + "num_tokens": 315251321.0, + "step": 8266 + }, + { + "epoch": 1.0516473731077471, + "ewc_loss": 0.05546172708272934, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002396758645772934, + "grad_norm": 6.445690631866455, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8458802700042725, + "num_tokens": 315288780.0, + "step": 8267 + }, + { + "epoch": 1.0517745833863377, + "ewc_loss": 0.055398520082235336, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023904380213934928, + "grad_norm": 6.391854763031006, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8832070827484131, + "num_tokens": 315325630.0, + "step": 8268 + }, + { + "epoch": 1.0519017936649282, + "ewc_loss": 0.05544814467430115, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00023954003700055182, + "grad_norm": 6.501147747039795, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8544615507125854, + "num_tokens": 315362887.0, + "step": 8269 + }, + { + "epoch": 1.0520290039435187, + "ewc_loss": 0.05540207028388977, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002390793088125065, + "grad_norm": 6.398838043212891, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8732504844665527, + "num_tokens": 315397552.0, + "step": 8270 + }, + { + "epoch": 1.0521562142221093, + "ewc_loss": 0.057423174381256104, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00023975908698048443, + "grad_norm": 53.636531829833984, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8708622455596924, + "num_tokens": 315436785.0, + "step": 8271 + }, + { + "epoch": 1.0522834245006996, + "ewc_loss": 0.08483129739761353, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0005309301777742803, + "grad_norm": 10.383584022521973, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8701125383377075, + "num_tokens": 315467849.0, + "step": 8272 + }, + { + "epoch": 1.05241063477929, + "ewc_loss": 0.0588148795068264, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00027076597325503826, + "grad_norm": 6.1234211921691895, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8686276078224182, + "num_tokens": 315501496.0, + "step": 8273 + }, + { + "epoch": 1.0525378450578806, + "ewc_loss": 0.06838522106409073, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00036646940861828625, + "grad_norm": 8.971416473388672, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8694846630096436, + "num_tokens": 315540878.0, + "step": 8274 + }, + { + "epoch": 1.0526650553364711, + "ewc_loss": 0.07372111827135086, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0004198283713776618, + "grad_norm": 9.01545238494873, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8745196461677551, + "num_tokens": 315578139.0, + "step": 8275 + }, + { + "epoch": 1.0527922656150617, + "ewc_loss": 0.06120806559920311, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00029713925323449075, + "grad_norm": 6.848769664764404, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8627907037734985, + "num_tokens": 315615845.0, + "step": 8276 + }, + { + "epoch": 1.0529194758936522, + "ewc_loss": 0.062163181602954865, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0003066904318984598, + "grad_norm": 7.861056327819824, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8517074584960938, + "num_tokens": 315645407.0, + "step": 8277 + }, + { + "epoch": 1.0530466861722427, + "ewc_loss": 0.06394708156585693, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000324529450153932, + "grad_norm": 7.561439514160156, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8683077692985535, + "num_tokens": 315687922.0, + "step": 8278 + }, + { + "epoch": 1.0531738964508333, + "ewc_loss": 0.059303537011146545, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00027809393941424787, + "grad_norm": 6.974862575531006, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8631662130355835, + "num_tokens": 315723730.0, + "step": 8279 + }, + { + "epoch": 1.0533011067294238, + "ewc_loss": 0.05968134105205536, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00028187199495732784, + "grad_norm": 7.18973445892334, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8725805878639221, + "num_tokens": 315765662.0, + "step": 8280 + }, + { + "epoch": 1.0534283170080143, + "ewc_loss": 0.05907517671585083, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002758103364612907, + "grad_norm": 6.8474321365356445, + "learning_rate": 1e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8366509675979614, + "num_tokens": 315808668.0, + "step": 8281 + }, + { + "epoch": 1.0535555272866048, + "ewc_loss": 0.05799015238881111, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00026496010832488537, + "grad_norm": 6.962556838989258, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.856931209564209, + "num_tokens": 315844878.0, + "step": 8282 + }, + { + "epoch": 1.0536827375651954, + "ewc_loss": 0.05787377059459686, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00026379627524875104, + "grad_norm": 6.759597301483154, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8746985197067261, + "num_tokens": 315879472.0, + "step": 8283 + }, + { + "epoch": 1.0538099478437857, + "ewc_loss": 0.0572514533996582, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00025757314870133996, + "grad_norm": 6.787353515625, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8567520380020142, + "num_tokens": 315915496.0, + "step": 8284 + }, + { + "epoch": 1.0539371581223762, + "ewc_loss": 0.05691292881965637, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002541879075579345, + "grad_norm": 6.657689571380615, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8632094264030457, + "num_tokens": 315954317.0, + "step": 8285 + }, + { + "epoch": 1.0540643684009667, + "ewc_loss": 0.05669509992003441, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00025200960226356983, + "grad_norm": 6.662217140197754, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8564253449440002, + "num_tokens": 315995518.0, + "step": 8286 + }, + { + "epoch": 1.0541915786795573, + "ewc_loss": 0.05616869777441025, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024918699637055397, + "grad_norm": 6.549272060394287, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8743044137954712, + "num_tokens": 316040475.0, + "step": 8287 + }, + { + "epoch": 1.0543187889581478, + "ewc_loss": 0.056141309440135956, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002489130711182952, + "grad_norm": 6.606637477874756, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8596090078353882, + "num_tokens": 316083003.0, + "step": 8288 + }, + { + "epoch": 1.0544459992367383, + "ewc_loss": 0.055838510394096375, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002458851085975766, + "grad_norm": 6.544973373413086, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8620771765708923, + "num_tokens": 316124687.0, + "step": 8289 + }, + { + "epoch": 1.0545732095153288, + "ewc_loss": 0.05585160478949547, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024601604673080146, + "grad_norm": 6.572197914123535, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8473907113075256, + "num_tokens": 316170052.0, + "step": 8290 + }, + { + "epoch": 1.0547004197939194, + "ewc_loss": 0.05563080310821533, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002438080555293709, + "grad_norm": 6.497520923614502, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8472850918769836, + "num_tokens": 316211941.0, + "step": 8291 + }, + { + "epoch": 1.05482763007251, + "ewc_loss": 0.055651646107435226, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024401645350735635, + "grad_norm": 6.554892539978027, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8646277189254761, + "num_tokens": 316252592.0, + "step": 8292 + }, + { + "epoch": 1.0549548403511004, + "ewc_loss": 0.05555799603462219, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002430799650028348, + "grad_norm": 6.468781471252441, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8691540956497192, + "num_tokens": 316287250.0, + "step": 8293 + }, + { + "epoch": 1.055082050629691, + "ewc_loss": 0.055329062044620514, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.000243232017965056, + "grad_norm": 6.539584159851074, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8644964098930359, + "num_tokens": 316326308.0, + "step": 8294 + }, + { + "epoch": 1.0552092609082815, + "ewc_loss": 0.055416837334632874, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024166835646610707, + "grad_norm": 6.465479850769043, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8639869689941406, + "num_tokens": 316364519.0, + "step": 8295 + }, + { + "epoch": 1.0553364711868718, + "ewc_loss": 0.055285073816776276, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002427921281196177, + "grad_norm": 6.573884010314941, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8667028546333313, + "num_tokens": 316401165.0, + "step": 8296 + }, + { + "epoch": 1.0554636814654623, + "ewc_loss": 0.055125243961811066, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.0002411938621662557, + "grad_norm": 6.432673931121826, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8559303879737854, + "num_tokens": 316440245.0, + "step": 8297 + }, + { + "epoch": 1.0555908917440529, + "ewc_loss": 0.05527330935001373, + "ewc_loss_diag": 3.0994415283203125e-05, + "ewc_loss_parallel": 0.00024267449043691158, + "grad_norm": 6.491782188415527, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8697525858879089, + "num_tokens": 316484969.0, + "step": 8298 + }, + { + "epoch": 1.0557181020226434, + "ewc_loss": 0.05539570748806, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002414570772089064, + "grad_norm": 6.42514705657959, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8794622421264648, + "num_tokens": 316520510.0, + "step": 8299 + }, + { + "epoch": 1.055845312301234, + "ewc_loss": 0.0554879754781723, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.000242379741393961, + "grad_norm": 6.458361625671387, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8451336622238159, + "num_tokens": 316561946.0, + "step": 8300 + }, + { + "epoch": 1.0559725225798244, + "ewc_loss": 0.055445052683353424, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024195054720621556, + "grad_norm": 6.4375386238098145, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8784801959991455, + "num_tokens": 316605489.0, + "step": 8301 + }, + { + "epoch": 1.056099732858415, + "ewc_loss": 0.05547333508729935, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024223334912676364, + "grad_norm": 6.4682698249816895, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8457059264183044, + "num_tokens": 316648590.0, + "step": 8302 + }, + { + "epoch": 1.0562269431370055, + "ewc_loss": 0.05550886690616608, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024258866324089468, + "grad_norm": 6.432496547698975, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.873776912689209, + "num_tokens": 316688169.0, + "step": 8303 + }, + { + "epoch": 1.056354153415596, + "ewc_loss": 0.05552997440099716, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002427997242193669, + "grad_norm": 6.53272008895874, + "learning_rate": 1e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8411969542503357, + "num_tokens": 316724031.0, + "step": 8304 + }, + { + "epoch": 1.0564813636941865, + "ewc_loss": 0.05542999133467674, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024179992033168674, + "grad_norm": 6.388209819793701, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.86011803150177, + "num_tokens": 316769762.0, + "step": 8305 + }, + { + "epoch": 1.056608573972777, + "ewc_loss": 0.05553896725177765, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024288965505547822, + "grad_norm": 6.449854373931885, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8693736791610718, + "num_tokens": 316814156.0, + "step": 8306 + }, + { + "epoch": 1.0567357842513676, + "ewc_loss": 0.05558282881975174, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024332827888429165, + "grad_norm": 6.4289021492004395, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8707330226898193, + "num_tokens": 316852784.0, + "step": 8307 + }, + { + "epoch": 1.0568629945299581, + "ewc_loss": 0.05556763708591461, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024317637144122273, + "grad_norm": 6.482138156890869, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8555135130882263, + "num_tokens": 316891992.0, + "step": 8308 + }, + { + "epoch": 1.0569902048085484, + "ewc_loss": 0.05582940950989723, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024335269699804485, + "grad_norm": 6.432894229888916, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8754298686981201, + "num_tokens": 316932246.0, + "step": 8309 + }, + { + "epoch": 1.057117415087139, + "ewc_loss": 0.0558738186955452, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024379679234698415, + "grad_norm": 6.524285316467285, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8540784120559692, + "num_tokens": 316972017.0, + "step": 8310 + }, + { + "epoch": 1.0572446253657295, + "ewc_loss": 0.055622901767492294, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024372902407776564, + "grad_norm": 6.580524921417236, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8565918207168579, + "num_tokens": 317008631.0, + "step": 8311 + }, + { + "epoch": 1.05737183564432, + "ewc_loss": 0.055795472115278244, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024301331723108888, + "grad_norm": 6.462769031524658, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8668403625488281, + "num_tokens": 317048616.0, + "step": 8312 + }, + { + "epoch": 1.0574990459229106, + "ewc_loss": 0.05585620552301407, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024362064141314477, + "grad_norm": 6.5365424156188965, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8538727164268494, + "num_tokens": 317088675.0, + "step": 8313 + }, + { + "epoch": 1.057626256201501, + "ewc_loss": 0.05578015744686127, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002428601437713951, + "grad_norm": 6.505887508392334, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8824375867843628, + "num_tokens": 317120351.0, + "step": 8314 + }, + { + "epoch": 1.0577534664800916, + "ewc_loss": 0.05587567389011383, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002438153314869851, + "grad_norm": 6.498757362365723, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8555335402488708, + "num_tokens": 317159027.0, + "step": 8315 + }, + { + "epoch": 1.0578806767586821, + "ewc_loss": 0.05577860027551651, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024284457322210073, + "grad_norm": 6.5281853675842285, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8612370491027832, + "num_tokens": 317188892.0, + "step": 8316 + }, + { + "epoch": 1.0580078870372727, + "ewc_loss": 0.05575910955667496, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000242649664869532, + "grad_norm": 6.429315090179443, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8665321469306946, + "num_tokens": 317229026.0, + "step": 8317 + }, + { + "epoch": 1.0581350973158632, + "ewc_loss": 0.05589905008673668, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024404909345321357, + "grad_norm": 6.541557788848877, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8606321215629578, + "num_tokens": 317272946.0, + "step": 8318 + }, + { + "epoch": 1.0582623075944537, + "ewc_loss": 0.05572615563869476, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024232013674918562, + "grad_norm": 6.4144392013549805, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8606822490692139, + "num_tokens": 317314625.0, + "step": 8319 + }, + { + "epoch": 1.058389517873044, + "ewc_loss": 0.05593552067875862, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002444137935526669, + "grad_norm": 6.5492353439331055, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8714809417724609, + "num_tokens": 317351450.0, + "step": 8320 + }, + { + "epoch": 1.0585167281516346, + "ewc_loss": 0.05578029528260231, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024286155530717224, + "grad_norm": 6.481232643127441, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8446543216705322, + "num_tokens": 317391519.0, + "step": 8321 + }, + { + "epoch": 1.058643938430225, + "ewc_loss": 0.05592067167162895, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024426530580967665, + "grad_norm": 6.544722080230713, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8553471565246582, + "num_tokens": 317430672.0, + "step": 8322 + }, + { + "epoch": 1.0587711487088156, + "ewc_loss": 0.05576970800757408, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024275567557197064, + "grad_norm": 6.439324378967285, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8627711534500122, + "num_tokens": 317470203.0, + "step": 8323 + }, + { + "epoch": 1.0588983589874061, + "ewc_loss": 0.055893562734127045, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024399420362897217, + "grad_norm": 6.537857532501221, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8637235164642334, + "num_tokens": 317509594.0, + "step": 8324 + }, + { + "epoch": 1.0590255692659967, + "ewc_loss": 0.05578684061765671, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002429269952699542, + "grad_norm": 6.49374532699585, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8588888645172119, + "num_tokens": 317549735.0, + "step": 8325 + }, + { + "epoch": 1.0591527795445872, + "ewc_loss": 0.055811312049627304, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024317171482834965, + "grad_norm": 6.603224277496338, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.859151303768158, + "num_tokens": 317585064.0, + "step": 8326 + }, + { + "epoch": 1.0592799898231777, + "ewc_loss": 0.05569414049386978, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024199999461416155, + "grad_norm": 6.432676792144775, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8768535852432251, + "num_tokens": 317622289.0, + "step": 8327 + }, + { + "epoch": 1.0594072001017683, + "ewc_loss": 0.0556073933839798, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024357391521334648, + "grad_norm": 6.643575191497803, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.850199818611145, + "num_tokens": 317662872.0, + "step": 8328 + }, + { + "epoch": 1.0595344103803588, + "ewc_loss": 0.05538008362054825, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002413008187431842, + "grad_norm": 6.408577919006348, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8624163866043091, + "num_tokens": 317701418.0, + "step": 8329 + }, + { + "epoch": 1.0596616206589493, + "ewc_loss": 0.05571258068084717, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024462578585371375, + "grad_norm": 6.651143550872803, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8750115036964417, + "num_tokens": 317741416.0, + "step": 8330 + }, + { + "epoch": 1.0597888309375398, + "ewc_loss": 0.055658865720033646, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002416472416371107, + "grad_norm": 6.433241367340088, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8665259480476379, + "num_tokens": 317778623.0, + "step": 8331 + }, + { + "epoch": 1.0599160412161304, + "ewc_loss": 0.055939119309186935, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002444497949909419, + "grad_norm": 6.660762310028076, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8639397621154785, + "num_tokens": 317816577.0, + "step": 8332 + }, + { + "epoch": 1.0600432514947207, + "ewc_loss": 0.055471021682024, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024221021158155054, + "grad_norm": 6.50869083404541, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8604521751403809, + "num_tokens": 317851532.0, + "step": 8333 + }, + { + "epoch": 1.0601704617733112, + "ewc_loss": 0.055819809436798096, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024325666890945286, + "grad_norm": 6.489005088806152, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.861859142780304, + "num_tokens": 317886789.0, + "step": 8334 + }, + { + "epoch": 1.0602976720519017, + "ewc_loss": 0.05572732165455818, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024233180738519877, + "grad_norm": 6.475993633270264, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8633222579956055, + "num_tokens": 317929511.0, + "step": 8335 + }, + { + "epoch": 1.0604248823304923, + "ewc_loss": 0.05551396310329437, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024263965315185487, + "grad_norm": 6.444636344909668, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8594681024551392, + "num_tokens": 317963767.0, + "step": 8336 + }, + { + "epoch": 1.0605520926090828, + "ewc_loss": 0.05561082810163498, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024360828683711588, + "grad_norm": 6.556685447692871, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.877863883972168, + "num_tokens": 318000458.0, + "step": 8337 + }, + { + "epoch": 1.0606793028876733, + "ewc_loss": 0.055560629814863205, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002431062894174829, + "grad_norm": 6.425739288330078, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8781483173370361, + "num_tokens": 318040394.0, + "step": 8338 + }, + { + "epoch": 1.0608065131662638, + "ewc_loss": 0.05590180307626724, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024407664022874087, + "grad_norm": 6.494131565093994, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8586219549179077, + "num_tokens": 318080859.0, + "step": 8339 + }, + { + "epoch": 1.0609337234448544, + "ewc_loss": 0.055570222437381744, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024320220109075308, + "grad_norm": 6.92088508605957, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8780330419540405, + "num_tokens": 318115378.0, + "step": 8340 + }, + { + "epoch": 1.061060933723445, + "ewc_loss": 0.05540751665830612, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.00024157515144906938, + "grad_norm": 6.3610944747924805, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8542707562446594, + "num_tokens": 318150351.0, + "step": 8341 + }, + { + "epoch": 1.0611881440020354, + "ewc_loss": 0.05566607415676117, + "ewc_loss_diag": 3.123283386230469e-05, + "ewc_loss_parallel": 0.0002441607357468456, + "grad_norm": 6.50521183013916, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8618218302726746, + "num_tokens": 318192427.0, + "step": 8342 + }, + { + "epoch": 1.061315354280626, + "ewc_loss": 0.055665887892246246, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024171748373191804, + "grad_norm": 6.483639240264893, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8491069674491882, + "num_tokens": 318232150.0, + "step": 8343 + }, + { + "epoch": 1.0614425645592165, + "ewc_loss": 0.0557936355471611, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002429949672659859, + "grad_norm": 6.511836051940918, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8571824431419373, + "num_tokens": 318267742.0, + "step": 8344 + }, + { + "epoch": 1.0615697748378068, + "ewc_loss": 0.05572168529033661, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024227543326560408, + "grad_norm": 6.411968231201172, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8658473491668701, + "num_tokens": 318307974.0, + "step": 8345 + }, + { + "epoch": 1.0616969851163973, + "ewc_loss": 0.05578509718179703, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024290959117934108, + "grad_norm": 6.495323181152344, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8441696166992188, + "num_tokens": 318349880.0, + "step": 8346 + }, + { + "epoch": 1.0618241953949878, + "ewc_loss": 0.05580154433846474, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024307404237333685, + "grad_norm": 6.452223777770996, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8594468832015991, + "num_tokens": 318383869.0, + "step": 8347 + }, + { + "epoch": 1.0619514056735784, + "ewc_loss": 0.05577556788921356, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024281426158268005, + "grad_norm": 6.4258575439453125, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8773645758628845, + "num_tokens": 318419659.0, + "step": 8348 + }, + { + "epoch": 1.062078615952169, + "ewc_loss": 0.05580759793519974, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024313456378877163, + "grad_norm": 6.421679496765137, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8699944019317627, + "num_tokens": 318460795.0, + "step": 8349 + }, + { + "epoch": 1.0622058262307594, + "ewc_loss": 0.055820759385824203, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002432661858620122, + "grad_norm": 6.490076541900635, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8515967130661011, + "num_tokens": 318499451.0, + "step": 8350 + }, + { + "epoch": 1.06233303650935, + "ewc_loss": 0.05583638697862625, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024342245887964964, + "grad_norm": 6.499561309814453, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8665960431098938, + "num_tokens": 318533413.0, + "step": 8351 + }, + { + "epoch": 1.0624602467879405, + "ewc_loss": 0.05583224073052406, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024338100047316402, + "grad_norm": 6.460165500640869, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8508222103118896, + "num_tokens": 318572533.0, + "step": 8352 + }, + { + "epoch": 1.062587457066531, + "ewc_loss": 0.05581554025411606, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002432140172459185, + "grad_norm": 6.496172904968262, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8635071516036987, + "num_tokens": 318609442.0, + "step": 8353 + }, + { + "epoch": 1.0627146673451215, + "ewc_loss": 0.05584249272942543, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024348351871594787, + "grad_norm": 6.461427211761475, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8761956095695496, + "num_tokens": 318643475.0, + "step": 8354 + }, + { + "epoch": 1.062841877623712, + "ewc_loss": 0.05582929402589798, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024335151829291135, + "grad_norm": 6.52985143661499, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8647456765174866, + "num_tokens": 318684223.0, + "step": 8355 + }, + { + "epoch": 1.0629690879023026, + "ewc_loss": 0.05583563074469566, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002434148918837309, + "grad_norm": 6.487119197845459, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8618637323379517, + "num_tokens": 318717148.0, + "step": 8356 + }, + { + "epoch": 1.0630962981808931, + "ewc_loss": 0.05587976425886154, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024385623692069203, + "grad_norm": 6.443832874298096, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8725666999816895, + "num_tokens": 318758929.0, + "step": 8357 + }, + { + "epoch": 1.0632235084594834, + "ewc_loss": 0.055844347923994064, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024350207240786403, + "grad_norm": 6.482398986816406, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8599846363067627, + "num_tokens": 318796438.0, + "step": 8358 + }, + { + "epoch": 1.063350718738074, + "ewc_loss": 0.055908773094415665, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002441463293507695, + "grad_norm": 6.466084957122803, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8484625816345215, + "num_tokens": 318835732.0, + "step": 8359 + }, + { + "epoch": 1.0634779290166645, + "ewc_loss": 0.055876798927783966, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024382658011745661, + "grad_norm": 6.452168941497803, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8605465888977051, + "num_tokens": 318872151.0, + "step": 8360 + }, + { + "epoch": 1.063605139295255, + "ewc_loss": 0.05591952055692673, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002442537806928158, + "grad_norm": 6.478578567504883, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8507990837097168, + "num_tokens": 318911956.0, + "step": 8361 + }, + { + "epoch": 1.0637323495738455, + "ewc_loss": 0.05594813823699951, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024453995865769684, + "grad_norm": 6.514644145965576, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8628454804420471, + "num_tokens": 318945593.0, + "step": 8362 + }, + { + "epoch": 1.063859559852436, + "ewc_loss": 0.0558866448700428, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024392503837589175, + "grad_norm": 6.592087745666504, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8720470666885376, + "num_tokens": 318976670.0, + "step": 8363 + }, + { + "epoch": 1.0639867701310266, + "ewc_loss": 0.0558437779545784, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024349636805709451, + "grad_norm": 6.457276821136475, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8646473288536072, + "num_tokens": 319007798.0, + "step": 8364 + }, + { + "epoch": 1.0641139804096171, + "ewc_loss": 0.055897749960422516, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024403609859291464, + "grad_norm": 6.508916854858398, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8517971038818359, + "num_tokens": 319044808.0, + "step": 8365 + }, + { + "epoch": 1.0642411906882077, + "ewc_loss": 0.05578021705150604, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024286074039991945, + "grad_norm": 6.4205474853515625, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8813866972923279, + "num_tokens": 319083387.0, + "step": 8366 + }, + { + "epoch": 1.0643684009667982, + "ewc_loss": 0.055915504693984985, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002442136174067855, + "grad_norm": 6.468846321105957, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8779347538948059, + "num_tokens": 319121897.0, + "step": 8367 + }, + { + "epoch": 1.0644956112453887, + "ewc_loss": 0.056097619235515594, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024359338567592204, + "grad_norm": 6.522192478179932, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8504065275192261, + "num_tokens": 319157190.0, + "step": 8368 + }, + { + "epoch": 1.064622821523979, + "ewc_loss": 0.05609362572431564, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024355342611670494, + "grad_norm": 6.494574546813965, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.871716320514679, + "num_tokens": 319188353.0, + "step": 8369 + }, + { + "epoch": 1.0647500318025696, + "ewc_loss": 0.0561068020761013, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024368520826101303, + "grad_norm": 6.44444465637207, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.865031898021698, + "num_tokens": 319225536.0, + "step": 8370 + }, + { + "epoch": 1.06487724208116, + "ewc_loss": 0.05605305731296539, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024314776237588376, + "grad_norm": 6.5095367431640625, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.868302047252655, + "num_tokens": 319265114.0, + "step": 8371 + }, + { + "epoch": 1.0650044523597506, + "ewc_loss": 0.05614994466304779, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024411662889178842, + "grad_norm": 6.410344123840332, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8719585537910461, + "num_tokens": 319309280.0, + "step": 8372 + }, + { + "epoch": 1.0651316626383411, + "ewc_loss": 0.056176140904426575, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002443786070216447, + "grad_norm": 6.440297603607178, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8731536865234375, + "num_tokens": 319346114.0, + "step": 8373 + }, + { + "epoch": 1.0652588729169317, + "ewc_loss": 0.05607931315898895, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024341029347851872, + "grad_norm": 6.443182468414307, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8692997694015503, + "num_tokens": 319389001.0, + "step": 8374 + }, + { + "epoch": 1.0653860831955222, + "ewc_loss": 0.055928826332092285, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002443468547426164, + "grad_norm": 6.435525894165039, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8709884881973267, + "num_tokens": 319431464.0, + "step": 8375 + }, + { + "epoch": 1.0655132934741127, + "ewc_loss": 0.05599632114171982, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000245021830778569, + "grad_norm": 6.455523490905762, + "learning_rate": 1e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8382067084312439, + "num_tokens": 319475638.0, + "step": 8376 + }, + { + "epoch": 1.0656405037527032, + "ewc_loss": 0.05591624975204468, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024422109709121287, + "grad_norm": 6.478172779083252, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8724087476730347, + "num_tokens": 319515983.0, + "step": 8377 + }, + { + "epoch": 1.0657677140312938, + "ewc_loss": 0.05596958100795746, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024475439568050206, + "grad_norm": 6.534761428833008, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8596384525299072, + "num_tokens": 319552152.0, + "step": 8378 + }, + { + "epoch": 1.0658949243098843, + "ewc_loss": 0.05588863044977188, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024394488718826324, + "grad_norm": 6.417038917541504, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8547798991203308, + "num_tokens": 319592963.0, + "step": 8379 + }, + { + "epoch": 1.0660221345884748, + "ewc_loss": 0.05600457638502121, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002451043692417443, + "grad_norm": 6.532379150390625, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8484621644020081, + "num_tokens": 319635136.0, + "step": 8380 + }, + { + "epoch": 1.0661493448670654, + "ewc_loss": 0.0558772012591362, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024383061099797487, + "grad_norm": 6.4331583976745605, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8655887842178345, + "num_tokens": 319670118.0, + "step": 8381 + }, + { + "epoch": 1.0662765551456557, + "ewc_loss": 0.05610673502087593, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002461259427946061, + "grad_norm": 6.5634870529174805, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.86863774061203, + "num_tokens": 319711057.0, + "step": 8382 + }, + { + "epoch": 1.0664037654242462, + "ewc_loss": 0.05583487078547478, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024340729578398168, + "grad_norm": 6.473438739776611, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8595963716506958, + "num_tokens": 319746451.0, + "step": 8383 + }, + { + "epoch": 1.0665309757028367, + "ewc_loss": 0.056006237864494324, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002451209875289351, + "grad_norm": 6.4760212898254395, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8559092283248901, + "num_tokens": 319792149.0, + "step": 8384 + }, + { + "epoch": 1.0666581859814273, + "ewc_loss": 0.05596817657351494, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002447403676342219, + "grad_norm": 6.542669296264648, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8538500070571899, + "num_tokens": 319826508.0, + "step": 8385 + }, + { + "epoch": 1.0667853962600178, + "ewc_loss": 0.05589974299073219, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024405602016486228, + "grad_norm": 6.445455074310303, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8654325008392334, + "num_tokens": 319867000.0, + "step": 8386 + }, + { + "epoch": 1.0669126065386083, + "ewc_loss": 0.05599639564752579, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002450225583743304, + "grad_norm": 6.526456356048584, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8444787263870239, + "num_tokens": 319904476.0, + "step": 8387 + }, + { + "epoch": 1.0670398168171988, + "ewc_loss": 0.05589577928185463, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024401639529969543, + "grad_norm": 6.540432929992676, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8375773429870605, + "num_tokens": 319943279.0, + "step": 8388 + }, + { + "epoch": 1.0671670270957894, + "ewc_loss": 0.055964402854442596, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024470261996611953, + "grad_norm": 6.465012073516846, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8574166297912598, + "num_tokens": 319986610.0, + "step": 8389 + }, + { + "epoch": 1.06729423737438, + "ewc_loss": 0.0559602789580822, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024466137983836234, + "grad_norm": 6.562074661254883, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8547543287277222, + "num_tokens": 320022702.0, + "step": 8390 + }, + { + "epoch": 1.0674214476529704, + "ewc_loss": 0.05590977519750595, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002441563701722771, + "grad_norm": 6.4665117263793945, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8639810085296631, + "num_tokens": 320067918.0, + "step": 8391 + }, + { + "epoch": 1.067548657931561, + "ewc_loss": 0.055965863168239594, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002447172300890088, + "grad_norm": 6.497958660125732, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8545864224433899, + "num_tokens": 320110455.0, + "step": 8392 + }, + { + "epoch": 1.0676758682101515, + "ewc_loss": 0.05618433654308319, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002444605634082109, + "grad_norm": 6.510610103607178, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8785730600357056, + "num_tokens": 320147675.0, + "step": 8393 + }, + { + "epoch": 1.0678030784887418, + "ewc_loss": 0.05593045800924301, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002443631528876722, + "grad_norm": 6.519505500793457, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8484089970588684, + "num_tokens": 320183882.0, + "step": 8394 + }, + { + "epoch": 1.0679302887673323, + "ewc_loss": 0.055923499166965485, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002442935947328806, + "grad_norm": 6.467545986175537, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8512816429138184, + "num_tokens": 320222215.0, + "step": 8395 + }, + { + "epoch": 1.0680574990459228, + "ewc_loss": 0.055951714515686035, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024457572726532817, + "grad_norm": 6.517919540405273, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.853883683681488, + "num_tokens": 320262418.0, + "step": 8396 + }, + { + "epoch": 1.0681847093245134, + "ewc_loss": 0.05589508265256882, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002440093958284706, + "grad_norm": 6.473195552825928, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8556910753250122, + "num_tokens": 320297304.0, + "step": 8397 + }, + { + "epoch": 1.068311919603104, + "ewc_loss": 0.056007690727710724, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000245135510340333, + "grad_norm": 6.554272174835205, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8634353280067444, + "num_tokens": 320330919.0, + "step": 8398 + }, + { + "epoch": 1.0684391298816944, + "ewc_loss": 0.05609899014234543, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002436070644762367, + "grad_norm": 6.48781156539917, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8629838228225708, + "num_tokens": 320367602.0, + "step": 8399 + }, + { + "epoch": 1.068566340160285, + "ewc_loss": 0.05615770071744919, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002441941760480404, + "grad_norm": 6.463200092315674, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8623226881027222, + "num_tokens": 320403875.0, + "step": 8400 + }, + { + "epoch": 1.0686935504388755, + "ewc_loss": 0.05609007552266121, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024351793399546295, + "grad_norm": 6.496076583862305, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.865031361579895, + "num_tokens": 320439149.0, + "step": 8401 + }, + { + "epoch": 1.068820760717466, + "ewc_loss": 0.05615639686584473, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002441811375319958, + "grad_norm": 6.470096111297607, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8737372159957886, + "num_tokens": 320476786.0, + "step": 8402 + }, + { + "epoch": 1.0689479709960565, + "ewc_loss": 0.05615641921758652, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002441813994664699, + "grad_norm": 6.51495361328125, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8468239903450012, + "num_tokens": 320516945.0, + "step": 8403 + }, + { + "epoch": 1.069075181274647, + "ewc_loss": 0.05612104758620262, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024382765695918351, + "grad_norm": 6.438095569610596, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8765136003494263, + "num_tokens": 320552801.0, + "step": 8404 + }, + { + "epoch": 1.0692023915532376, + "ewc_loss": 0.05615624040365219, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002441795659251511, + "grad_norm": 6.44821834564209, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8633469939231873, + "num_tokens": 320593605.0, + "step": 8405 + }, + { + "epoch": 1.0693296018318281, + "ewc_loss": 0.056137438863515854, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024399158428423107, + "grad_norm": 6.552151679992676, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8704997897148132, + "num_tokens": 320624476.0, + "step": 8406 + }, + { + "epoch": 1.0694568121104184, + "ewc_loss": 0.05610143020749092, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024363149714190513, + "grad_norm": 6.4056806564331055, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8681748509407043, + "num_tokens": 320667000.0, + "step": 8407 + }, + { + "epoch": 1.069584022389009, + "ewc_loss": 0.05617400258779526, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024435718660242856, + "grad_norm": 6.514097213745117, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.866877555847168, + "num_tokens": 320699286.0, + "step": 8408 + }, + { + "epoch": 1.0697112326675995, + "ewc_loss": 0.0560573935508728, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024319114163517952, + "grad_norm": 6.476035118103027, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8692672252655029, + "num_tokens": 320732520.0, + "step": 8409 + }, + { + "epoch": 1.06983844294619, + "ewc_loss": 0.056203991174697876, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024465707247145474, + "grad_norm": 6.457890510559082, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8500572443008423, + "num_tokens": 320773425.0, + "step": 8410 + }, + { + "epoch": 1.0699656532247805, + "ewc_loss": 0.05612461268901825, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024386329459957778, + "grad_norm": 6.494732856750488, + "learning_rate": 1e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8392981290817261, + "num_tokens": 320810859.0, + "step": 8411 + }, + { + "epoch": 1.070092863503371, + "ewc_loss": 0.05609352886676788, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024355245113838464, + "grad_norm": 6.411098480224609, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8656413555145264, + "num_tokens": 320850266.0, + "step": 8412 + }, + { + "epoch": 1.0702200737819616, + "ewc_loss": 0.056219786405563354, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002448150480631739, + "grad_norm": 6.398693084716797, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8600780963897705, + "num_tokens": 320892812.0, + "step": 8413 + }, + { + "epoch": 1.0703472840605521, + "ewc_loss": 0.05620954930782318, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002447126607876271, + "grad_norm": 6.479244232177734, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8738573789596558, + "num_tokens": 320929563.0, + "step": 8414 + }, + { + "epoch": 1.0704744943391427, + "ewc_loss": 0.05621282011270523, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024474537349306047, + "grad_norm": 6.513955116271973, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8687565922737122, + "num_tokens": 320964010.0, + "step": 8415 + }, + { + "epoch": 1.0706017046177332, + "ewc_loss": 0.05615535378456116, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002441707474645227, + "grad_norm": 6.412955284118652, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8584725856781006, + "num_tokens": 321006405.0, + "step": 8416 + }, + { + "epoch": 1.0707289148963237, + "ewc_loss": 0.05621311813592911, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024474834208376706, + "grad_norm": 6.4842987060546875, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8566868901252747, + "num_tokens": 321050146.0, + "step": 8417 + }, + { + "epoch": 1.070856125174914, + "ewc_loss": 0.056184060871601105, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024445776944048703, + "grad_norm": 6.449739456176758, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8613966107368469, + "num_tokens": 321086857.0, + "step": 8418 + }, + { + "epoch": 1.0709833354535045, + "ewc_loss": 0.05620846524834633, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002447018341626972, + "grad_norm": 6.521100997924805, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8671422004699707, + "num_tokens": 321127310.0, + "step": 8419 + }, + { + "epoch": 1.071110545732095, + "ewc_loss": 0.05622712895274162, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024488847702741623, + "grad_norm": 6.474562644958496, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8515451550483704, + "num_tokens": 321165881.0, + "step": 8420 + }, + { + "epoch": 1.0712377560106856, + "ewc_loss": 0.05624321848154068, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024504936300218105, + "grad_norm": 6.488083839416504, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.855819821357727, + "num_tokens": 321204494.0, + "step": 8421 + }, + { + "epoch": 1.0713649662892761, + "ewc_loss": 0.056245774030685425, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002450749452691525, + "grad_norm": 6.499619483947754, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8511212468147278, + "num_tokens": 321243639.0, + "step": 8422 + }, + { + "epoch": 1.0714921765678667, + "ewc_loss": 0.05619318038225174, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002445490099489689, + "grad_norm": 6.5026936531066895, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8527427911758423, + "num_tokens": 321281399.0, + "step": 8423 + }, + { + "epoch": 1.0716193868464572, + "ewc_loss": 0.05634103715419769, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024602757184766233, + "grad_norm": 6.585282325744629, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8493759632110596, + "num_tokens": 321316539.0, + "step": 8424 + }, + { + "epoch": 1.0717465971250477, + "ewc_loss": 0.056090258061885834, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024351978208869696, + "grad_norm": 6.5530104637146, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8571889400482178, + "num_tokens": 321348618.0, + "step": 8425 + }, + { + "epoch": 1.0718738074036382, + "ewc_loss": 0.05620095878839493, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002446267753839493, + "grad_norm": 6.522644996643066, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8668871521949768, + "num_tokens": 321381674.0, + "step": 8426 + }, + { + "epoch": 1.0720010176822288, + "ewc_loss": 0.056101515889167786, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024363237025681883, + "grad_norm": 6.411616325378418, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8763086199760437, + "num_tokens": 321420342.0, + "step": 8427 + }, + { + "epoch": 1.0721282279608193, + "ewc_loss": 0.056291982531547546, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002455370267853141, + "grad_norm": 6.603939056396484, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8581131100654602, + "num_tokens": 321450770.0, + "step": 8428 + }, + { + "epoch": 1.0722554382394098, + "ewc_loss": 0.05614646524190903, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002440818352624774, + "grad_norm": 6.39084529876709, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8653874397277832, + "num_tokens": 321491450.0, + "step": 8429 + }, + { + "epoch": 1.0723826485180004, + "ewc_loss": 0.05638841912150383, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024650138220749795, + "grad_norm": 6.6222052574157715, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8551653623580933, + "num_tokens": 321531774.0, + "step": 8430 + }, + { + "epoch": 1.0725098587965907, + "ewc_loss": 0.056151024997234344, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024412742641288787, + "grad_norm": 6.501633167266846, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.861525297164917, + "num_tokens": 321567692.0, + "step": 8431 + }, + { + "epoch": 1.0726370690751812, + "ewc_loss": 0.05628974363207817, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002455146168358624, + "grad_norm": 6.603545188903809, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8570672869682312, + "num_tokens": 321601168.0, + "step": 8432 + }, + { + "epoch": 1.0727642793537717, + "ewc_loss": 0.0561211034655571, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024382822448387742, + "grad_norm": 6.459227561950684, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8431114554405212, + "num_tokens": 321636100.0, + "step": 8433 + }, + { + "epoch": 1.0728914896323622, + "ewc_loss": 0.05621529743075371, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002447701699566096, + "grad_norm": 6.500898838043213, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8623199462890625, + "num_tokens": 321674365.0, + "step": 8434 + }, + { + "epoch": 1.0730186999109528, + "ewc_loss": 0.0562015175819397, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024463236331939697, + "grad_norm": 6.485530376434326, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8715876340866089, + "num_tokens": 321712267.0, + "step": 8435 + }, + { + "epoch": 1.0731459101895433, + "ewc_loss": 0.05624060705304146, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024502325686626136, + "grad_norm": 6.588777542114258, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8586207628250122, + "num_tokens": 321749955.0, + "step": 8436 + }, + { + "epoch": 1.0732731204681338, + "ewc_loss": 0.05615601688623428, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024417738313786685, + "grad_norm": 6.535970687866211, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8699454069137573, + "num_tokens": 321788522.0, + "step": 8437 + }, + { + "epoch": 1.0734003307467244, + "ewc_loss": 0.056195661425590515, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024457377730868757, + "grad_norm": 6.518163681030273, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8583098649978638, + "num_tokens": 321826314.0, + "step": 8438 + }, + { + "epoch": 1.073527541025315, + "ewc_loss": 0.05642986297607422, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002444743877276778, + "grad_norm": 6.525842666625977, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8573338985443115, + "num_tokens": 321864851.0, + "step": 8439 + }, + { + "epoch": 1.0736547513039054, + "ewc_loss": 0.05615299567580223, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002441471442580223, + "grad_norm": 6.503319263458252, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.871010422706604, + "num_tokens": 321901001.0, + "step": 8440 + }, + { + "epoch": 1.073781961582496, + "ewc_loss": 0.05627124011516571, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002453295746818185, + "grad_norm": 6.607602596282959, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8591387271881104, + "num_tokens": 321941368.0, + "step": 8441 + }, + { + "epoch": 1.0739091718610865, + "ewc_loss": 0.056085340678691864, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024347056751139462, + "grad_norm": 6.511242866516113, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8636773824691772, + "num_tokens": 321978268.0, + "step": 8442 + }, + { + "epoch": 1.0740363821396768, + "ewc_loss": 0.0562199130654335, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.000244816328631714, + "grad_norm": 6.508021354675293, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8687810301780701, + "num_tokens": 322016969.0, + "step": 8443 + }, + { + "epoch": 1.0741635924182673, + "ewc_loss": 0.0560910701751709, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024352790205739439, + "grad_norm": 6.456278324127197, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8538216352462769, + "num_tokens": 322059362.0, + "step": 8444 + }, + { + "epoch": 1.0742908026968578, + "ewc_loss": 0.05612630397081375, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024388021847698838, + "grad_norm": 6.6213908195495605, + "learning_rate": 1e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8421223163604736, + "num_tokens": 322097334.0, + "step": 8445 + }, + { + "epoch": 1.0744180129754484, + "ewc_loss": 0.056124888360500336, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024386608856730163, + "grad_norm": 6.564210891723633, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8813721537590027, + "num_tokens": 322121988.0, + "step": 8446 + }, + { + "epoch": 1.074545223254039, + "ewc_loss": 0.056030116975307465, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024291836598422378, + "grad_norm": 6.5174689292907715, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8562729358673096, + "num_tokens": 322156602.0, + "step": 8447 + }, + { + "epoch": 1.0746724335326294, + "ewc_loss": 0.056114956736564636, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024376677174586803, + "grad_norm": 6.525023937225342, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8587597012519836, + "num_tokens": 322194475.0, + "step": 8448 + }, + { + "epoch": 1.07479964381122, + "ewc_loss": 0.05605912581086159, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024320844386238605, + "grad_norm": 6.532476902008057, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8701532483100891, + "num_tokens": 322232158.0, + "step": 8449 + }, + { + "epoch": 1.0749268540898105, + "ewc_loss": 0.056077830493450165, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024339549418073148, + "grad_norm": 6.441734313964844, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8703621029853821, + "num_tokens": 322272170.0, + "step": 8450 + }, + { + "epoch": 1.075054064368401, + "ewc_loss": 0.056093037128448486, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002435475616948679, + "grad_norm": 6.490921974182129, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8673282265663147, + "num_tokens": 322305496.0, + "step": 8451 + }, + { + "epoch": 1.0751812746469915, + "ewc_loss": 0.05617048963904381, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024432208738289773, + "grad_norm": 6.526141166687012, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8658632040023804, + "num_tokens": 322347552.0, + "step": 8452 + }, + { + "epoch": 1.075308484925582, + "ewc_loss": 0.05613938719034195, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024401106929872185, + "grad_norm": 6.5049333572387695, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8642988204956055, + "num_tokens": 322386471.0, + "step": 8453 + }, + { + "epoch": 1.0754356952041726, + "ewc_loss": 0.05616484954953194, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002442656841594726, + "grad_norm": 6.540668487548828, + "learning_rate": 1e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.826979398727417, + "num_tokens": 322431019.0, + "step": 8454 + }, + { + "epoch": 1.0755629054827631, + "ewc_loss": 0.056238554418087006, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024500273866578937, + "grad_norm": 6.519625663757324, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.880151093006134, + "num_tokens": 322471525.0, + "step": 8455 + }, + { + "epoch": 1.0756901157613534, + "ewc_loss": 0.056162379682064056, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002442409750074148, + "grad_norm": 6.508115291595459, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8648426532745361, + "num_tokens": 322516594.0, + "step": 8456 + }, + { + "epoch": 1.075817326039944, + "ewc_loss": 0.05622778832912445, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024489505449309945, + "grad_norm": 6.524329662322998, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8681054711341858, + "num_tokens": 322557837.0, + "step": 8457 + }, + { + "epoch": 1.0759445363185345, + "ewc_loss": 0.05621354654431343, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024475264945067465, + "grad_norm": 6.565808296203613, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8566477298736572, + "num_tokens": 322594165.0, + "step": 8458 + }, + { + "epoch": 1.076071746597125, + "ewc_loss": 0.056176938116550446, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024438658147118986, + "grad_norm": 6.531076908111572, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.844115138053894, + "num_tokens": 322637083.0, + "step": 8459 + }, + { + "epoch": 1.0761989568757155, + "ewc_loss": 0.056213926523923874, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002447564620524645, + "grad_norm": 6.549438953399658, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8607673645019531, + "num_tokens": 322673216.0, + "step": 8460 + }, + { + "epoch": 1.076326167154306, + "ewc_loss": 0.055942438542842865, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000244483002461493, + "grad_norm": 6.507633209228516, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8652366399765015, + "num_tokens": 322709150.0, + "step": 8461 + }, + { + "epoch": 1.0764533774328966, + "ewc_loss": 0.056178390979766846, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024440110428258777, + "grad_norm": 6.490403175354004, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8676428198814392, + "num_tokens": 322747893.0, + "step": 8462 + }, + { + "epoch": 1.0765805877114871, + "ewc_loss": 0.05593331903219223, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024439176195301116, + "grad_norm": 6.5770487785339355, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8449006080627441, + "num_tokens": 322781667.0, + "step": 8463 + }, + { + "epoch": 1.0767077979900777, + "ewc_loss": 0.05596328154206276, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002446914149913937, + "grad_norm": 6.525211334228516, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8610537052154541, + "num_tokens": 322818008.0, + "step": 8464 + }, + { + "epoch": 1.0768350082686682, + "ewc_loss": 0.05617788806557655, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024439606931991875, + "grad_norm": 6.5228400230407715, + "learning_rate": 1e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8413052558898926, + "num_tokens": 322856528.0, + "step": 8465 + }, + { + "epoch": 1.0769622185472587, + "ewc_loss": 0.05620826780796051, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024469985510222614, + "grad_norm": 6.569764137268066, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8646081686019897, + "num_tokens": 322888798.0, + "step": 8466 + }, + { + "epoch": 1.077089428825849, + "ewc_loss": 0.05615198612213135, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024413704522885382, + "grad_norm": 6.54349946975708, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8629841804504395, + "num_tokens": 322923301.0, + "step": 8467 + }, + { + "epoch": 1.0772166391044395, + "ewc_loss": 0.05616668611764908, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002442840486764908, + "grad_norm": 6.478602409362793, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8728386759757996, + "num_tokens": 322964253.0, + "step": 8468 + }, + { + "epoch": 1.07734384938303, + "ewc_loss": 0.05610363930463791, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024365355784539133, + "grad_norm": 6.464868068695068, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8591933250427246, + "num_tokens": 323006926.0, + "step": 8469 + }, + { + "epoch": 1.0774710596616206, + "ewc_loss": 0.05623836815357208, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024500087602064013, + "grad_norm": 6.491754055023193, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.874282717704773, + "num_tokens": 323047541.0, + "step": 8470 + }, + { + "epoch": 1.0775982699402111, + "ewc_loss": 0.056182652711868286, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002444437122903764, + "grad_norm": 6.497676849365234, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8534634113311768, + "num_tokens": 323084272.0, + "step": 8471 + }, + { + "epoch": 1.0777254802188017, + "ewc_loss": 0.05622601509094238, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002448773302603513, + "grad_norm": 6.535654544830322, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8730812668800354, + "num_tokens": 323116698.0, + "step": 8472 + }, + { + "epoch": 1.0778526904973922, + "ewc_loss": 0.05619350075721741, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024455218226648867, + "grad_norm": 6.439100742340088, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8684080839157104, + "num_tokens": 323158836.0, + "step": 8473 + }, + { + "epoch": 1.0779799007759827, + "ewc_loss": 0.056307192891836166, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.000245689123403281, + "grad_norm": 6.505879878997803, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.857879102230072, + "num_tokens": 323193777.0, + "step": 8474 + }, + { + "epoch": 1.0781071110545732, + "ewc_loss": 0.0562983863055706, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002456010552123189, + "grad_norm": 6.4672770500183105, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8619566559791565, + "num_tokens": 323232546.0, + "step": 8475 + }, + { + "epoch": 1.0782343213331638, + "ewc_loss": 0.056326769292354584, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002458849048707634, + "grad_norm": 6.5126166343688965, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.861781120300293, + "num_tokens": 323269403.0, + "step": 8476 + }, + { + "epoch": 1.0783615316117543, + "ewc_loss": 0.056278668344020844, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002454038767609745, + "grad_norm": 6.431522846221924, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8744723796844482, + "num_tokens": 323305982.0, + "step": 8477 + }, + { + "epoch": 1.0784887418903448, + "ewc_loss": 0.056152693927288055, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024658555048517883, + "grad_norm": 6.554336071014404, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.856137752532959, + "num_tokens": 323341032.0, + "step": 8478 + }, + { + "epoch": 1.0786159521689354, + "ewc_loss": 0.05630550906062126, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002456722722854465, + "grad_norm": 6.513527870178223, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8563758134841919, + "num_tokens": 323381647.0, + "step": 8479 + }, + { + "epoch": 1.0787431624475257, + "ewc_loss": 0.05620051175355911, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024706372641958296, + "grad_norm": 6.536291599273682, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8662348985671997, + "num_tokens": 323417185.0, + "step": 8480 + }, + { + "epoch": 1.0788703727261162, + "ewc_loss": 0.056353747844696045, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002461546682752669, + "grad_norm": 6.498581886291504, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8625378012657166, + "num_tokens": 323454624.0, + "step": 8481 + }, + { + "epoch": 1.0789975830047067, + "ewc_loss": 0.05644117668271065, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024702894734218717, + "grad_norm": 6.5150861740112305, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8621158599853516, + "num_tokens": 323492862.0, + "step": 8482 + }, + { + "epoch": 1.0791247932832972, + "ewc_loss": 0.05640125274658203, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002466297009959817, + "grad_norm": 6.495821952819824, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8579176664352417, + "num_tokens": 323536012.0, + "step": 8483 + }, + { + "epoch": 1.0792520035618878, + "ewc_loss": 0.05640338733792305, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024665106320753694, + "grad_norm": 6.499157905578613, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8654025793075562, + "num_tokens": 323573495.0, + "step": 8484 + }, + { + "epoch": 1.0793792138404783, + "ewc_loss": 0.056416600942611694, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024678322370164096, + "grad_norm": 6.485366344451904, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.871828019618988, + "num_tokens": 323610900.0, + "step": 8485 + }, + { + "epoch": 1.0795064241190688, + "ewc_loss": 0.05640198290348053, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002466370351612568, + "grad_norm": 6.519889831542969, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8599278330802917, + "num_tokens": 323650983.0, + "step": 8486 + }, + { + "epoch": 1.0796336343976594, + "ewc_loss": 0.056390516459941864, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024652236606925726, + "grad_norm": 6.486479759216309, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8538426160812378, + "num_tokens": 323690155.0, + "step": 8487 + }, + { + "epoch": 1.0797608446762499, + "ewc_loss": 0.056394994258880615, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002465671277604997, + "grad_norm": 6.533107280731201, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8753060102462769, + "num_tokens": 323732212.0, + "step": 8488 + }, + { + "epoch": 1.0798880549548404, + "ewc_loss": 0.0563560351729393, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002461775147821754, + "grad_norm": 6.45591402053833, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8705641031265259, + "num_tokens": 323773400.0, + "step": 8489 + }, + { + "epoch": 1.080015265233431, + "ewc_loss": 0.05641184002161026, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002467355807311833, + "grad_norm": 6.520724773406982, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8683406114578247, + "num_tokens": 323812753.0, + "step": 8490 + }, + { + "epoch": 1.0801424755120215, + "ewc_loss": 0.05630616098642349, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002456788206472993, + "grad_norm": 6.436100482940674, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8641611337661743, + "num_tokens": 323855835.0, + "step": 8491 + }, + { + "epoch": 1.0802696857906118, + "ewc_loss": 0.05625925958156586, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002476512163411826, + "grad_norm": 6.486426830291748, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8752973079681396, + "num_tokens": 323896065.0, + "step": 8492 + }, + { + "epoch": 1.0803968960692023, + "ewc_loss": 0.05641254410147667, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002467426238581538, + "grad_norm": 6.56390905380249, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8650469779968262, + "num_tokens": 323934351.0, + "step": 8493 + }, + { + "epoch": 1.0805241063477928, + "ewc_loss": 0.05639845132827759, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024660167400725186, + "grad_norm": 6.555587291717529, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8515108823776245, + "num_tokens": 323968878.0, + "step": 8494 + }, + { + "epoch": 1.0806513166263834, + "ewc_loss": 0.0564478375017643, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024709556601010263, + "grad_norm": 6.54873514175415, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8611634969711304, + "num_tokens": 324004838.0, + "step": 8495 + }, + { + "epoch": 1.080778526904974, + "ewc_loss": 0.05627832189202309, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002454004134051502, + "grad_norm": 6.472790241241455, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8769166469573975, + "num_tokens": 324040851.0, + "step": 8496 + }, + { + "epoch": 1.0809057371835644, + "ewc_loss": 0.05648505687713623, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024746774579398334, + "grad_norm": 6.565643310546875, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8502873778343201, + "num_tokens": 324084675.0, + "step": 8497 + }, + { + "epoch": 1.081032947462155, + "ewc_loss": 0.05631069839000702, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024572419351898134, + "grad_norm": 6.473669528961182, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8669267296791077, + "num_tokens": 324122136.0, + "step": 8498 + }, + { + "epoch": 1.0811601577407455, + "ewc_loss": 0.05646071955561638, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024722437956370413, + "grad_norm": 6.568284511566162, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8603718280792236, + "num_tokens": 324153142.0, + "step": 8499 + }, + { + "epoch": 1.081287368019336, + "ewc_loss": 0.056287042796611786, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024548760848119855, + "grad_norm": 6.512757778167725, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8433794975280762, + "num_tokens": 324187244.0, + "step": 8500 + }, + { + "epoch": 1.0814145782979265, + "ewc_loss": 0.05638381093740463, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002464552817400545, + "grad_norm": 6.590351581573486, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8645368814468384, + "num_tokens": 324225644.0, + "step": 8501 + }, + { + "epoch": 1.081541788576517, + "ewc_loss": 0.05629146099090576, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024553181719966233, + "grad_norm": 6.429834842681885, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8695778846740723, + "num_tokens": 324270394.0, + "step": 8502 + }, + { + "epoch": 1.0816689988551076, + "ewc_loss": 0.05622445046901703, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024730307632125914, + "grad_norm": 6.52570104598999, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8735083341598511, + "num_tokens": 324311244.0, + "step": 8503 + }, + { + "epoch": 1.0817962091336981, + "ewc_loss": 0.056091662496328354, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024597521405667067, + "grad_norm": 6.495075702667236, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8429739475250244, + "num_tokens": 324350821.0, + "step": 8504 + }, + { + "epoch": 1.0819234194122884, + "ewc_loss": 0.05640918016433716, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024670897983014584, + "grad_norm": 6.484984874725342, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8613518476486206, + "num_tokens": 324388255.0, + "step": 8505 + }, + { + "epoch": 1.082050629690879, + "ewc_loss": 0.05619392544031143, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002469978353474289, + "grad_norm": 6.569059371948242, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.836711049079895, + "num_tokens": 324430068.0, + "step": 8506 + }, + { + "epoch": 1.0821778399694695, + "ewc_loss": 0.05610187351703644, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024607733939774334, + "grad_norm": 6.490979194641113, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8678656816482544, + "num_tokens": 324465341.0, + "step": 8507 + }, + { + "epoch": 1.08230505024806, + "ewc_loss": 0.05621524155139923, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024721099180169404, + "grad_norm": 6.549123764038086, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8692479729652405, + "num_tokens": 324502199.0, + "step": 8508 + }, + { + "epoch": 1.0824322605266505, + "ewc_loss": 0.05640001967549324, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002466173900756985, + "grad_norm": 6.528756618499756, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.863288164138794, + "num_tokens": 324535898.0, + "step": 8509 + }, + { + "epoch": 1.082559470805241, + "ewc_loss": 0.05640507861971855, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002466679725330323, + "grad_norm": 6.54543924331665, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8520323038101196, + "num_tokens": 324574404.0, + "step": 8510 + }, + { + "epoch": 1.0826866810838316, + "ewc_loss": 0.056379370391368866, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002464108692947775, + "grad_norm": 6.493891716003418, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8607696294784546, + "num_tokens": 324611743.0, + "step": 8511 + }, + { + "epoch": 1.0828138913624221, + "ewc_loss": 0.05645937845110893, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002472109626978636, + "grad_norm": 6.4850993156433105, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8519688844680786, + "num_tokens": 324653126.0, + "step": 8512 + }, + { + "epoch": 1.0829411016410126, + "ewc_loss": 0.05651530623435974, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024777025100775063, + "grad_norm": 6.507887363433838, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8685314655303955, + "num_tokens": 324692957.0, + "step": 8513 + }, + { + "epoch": 1.0830683119196032, + "ewc_loss": 0.056464508175849915, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024726224364712834, + "grad_norm": 6.467412948608398, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8576453924179077, + "num_tokens": 324737214.0, + "step": 8514 + }, + { + "epoch": 1.0831955221981937, + "ewc_loss": 0.05659408122301102, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024855800438672304, + "grad_norm": 6.528379440307617, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8727606534957886, + "num_tokens": 324773098.0, + "step": 8515 + }, + { + "epoch": 1.083322732476784, + "ewc_loss": 0.05644211173057556, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002470382896717638, + "grad_norm": 6.471803188323975, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8506722450256348, + "num_tokens": 324815846.0, + "step": 8516 + }, + { + "epoch": 1.0834499427553745, + "ewc_loss": 0.0565694123506546, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024831132031977177, + "grad_norm": 6.4948039054870605, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8587155342102051, + "num_tokens": 324855912.0, + "step": 8517 + }, + { + "epoch": 1.083577153033965, + "ewc_loss": 0.05651352182030678, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024775241035968065, + "grad_norm": 6.589903831481934, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8556783199310303, + "num_tokens": 324893090.0, + "step": 8518 + }, + { + "epoch": 1.0837043633125556, + "ewc_loss": 0.056494973599910736, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002475669316481799, + "grad_norm": 6.52825403213501, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8627315759658813, + "num_tokens": 324929779.0, + "step": 8519 + }, + { + "epoch": 1.0838315735911461, + "ewc_loss": 0.056577168405056, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024838888202793896, + "grad_norm": 6.59045934677124, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8674303293228149, + "num_tokens": 324961349.0, + "step": 8520 + }, + { + "epoch": 1.0839587838697367, + "ewc_loss": 0.05627760291099548, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024783459957689047, + "grad_norm": 6.51809549331665, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8618650436401367, + "num_tokens": 324998688.0, + "step": 8521 + }, + { + "epoch": 1.0840859941483272, + "ewc_loss": 0.05630385875701904, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002480971743352711, + "grad_norm": 6.565093517303467, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8541404604911804, + "num_tokens": 325040601.0, + "step": 8522 + }, + { + "epoch": 1.0842132044269177, + "ewc_loss": 0.05620487034320831, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002471072948537767, + "grad_norm": 6.507433891296387, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8654046058654785, + "num_tokens": 325081729.0, + "step": 8523 + }, + { + "epoch": 1.0843404147055082, + "ewc_loss": 0.05628669261932373, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002479255199432373, + "grad_norm": 6.611187934875488, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8511545658111572, + "num_tokens": 325118886.0, + "step": 8524 + }, + { + "epoch": 1.0844676249840988, + "ewc_loss": 0.056109003722667694, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024614864378236234, + "grad_norm": 6.464698314666748, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.879697859287262, + "num_tokens": 325156417.0, + "step": 8525 + }, + { + "epoch": 1.0845948352626893, + "ewc_loss": 0.05661733075976372, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024879048578441143, + "grad_norm": 6.5628132820129395, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8733727335929871, + "num_tokens": 325200902.0, + "step": 8526 + }, + { + "epoch": 1.0847220455412798, + "ewc_loss": 0.056144870817661285, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024650729028508067, + "grad_norm": 6.4830002784729, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8636999130249023, + "num_tokens": 325233362.0, + "step": 8527 + }, + { + "epoch": 1.0848492558198704, + "ewc_loss": 0.056329358369112015, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.000248352182097733, + "grad_norm": 6.559812545776367, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8611429333686829, + "num_tokens": 325274646.0, + "step": 8528 + }, + { + "epoch": 1.0849764660984607, + "ewc_loss": 0.05614997819066048, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024655836750753224, + "grad_norm": 6.528841495513916, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8771519064903259, + "num_tokens": 325304735.0, + "step": 8529 + }, + { + "epoch": 1.0851036763770512, + "ewc_loss": 0.05650855973362923, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002477027883287519, + "grad_norm": 6.574882507324219, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8648033142089844, + "num_tokens": 325339833.0, + "step": 8530 + }, + { + "epoch": 1.0852308866556417, + "ewc_loss": 0.05642528831958771, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024687006953172386, + "grad_norm": 6.523641109466553, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8500571250915527, + "num_tokens": 325373838.0, + "step": 8531 + }, + { + "epoch": 1.0853580969342322, + "ewc_loss": 0.0565774068236351, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024839123943820596, + "grad_norm": 6.547551155090332, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8827081918716431, + "num_tokens": 325412913.0, + "step": 8532 + }, + { + "epoch": 1.0854853072128228, + "ewc_loss": 0.056455135345458984, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024716852931305766, + "grad_norm": 6.500580787658691, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8674890995025635, + "num_tokens": 325451830.0, + "step": 8533 + }, + { + "epoch": 1.0856125174914133, + "ewc_loss": 0.05649179592728615, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024753515026532114, + "grad_norm": 6.577359676361084, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8449022769927979, + "num_tokens": 325487162.0, + "step": 8534 + }, + { + "epoch": 1.0857397277700038, + "ewc_loss": 0.056499749422073364, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002476146910339594, + "grad_norm": 6.530214786529541, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8722343444824219, + "num_tokens": 325525906.0, + "step": 8535 + }, + { + "epoch": 1.0858669380485944, + "ewc_loss": 0.05649232119321823, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002475404180586338, + "grad_norm": 6.524086952209473, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8745814561843872, + "num_tokens": 325559925.0, + "step": 8536 + }, + { + "epoch": 1.0859941483271849, + "ewc_loss": 0.056577183306217194, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002483889984432608, + "grad_norm": 6.565431594848633, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8610535860061646, + "num_tokens": 325599980.0, + "step": 8537 + }, + { + "epoch": 1.0861213586057754, + "ewc_loss": 0.05646545812487602, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002472717605996877, + "grad_norm": 6.5704193115234375, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.845246434211731, + "num_tokens": 325631745.0, + "step": 8538 + }, + { + "epoch": 1.086248568884366, + "ewc_loss": 0.05659346282482147, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024855180527083576, + "grad_norm": 6.520071506500244, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8586896657943726, + "num_tokens": 325673583.0, + "step": 8539 + }, + { + "epoch": 1.0863757791629565, + "ewc_loss": 0.056503795087337494, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002476551162544638, + "grad_norm": 6.536065101623535, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8541562557220459, + "num_tokens": 325712995.0, + "step": 8540 + }, + { + "epoch": 1.0865029894415468, + "ewc_loss": 0.056479353457689285, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002474107313901186, + "grad_norm": 6.509322166442871, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.867595911026001, + "num_tokens": 325748259.0, + "step": 8541 + }, + { + "epoch": 1.0866301997201373, + "ewc_loss": 0.0563054084777832, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024811268667690456, + "grad_norm": 6.51124382019043, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8675991296768188, + "num_tokens": 325787676.0, + "step": 8542 + }, + { + "epoch": 1.0867574099987278, + "ewc_loss": 0.05647305026650429, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002473476924933493, + "grad_norm": 6.487697601318359, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8373538851737976, + "num_tokens": 325827819.0, + "step": 8543 + }, + { + "epoch": 1.0868846202773184, + "ewc_loss": 0.05654655024409294, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024808268062770367, + "grad_norm": 6.497373104095459, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8711124658584595, + "num_tokens": 325866328.0, + "step": 8544 + }, + { + "epoch": 1.0870118305559089, + "ewc_loss": 0.056317806243896484, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002482366398908198, + "grad_norm": 6.518365859985352, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8569388389587402, + "num_tokens": 325906752.0, + "step": 8545 + }, + { + "epoch": 1.0871390408344994, + "ewc_loss": 0.056531600654125214, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002479331742506474, + "grad_norm": 6.5138773918151855, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8562664985656738, + "num_tokens": 325943765.0, + "step": 8546 + }, + { + "epoch": 1.08726625111309, + "ewc_loss": 0.05649815872311592, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024759877123869956, + "grad_norm": 6.485056400299072, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8684541583061218, + "num_tokens": 325981471.0, + "step": 8547 + }, + { + "epoch": 1.0873934613916805, + "ewc_loss": 0.05655229091644287, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002481401024851948, + "grad_norm": 6.515731334686279, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8673616647720337, + "num_tokens": 326017253.0, + "step": 8548 + }, + { + "epoch": 1.087520671670271, + "ewc_loss": 0.05623895674943924, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002474481880199164, + "grad_norm": 6.462601184844971, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8628484010696411, + "num_tokens": 326053484.0, + "step": 8549 + }, + { + "epoch": 1.0876478819488615, + "ewc_loss": 0.05659353733062744, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024855256197042763, + "grad_norm": 6.591750144958496, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8681299686431885, + "num_tokens": 326084779.0, + "step": 8550 + }, + { + "epoch": 1.087775092227452, + "ewc_loss": 0.056500665843486786, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024762385874055326, + "grad_norm": 6.455863952636719, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8600536584854126, + "num_tokens": 326120795.0, + "step": 8551 + }, + { + "epoch": 1.0879023025060426, + "ewc_loss": 0.056415073573589325, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002492093190085143, + "grad_norm": 6.557028770446777, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8597487211227417, + "num_tokens": 326155991.0, + "step": 8552 + }, + { + "epoch": 1.0880295127846331, + "ewc_loss": 0.05625336617231369, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002475922810845077, + "grad_norm": 6.432297706604004, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8583236336708069, + "num_tokens": 326195738.0, + "step": 8553 + }, + { + "epoch": 1.0881567230632234, + "ewc_loss": 0.05642910301685333, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.00024934959947131574, + "grad_norm": 6.524265766143799, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8623054027557373, + "num_tokens": 326237773.0, + "step": 8554 + }, + { + "epoch": 1.088283933341814, + "ewc_loss": 0.05634862929582596, + "ewc_loss_diag": 3.147125244140625e-05, + "ewc_loss_parallel": 0.0002485449076630175, + "grad_norm": 6.472344875335693, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8758975267410278, + "num_tokens": 326278589.0, + "step": 8555 + }, + { + "epoch": 1.0884111436204045, + "ewc_loss": 0.05684274435043335, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002486032317392528, + "grad_norm": 6.534859657287598, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8548431396484375, + "num_tokens": 326316862.0, + "step": 8556 + }, + { + "epoch": 1.088538353898995, + "ewc_loss": 0.05676719546318054, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024784772540442646, + "grad_norm": 6.501323699951172, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8584144115447998, + "num_tokens": 326353892.0, + "step": 8557 + }, + { + "epoch": 1.0886655641775855, + "ewc_loss": 0.05682944506406784, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000248470256337896, + "grad_norm": 6.516849517822266, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8668282628059387, + "num_tokens": 326389478.0, + "step": 8558 + }, + { + "epoch": 1.088792774456176, + "ewc_loss": 0.05683600902557373, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002485358854755759, + "grad_norm": 6.495207786560059, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.857298731803894, + "num_tokens": 326428072.0, + "step": 8559 + }, + { + "epoch": 1.0889199847347666, + "ewc_loss": 0.05685146898031235, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002486904850229621, + "grad_norm": 6.52005672454834, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8658130764961243, + "num_tokens": 326466374.0, + "step": 8560 + }, + { + "epoch": 1.0890471950133571, + "ewc_loss": 0.056913822889328, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024931400548666716, + "grad_norm": 6.550923824310303, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8579601049423218, + "num_tokens": 326501081.0, + "step": 8561 + }, + { + "epoch": 1.0891744052919476, + "ewc_loss": 0.05683786794543266, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002485544537194073, + "grad_norm": 6.545228481292725, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8710148334503174, + "num_tokens": 326532539.0, + "step": 8562 + }, + { + "epoch": 1.0893016155705382, + "ewc_loss": 0.05684451013803482, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024862089776434004, + "grad_norm": 6.486143112182617, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8668770790100098, + "num_tokens": 326569481.0, + "step": 8563 + }, + { + "epoch": 1.0894288258491287, + "ewc_loss": 0.05683869123458862, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024856271920725703, + "grad_norm": 6.522505283355713, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8602765798568726, + "num_tokens": 326609746.0, + "step": 8564 + }, + { + "epoch": 1.089556036127719, + "ewc_loss": 0.0568619966506958, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002487957535777241, + "grad_norm": 6.545225143432617, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.851371169090271, + "num_tokens": 326652379.0, + "step": 8565 + }, + { + "epoch": 1.0896832464063095, + "ewc_loss": 0.056759435683488846, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002477701345924288, + "grad_norm": 6.538471698760986, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8576555252075195, + "num_tokens": 326687617.0, + "step": 8566 + }, + { + "epoch": 1.0898104566849, + "ewc_loss": 0.05680312216281891, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002482069830875844, + "grad_norm": 6.507775783538818, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8536765575408936, + "num_tokens": 326724155.0, + "step": 8567 + }, + { + "epoch": 1.0899376669634906, + "ewc_loss": 0.05690564215183258, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002492321946192533, + "grad_norm": 6.5102949142456055, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8607296943664551, + "num_tokens": 326768234.0, + "step": 8568 + }, + { + "epoch": 1.0900648772420811, + "ewc_loss": 0.05675020068883896, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002476777881383896, + "grad_norm": 6.487157344818115, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8670282363891602, + "num_tokens": 326810423.0, + "step": 8569 + }, + { + "epoch": 1.0901920875206716, + "ewc_loss": 0.056898169219493866, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024915748508647084, + "grad_norm": 6.572244167327881, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.85966956615448, + "num_tokens": 326851834.0, + "step": 8570 + }, + { + "epoch": 1.0903192977992622, + "ewc_loss": 0.056514590978622437, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002477630914654583, + "grad_norm": 6.531962871551514, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8486406803131104, + "num_tokens": 326889334.0, + "step": 8571 + }, + { + "epoch": 1.0904465080778527, + "ewc_loss": 0.05662277713418007, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024884496815502644, + "grad_norm": 6.581338405609131, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8503047823905945, + "num_tokens": 326931884.0, + "step": 8572 + }, + { + "epoch": 1.0905737183564432, + "ewc_loss": 0.05652278661727905, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002478450769558549, + "grad_norm": 6.549453258514404, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8691891431808472, + "num_tokens": 326969043.0, + "step": 8573 + }, + { + "epoch": 1.0907009286350338, + "ewc_loss": 0.05642807483673096, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024689792189747095, + "grad_norm": 6.525968551635742, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8626463413238525, + "num_tokens": 326998369.0, + "step": 8574 + }, + { + "epoch": 1.0908281389136243, + "ewc_loss": 0.05651945620775223, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002478117239661515, + "grad_norm": 6.509620189666748, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8616237640380859, + "num_tokens": 327036909.0, + "step": 8575 + }, + { + "epoch": 1.0909553491922148, + "ewc_loss": 0.056488122791051865, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024749842123128474, + "grad_norm": 6.5237531661987305, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8535809516906738, + "num_tokens": 327076053.0, + "step": 8576 + }, + { + "epoch": 1.0910825594708053, + "ewc_loss": 0.05655374750494957, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024815465440042317, + "grad_norm": 6.524862289428711, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8694952130317688, + "num_tokens": 327112285.0, + "step": 8577 + }, + { + "epoch": 1.0912097697493957, + "ewc_loss": 0.056757356971502304, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002477493544574827, + "grad_norm": 6.596482276916504, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8671751618385315, + "num_tokens": 327151591.0, + "step": 8578 + }, + { + "epoch": 1.0913369800279862, + "ewc_loss": 0.056502752006053925, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024764472618699074, + "grad_norm": 6.576589107513428, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8465366363525391, + "num_tokens": 327193600.0, + "step": 8579 + }, + { + "epoch": 1.0914641903065767, + "ewc_loss": 0.05669049173593521, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024708069395273924, + "grad_norm": 6.512443542480469, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8524714708328247, + "num_tokens": 327230249.0, + "step": 8580 + }, + { + "epoch": 1.0915914005851672, + "ewc_loss": 0.056777894496917725, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024795474018901587, + "grad_norm": 6.561375617980957, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8761463761329651, + "num_tokens": 327270043.0, + "step": 8581 + }, + { + "epoch": 1.0917186108637578, + "ewc_loss": 0.05671568587422371, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002473326458130032, + "grad_norm": 6.58173131942749, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8635363578796387, + "num_tokens": 327309396.0, + "step": 8582 + }, + { + "epoch": 1.0918458211423483, + "ewc_loss": 0.056711431592702866, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024729009601287544, + "grad_norm": 6.533298492431641, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8686554431915283, + "num_tokens": 327349497.0, + "step": 8583 + }, + { + "epoch": 1.0919730314209388, + "ewc_loss": 0.05669821798801422, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024715796462260187, + "grad_norm": 6.650191783905029, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8540773391723633, + "num_tokens": 327385648.0, + "step": 8584 + }, + { + "epoch": 1.0921002416995294, + "ewc_loss": 0.056672602891922, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002469017927069217, + "grad_norm": 6.531332015991211, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8630713224411011, + "num_tokens": 327426379.0, + "step": 8585 + }, + { + "epoch": 1.0922274519781199, + "ewc_loss": 0.05664026737213135, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024657847825437784, + "grad_norm": 6.49291467666626, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8670469522476196, + "num_tokens": 327467555.0, + "step": 8586 + }, + { + "epoch": 1.0923546622567104, + "ewc_loss": 0.05644814670085907, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002470986801199615, + "grad_norm": 6.598437786102295, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8583771586418152, + "num_tokens": 327505303.0, + "step": 8587 + }, + { + "epoch": 1.092481872535301, + "ewc_loss": 0.05662517249584198, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002464274875819683, + "grad_norm": 6.521259307861328, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8682905435562134, + "num_tokens": 327547854.0, + "step": 8588 + }, + { + "epoch": 1.0926090828138915, + "ewc_loss": 0.05646403506398201, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024725752882659435, + "grad_norm": 6.569587230682373, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8650080561637878, + "num_tokens": 327584132.0, + "step": 8589 + }, + { + "epoch": 1.0927362930924818, + "ewc_loss": 0.056364357471466064, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024626078084111214, + "grad_norm": 6.486087322235107, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8495355844497681, + "num_tokens": 327629250.0, + "step": 8590 + }, + { + "epoch": 1.0928635033710723, + "ewc_loss": 0.05651998519897461, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002478170208632946, + "grad_norm": 6.606127738952637, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8539180755615234, + "num_tokens": 327666988.0, + "step": 8591 + }, + { + "epoch": 1.0929907136496628, + "ewc_loss": 0.05638779327273369, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024649512488394976, + "grad_norm": 6.479262351989746, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.864149808883667, + "num_tokens": 327705955.0, + "step": 8592 + }, + { + "epoch": 1.0931179239282534, + "ewc_loss": 0.05660756677389145, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024869287153705955, + "grad_norm": 6.639674663543701, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8532127141952515, + "num_tokens": 327746362.0, + "step": 8593 + }, + { + "epoch": 1.0932451342068439, + "ewc_loss": 0.056449078023433685, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024710799334570765, + "grad_norm": 6.4817986488342285, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8614838719367981, + "num_tokens": 327789034.0, + "step": 8594 + }, + { + "epoch": 1.0933723444854344, + "ewc_loss": 0.05655622482299805, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024817942176014185, + "grad_norm": 6.640519142150879, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8514441251754761, + "num_tokens": 327824203.0, + "step": 8595 + }, + { + "epoch": 1.093499554764025, + "ewc_loss": 0.056472811847925186, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024734530597925186, + "grad_norm": 6.5798139572143555, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8625508546829224, + "num_tokens": 327860340.0, + "step": 8596 + }, + { + "epoch": 1.0936267650426155, + "ewc_loss": 0.05650841072201729, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024770130403339863, + "grad_norm": 6.6212158203125, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.85387122631073, + "num_tokens": 327901709.0, + "step": 8597 + }, + { + "epoch": 1.093753975321206, + "ewc_loss": 0.05637727305293083, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024638991453684866, + "grad_norm": 6.556362152099609, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8632814884185791, + "num_tokens": 327940327.0, + "step": 8598 + }, + { + "epoch": 1.0938811855997965, + "ewc_loss": 0.056384071707725525, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002464579010847956, + "grad_norm": 6.544850826263428, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.866401731967926, + "num_tokens": 327977966.0, + "step": 8599 + }, + { + "epoch": 1.094008395878387, + "ewc_loss": 0.05643715709447861, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002469887840561569, + "grad_norm": 6.524799823760986, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8575375080108643, + "num_tokens": 328021143.0, + "step": 8600 + }, + { + "epoch": 1.0941356061569776, + "ewc_loss": 0.05637861043214798, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002464032731950283, + "grad_norm": 6.592094898223877, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8676048517227173, + "num_tokens": 328060792.0, + "step": 8601 + }, + { + "epoch": 1.094262816435568, + "ewc_loss": 0.05644439533352852, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002470611361786723, + "grad_norm": 6.5383710861206055, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8618283271789551, + "num_tokens": 328099698.0, + "step": 8602 + }, + { + "epoch": 1.0943900267141584, + "ewc_loss": 0.056453999131917953, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024715717881917953, + "grad_norm": 6.548261642456055, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8646817207336426, + "num_tokens": 328141075.0, + "step": 8603 + }, + { + "epoch": 1.094517236992749, + "ewc_loss": 0.056446537375450134, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002470825857017189, + "grad_norm": 6.6285400390625, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8541190028190613, + "num_tokens": 328178945.0, + "step": 8604 + }, + { + "epoch": 1.0946444472713395, + "ewc_loss": 0.056509144604206085, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002477086381986737, + "grad_norm": 6.584247589111328, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8348840475082397, + "num_tokens": 328214423.0, + "step": 8605 + }, + { + "epoch": 1.09477165754993, + "ewc_loss": 0.056498318910598755, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002476003719493747, + "grad_norm": 6.617794990539551, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8614416122436523, + "num_tokens": 328248640.0, + "step": 8606 + }, + { + "epoch": 1.0948988678285205, + "ewc_loss": 0.056419387459754944, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024681107606738806, + "grad_norm": 6.579636573791504, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8396502733230591, + "num_tokens": 328283016.0, + "step": 8607 + }, + { + "epoch": 1.095026078107111, + "ewc_loss": 0.056573137640953064, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024834854411892593, + "grad_norm": 6.579667091369629, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8655601739883423, + "num_tokens": 328322377.0, + "step": 8608 + }, + { + "epoch": 1.0951532883857016, + "ewc_loss": 0.05657040327787399, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024832121562212706, + "grad_norm": 6.630581378936768, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8637639284133911, + "num_tokens": 328359409.0, + "step": 8609 + }, + { + "epoch": 1.0952804986642921, + "ewc_loss": 0.056494519114494324, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002475623623467982, + "grad_norm": 6.576291561126709, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8660218119621277, + "num_tokens": 328401257.0, + "step": 8610 + }, + { + "epoch": 1.0954077089428826, + "ewc_loss": 0.05654244124889374, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002480416151229292, + "grad_norm": 6.545640468597412, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8636536002159119, + "num_tokens": 328435886.0, + "step": 8611 + }, + { + "epoch": 1.0955349192214732, + "ewc_loss": 0.05657476559281349, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002483648422639817, + "grad_norm": 6.557274341583252, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.851375937461853, + "num_tokens": 328472091.0, + "step": 8612 + }, + { + "epoch": 1.0956621295000637, + "ewc_loss": 0.05687170848250389, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002488928730599582, + "grad_norm": 6.622583866119385, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8533209562301636, + "num_tokens": 328509453.0, + "step": 8613 + }, + { + "epoch": 1.095789339778654, + "ewc_loss": 0.05681223049759865, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024829807807691395, + "grad_norm": 6.535195350646973, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8615471720695496, + "num_tokens": 328545590.0, + "step": 8614 + }, + { + "epoch": 1.0959165500572445, + "ewc_loss": 0.05687641352415085, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002489399048499763, + "grad_norm": 6.555551528930664, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8508368134498596, + "num_tokens": 328583093.0, + "step": 8615 + }, + { + "epoch": 1.096043760335835, + "ewc_loss": 0.05686767399311066, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002488525351509452, + "grad_norm": 6.577073097229004, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8444595336914062, + "num_tokens": 328625312.0, + "step": 8616 + }, + { + "epoch": 1.0961709706144256, + "ewc_loss": 0.05675456300377846, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024772141478024423, + "grad_norm": 6.588682651519775, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8543105125427246, + "num_tokens": 328655392.0, + "step": 8617 + }, + { + "epoch": 1.0962981808930161, + "ewc_loss": 0.05684632807970047, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002486390876583755, + "grad_norm": 6.531664848327637, + "learning_rate": 1e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8460444808006287, + "num_tokens": 328693686.0, + "step": 8618 + }, + { + "epoch": 1.0964253911716066, + "ewc_loss": 0.056841786950826645, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024859365657903254, + "grad_norm": 6.544947624206543, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8714224100112915, + "num_tokens": 328728051.0, + "step": 8619 + }, + { + "epoch": 1.0965526014501972, + "ewc_loss": 0.056908685714006424, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000249262637225911, + "grad_norm": 6.5499958992004395, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.850414514541626, + "num_tokens": 328768091.0, + "step": 8620 + }, + { + "epoch": 1.0966798117287877, + "ewc_loss": 0.05688001960515976, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002489759644959122, + "grad_norm": 6.53713846206665, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8570693731307983, + "num_tokens": 328803851.0, + "step": 8621 + }, + { + "epoch": 1.0968070220073782, + "ewc_loss": 0.05703942850232124, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025057006860151887, + "grad_norm": 6.587459564208984, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8711704015731812, + "num_tokens": 328837429.0, + "step": 8622 + }, + { + "epoch": 1.0969342322859688, + "ewc_loss": 0.05685862898826599, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002487620513420552, + "grad_norm": 6.538667678833008, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8595532178878784, + "num_tokens": 328874264.0, + "step": 8623 + }, + { + "epoch": 1.0970614425645593, + "ewc_loss": 0.056994639337062836, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002501221897546202, + "grad_norm": 6.586174488067627, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8628665208816528, + "num_tokens": 328911627.0, + "step": 8624 + }, + { + "epoch": 1.0971886528431498, + "ewc_loss": 0.05689777806401253, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002491535560693592, + "grad_norm": 6.581028461456299, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.85355544090271, + "num_tokens": 328944471.0, + "step": 8625 + }, + { + "epoch": 1.0973158631217403, + "ewc_loss": 0.05694969743490219, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002496727684047073, + "grad_norm": 6.619449615478516, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8683028221130371, + "num_tokens": 328983826.0, + "step": 8626 + }, + { + "epoch": 1.0974430734003306, + "ewc_loss": 0.05686796456575394, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002488554164301604, + "grad_norm": 6.550878047943115, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8619174361228943, + "num_tokens": 329019385.0, + "step": 8627 + }, + { + "epoch": 1.0975702836789212, + "ewc_loss": 0.05693681165575981, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002495438966434449, + "grad_norm": 6.529550552368164, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8490463495254517, + "num_tokens": 329065179.0, + "step": 8628 + }, + { + "epoch": 1.0976974939575117, + "ewc_loss": 0.056888267397880554, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002490584447514266, + "grad_norm": 6.5941009521484375, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8682503700256348, + "num_tokens": 329101077.0, + "step": 8629 + }, + { + "epoch": 1.0978247042361022, + "ewc_loss": 0.05691734328866005, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002493492211215198, + "grad_norm": 6.558382511138916, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8656653761863708, + "num_tokens": 329137538.0, + "step": 8630 + }, + { + "epoch": 1.0979519145146928, + "ewc_loss": 0.05687824636697769, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024895824026316404, + "grad_norm": 6.593803882598877, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.875072717666626, + "num_tokens": 329171952.0, + "step": 8631 + }, + { + "epoch": 1.0980791247932833, + "ewc_loss": 0.056933723390102386, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024951298837549984, + "grad_norm": 6.523365497589111, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8695949912071228, + "num_tokens": 329214956.0, + "step": 8632 + }, + { + "epoch": 1.0982063350718738, + "ewc_loss": 0.056822311133146286, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002483988937456161, + "grad_norm": 6.543776988983154, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8513982892036438, + "num_tokens": 329254228.0, + "step": 8633 + }, + { + "epoch": 1.0983335453504643, + "ewc_loss": 0.05687771365046501, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024895291426219046, + "grad_norm": 6.634751796722412, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8624387979507446, + "num_tokens": 329288337.0, + "step": 8634 + }, + { + "epoch": 1.0984607556290549, + "ewc_loss": 0.05695255845785141, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002497013483662158, + "grad_norm": 6.55621337890625, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8545222282409668, + "num_tokens": 329329118.0, + "step": 8635 + }, + { + "epoch": 1.0985879659076454, + "ewc_loss": 0.05687670037150383, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002489427861291915, + "grad_norm": 6.568809509277344, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8468091487884521, + "num_tokens": 329366301.0, + "step": 8636 + }, + { + "epoch": 1.098715176186236, + "ewc_loss": 0.05689085274934769, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024908428895287216, + "grad_norm": 6.6446356773376465, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8490579128265381, + "num_tokens": 329401612.0, + "step": 8637 + }, + { + "epoch": 1.0988423864648265, + "ewc_loss": 0.056832827627658844, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002485040749888867, + "grad_norm": 6.4930100440979, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8629831075668335, + "num_tokens": 329440403.0, + "step": 8638 + }, + { + "epoch": 1.0989695967434168, + "ewc_loss": 0.05692526325583458, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002494284126441926, + "grad_norm": 6.551214694976807, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8709575533866882, + "num_tokens": 329478699.0, + "step": 8639 + }, + { + "epoch": 1.0990968070220073, + "ewc_loss": 0.05691366642713547, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024931246298365295, + "grad_norm": 6.548736572265625, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8597265481948853, + "num_tokens": 329517860.0, + "step": 8640 + }, + { + "epoch": 1.0992240173005978, + "ewc_loss": 0.056894104927778244, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002491168270353228, + "grad_norm": 6.571257591247559, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8664963245391846, + "num_tokens": 329555385.0, + "step": 8641 + }, + { + "epoch": 1.0993512275791884, + "ewc_loss": 0.0569147951900959, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002493237261660397, + "grad_norm": 6.58151388168335, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8571922779083252, + "num_tokens": 329594775.0, + "step": 8642 + }, + { + "epoch": 1.0994784378577789, + "ewc_loss": 0.056864380836486816, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024881958961486816, + "grad_norm": 6.551379203796387, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8653737306594849, + "num_tokens": 329629872.0, + "step": 8643 + }, + { + "epoch": 1.0996056481363694, + "ewc_loss": 0.056908443570137024, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024926019250415266, + "grad_norm": 6.599828243255615, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8557188510894775, + "num_tokens": 329665263.0, + "step": 8644 + }, + { + "epoch": 1.09973285841496, + "ewc_loss": 0.05683466047048569, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000248522381298244, + "grad_norm": 6.552011489868164, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8513864278793335, + "num_tokens": 329705853.0, + "step": 8645 + }, + { + "epoch": 1.0998600686935505, + "ewc_loss": 0.05691090598702431, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002492848434485495, + "grad_norm": 6.632600784301758, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8687309622764587, + "num_tokens": 329741690.0, + "step": 8646 + }, + { + "epoch": 1.099987278972141, + "ewc_loss": 0.056856878101825714, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024874458904378116, + "grad_norm": 6.456387042999268, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8543814420700073, + "num_tokens": 329781548.0, + "step": 8647 + }, + { + "epoch": 1.1001144892507315, + "ewc_loss": 0.056993018835783005, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002501059789210558, + "grad_norm": 6.552650451660156, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.86757493019104, + "num_tokens": 329822304.0, + "step": 8648 + }, + { + "epoch": 1.100241699529322, + "ewc_loss": 0.05684656649827957, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002486414450686425, + "grad_norm": 6.559948921203613, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8625496625900269, + "num_tokens": 329856065.0, + "step": 8649 + }, + { + "epoch": 1.1003689098079126, + "ewc_loss": 0.05697360262274742, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024991179816424847, + "grad_norm": 6.539844989776611, + "learning_rate": 1e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8469570279121399, + "num_tokens": 329895002.0, + "step": 8650 + }, + { + "epoch": 1.100496120086503, + "ewc_loss": 0.05691011995077133, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024927695631049573, + "grad_norm": 6.514183521270752, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8559888601303101, + "num_tokens": 329938810.0, + "step": 8651 + }, + { + "epoch": 1.1006233303650934, + "ewc_loss": 0.05706912279129028, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002508670149836689, + "grad_norm": 6.633098602294922, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8733305931091309, + "num_tokens": 329976513.0, + "step": 8652 + }, + { + "epoch": 1.100750540643684, + "ewc_loss": 0.05694383382797241, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000249614124186337, + "grad_norm": 6.538755893707275, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8686956763267517, + "num_tokens": 330014013.0, + "step": 8653 + }, + { + "epoch": 1.1008777509222745, + "ewc_loss": 0.05695915222167969, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002497672976460308, + "grad_norm": 6.596943378448486, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.85677170753479, + "num_tokens": 330052577.0, + "step": 8654 + }, + { + "epoch": 1.101004961200865, + "ewc_loss": 0.05697701871395111, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024994599516503513, + "grad_norm": 6.55188512802124, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8594017624855042, + "num_tokens": 330091385.0, + "step": 8655 + }, + { + "epoch": 1.1011321714794555, + "ewc_loss": 0.056939441710710526, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002495702065061778, + "grad_norm": 6.545865058898926, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8695913553237915, + "num_tokens": 330130285.0, + "step": 8656 + }, + { + "epoch": 1.101259381758046, + "ewc_loss": 0.05695345997810364, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024971039965748787, + "grad_norm": 6.591534614562988, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8479714393615723, + "num_tokens": 330166823.0, + "step": 8657 + }, + { + "epoch": 1.1013865920366366, + "ewc_loss": 0.05695277452468872, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002497035020496696, + "grad_norm": 6.707700729370117, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8634098172187805, + "num_tokens": 330200655.0, + "step": 8658 + }, + { + "epoch": 1.101513802315227, + "ewc_loss": 0.05682981014251709, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002484738943167031, + "grad_norm": 6.515491962432861, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8606826663017273, + "num_tokens": 330243656.0, + "step": 8659 + }, + { + "epoch": 1.1016410125938176, + "ewc_loss": 0.05691199004650116, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002492956700734794, + "grad_norm": 6.635258674621582, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8540812134742737, + "num_tokens": 330287200.0, + "step": 8660 + }, + { + "epoch": 1.1017682228724082, + "ewc_loss": 0.05673372000455856, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002475129731465131, + "grad_norm": 6.528408527374268, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8633676767349243, + "num_tokens": 330324794.0, + "step": 8661 + }, + { + "epoch": 1.1018954331509987, + "ewc_loss": 0.05693291500210762, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024950492661446333, + "grad_norm": 6.624654293060303, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.862631618976593, + "num_tokens": 330365297.0, + "step": 8662 + }, + { + "epoch": 1.102022643429589, + "ewc_loss": 0.057040125131607056, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002481356495991349, + "grad_norm": 6.5990447998046875, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8786848783493042, + "num_tokens": 330408046.0, + "step": 8663 + }, + { + "epoch": 1.1021498537081795, + "ewc_loss": 0.056826867163181305, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002484444703441113, + "grad_norm": 6.601527214050293, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8590390682220459, + "num_tokens": 330447815.0, + "step": 8664 + }, + { + "epoch": 1.10227706398677, + "ewc_loss": 0.05696965008974075, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00024743087124079466, + "grad_norm": 6.583128929138184, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8506181836128235, + "num_tokens": 330487796.0, + "step": 8665 + }, + { + "epoch": 1.1024042742653606, + "ewc_loss": 0.05682520940899849, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000248427881160751, + "grad_norm": 6.613967418670654, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.872346043586731, + "num_tokens": 330531155.0, + "step": 8666 + }, + { + "epoch": 1.1025314845439511, + "ewc_loss": 0.05674333870410919, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024760919041000307, + "grad_norm": 6.536276817321777, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8699710369110107, + "num_tokens": 330568611.0, + "step": 8667 + }, + { + "epoch": 1.1026586948225416, + "ewc_loss": 0.056649189442396164, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002491090854164213, + "grad_norm": 6.654422760009766, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8564932346343994, + "num_tokens": 330608085.0, + "step": 8668 + }, + { + "epoch": 1.1027859051011322, + "ewc_loss": 0.056698739528656006, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002471631742082536, + "grad_norm": 6.627443313598633, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8431699275970459, + "num_tokens": 330644776.0, + "step": 8669 + }, + { + "epoch": 1.1029131153797227, + "ewc_loss": 0.05683736130595207, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024854938965290785, + "grad_norm": 6.564403533935547, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8504987359046936, + "num_tokens": 330686401.0, + "step": 8670 + }, + { + "epoch": 1.1030403256583132, + "ewc_loss": 0.05658639222383499, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002484810829628259, + "grad_norm": 6.66369104385376, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.866195797920227, + "num_tokens": 330723115.0, + "step": 8671 + }, + { + "epoch": 1.1031675359369038, + "ewc_loss": 0.05675162002444267, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002476919908076525, + "grad_norm": 6.589518070220947, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8765785694122314, + "num_tokens": 330761170.0, + "step": 8672 + }, + { + "epoch": 1.1032947462154943, + "ewc_loss": 0.056677211076021194, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002493892970960587, + "grad_norm": 6.663695812225342, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8613675832748413, + "num_tokens": 330790230.0, + "step": 8673 + }, + { + "epoch": 1.1034219564940848, + "ewc_loss": 0.05679662525653839, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002481420524418354, + "grad_norm": 6.61143159866333, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8722522854804993, + "num_tokens": 330826178.0, + "step": 8674 + }, + { + "epoch": 1.1035491667726753, + "ewc_loss": 0.05689159780740738, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024909176863729954, + "grad_norm": 6.682332992553711, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8685261011123657, + "num_tokens": 330866341.0, + "step": 8675 + }, + { + "epoch": 1.1036763770512656, + "ewc_loss": 0.05676308274269104, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002478066016919911, + "grad_norm": 6.582719802856445, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8475203514099121, + "num_tokens": 330903955.0, + "step": 8676 + }, + { + "epoch": 1.1038035873298562, + "ewc_loss": 0.05685119703412056, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024868774926289916, + "grad_norm": 6.64261531829834, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8578029870986938, + "num_tokens": 330943024.0, + "step": 8677 + }, + { + "epoch": 1.1039307976084467, + "ewc_loss": 0.056783076375722885, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024800654500722885, + "grad_norm": 6.589054584503174, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8550398349761963, + "num_tokens": 330981004.0, + "step": 8678 + }, + { + "epoch": 1.1040580078870372, + "ewc_loss": 0.05684126168489456, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024858841788955033, + "grad_norm": 6.564751625061035, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8602402806282043, + "num_tokens": 331021532.0, + "step": 8679 + }, + { + "epoch": 1.1041852181656278, + "ewc_loss": 0.056871458888053894, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024889037013053894, + "grad_norm": 6.662327766418457, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8679955005645752, + "num_tokens": 331058010.0, + "step": 8680 + }, + { + "epoch": 1.1043124284442183, + "ewc_loss": 0.05677240714430809, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024789985036477447, + "grad_norm": 6.572213172912598, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8707141876220703, + "num_tokens": 331090713.0, + "step": 8681 + }, + { + "epoch": 1.1044396387228088, + "ewc_loss": 0.05685196444392204, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024869543267413974, + "grad_norm": 6.622883319854736, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8669615983963013, + "num_tokens": 331127532.0, + "step": 8682 + }, + { + "epoch": 1.1045668490013993, + "ewc_loss": 0.05683233588933945, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002484991564415395, + "grad_norm": 6.620598316192627, + "learning_rate": 1e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8398818969726562, + "num_tokens": 331172088.0, + "step": 8683 + }, + { + "epoch": 1.1046940592799899, + "ewc_loss": 0.0568859800696373, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024903559824451804, + "grad_norm": 6.599812030792236, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8556469678878784, + "num_tokens": 331213419.0, + "step": 8684 + }, + { + "epoch": 1.1048212695585804, + "ewc_loss": 0.05694489926099777, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002496247470844537, + "grad_norm": 6.667332172393799, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8606907725334167, + "num_tokens": 331250102.0, + "step": 8685 + }, + { + "epoch": 1.104948479837171, + "ewc_loss": 0.056868985295295715, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002488656609784812, + "grad_norm": 6.585356712341309, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8634099960327148, + "num_tokens": 331284597.0, + "step": 8686 + }, + { + "epoch": 1.1050756901157615, + "ewc_loss": 0.05697828158736229, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002499585971236229, + "grad_norm": 6.64211368560791, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8546655178070068, + "num_tokens": 331330688.0, + "step": 8687 + }, + { + "epoch": 1.1052029003943518, + "ewc_loss": 0.056904807686805725, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002492238418199122, + "grad_norm": 6.643113613128662, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8640516400337219, + "num_tokens": 331368777.0, + "step": 8688 + }, + { + "epoch": 1.1053301106729423, + "ewc_loss": 0.05687426030635834, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002489183680154383, + "grad_norm": 6.5233564376831055, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8658754825592041, + "num_tokens": 331414369.0, + "step": 8689 + }, + { + "epoch": 1.1054573209515328, + "ewc_loss": 0.057084955275058746, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002510253107175231, + "grad_norm": 6.644571781158447, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8679579496383667, + "num_tokens": 331452788.0, + "step": 8690 + }, + { + "epoch": 1.1055845312301233, + "ewc_loss": 0.056976668536663055, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002499424444977194, + "grad_norm": 6.648390769958496, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8496255278587341, + "num_tokens": 331489575.0, + "step": 8691 + }, + { + "epoch": 1.1057117415087139, + "ewc_loss": 0.056982532143592834, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002500010887160897, + "grad_norm": 6.5868425369262695, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8789154887199402, + "num_tokens": 331529129.0, + "step": 8692 + }, + { + "epoch": 1.1058389517873044, + "ewc_loss": 0.057008370757102966, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002502594725228846, + "grad_norm": 6.660591125488281, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8566266298294067, + "num_tokens": 331564989.0, + "step": 8693 + }, + { + "epoch": 1.105966162065895, + "ewc_loss": 0.056876108050346375, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024893684894777834, + "grad_norm": 6.596304416656494, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8639488220214844, + "num_tokens": 331602561.0, + "step": 8694 + }, + { + "epoch": 1.1060933723444855, + "ewc_loss": 0.057051192969083786, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000250687706284225, + "grad_norm": 6.658705234527588, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.862549901008606, + "num_tokens": 331646902.0, + "step": 8695 + }, + { + "epoch": 1.106220582623076, + "ewc_loss": 0.05692986398935318, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024947442580014467, + "grad_norm": 6.564436435699463, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8625866770744324, + "num_tokens": 331682347.0, + "step": 8696 + }, + { + "epoch": 1.1063477929016665, + "ewc_loss": 0.057114772498607635, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002513235085643828, + "grad_norm": 6.590018272399902, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8531041145324707, + "num_tokens": 331725813.0, + "step": 8697 + }, + { + "epoch": 1.106475003180257, + "ewc_loss": 0.05699658393859863, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002501416311133653, + "grad_norm": 6.615784645080566, + "learning_rate": 1e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8409570455551147, + "num_tokens": 331767856.0, + "step": 8698 + }, + { + "epoch": 1.1066022134588476, + "ewc_loss": 0.05708077549934387, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025098351761698723, + "grad_norm": 6.613957405090332, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8703013062477112, + "num_tokens": 331804238.0, + "step": 8699 + }, + { + "epoch": 1.106729423737438, + "ewc_loss": 0.05710074305534363, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025118322810158134, + "grad_norm": 6.633876323699951, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8613342642784119, + "num_tokens": 331839963.0, + "step": 8700 + }, + { + "epoch": 1.1068566340160284, + "ewc_loss": 0.05696144700050354, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002497902314644307, + "grad_norm": 6.587777614593506, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8651936054229736, + "num_tokens": 331881563.0, + "step": 8701 + }, + { + "epoch": 1.106983844294619, + "ewc_loss": 0.05711301416158676, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025130590074695647, + "grad_norm": 6.571773529052734, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8566740155220032, + "num_tokens": 331925540.0, + "step": 8702 + }, + { + "epoch": 1.1071110545732095, + "ewc_loss": 0.05706613138318062, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002508370962459594, + "grad_norm": 6.667093276977539, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8691767454147339, + "num_tokens": 331963171.0, + "step": 8703 + }, + { + "epoch": 1.1072382648518, + "ewc_loss": 0.05703608691692352, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025053665740415454, + "grad_norm": 6.579618453979492, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8812824487686157, + "num_tokens": 331998310.0, + "step": 8704 + }, + { + "epoch": 1.1073654751303905, + "ewc_loss": 0.05721324309706688, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002523082075640559, + "grad_norm": 6.6760640144348145, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8669768571853638, + "num_tokens": 332034181.0, + "step": 8705 + }, + { + "epoch": 1.107492685408981, + "ewc_loss": 0.05699785053730011, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002501542621757835, + "grad_norm": 6.60019063949585, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8709125518798828, + "num_tokens": 332075368.0, + "step": 8706 + }, + { + "epoch": 1.1076198956875716, + "ewc_loss": 0.05716835707426071, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002518593391869217, + "grad_norm": 6.6218180656433105, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8730826377868652, + "num_tokens": 332113281.0, + "step": 8707 + }, + { + "epoch": 1.107747105966162, + "ewc_loss": 0.05705135315656662, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002506893069949001, + "grad_norm": 6.607300758361816, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8538050651550293, + "num_tokens": 332154206.0, + "step": 8708 + }, + { + "epoch": 1.1078743162447526, + "ewc_loss": 0.05711915343999863, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025136733893305063, + "grad_norm": 6.646318435668945, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8472654819488525, + "num_tokens": 332195409.0, + "step": 8709 + }, + { + "epoch": 1.1080015265233432, + "ewc_loss": 0.05717276409268379, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025190343149006367, + "grad_norm": 6.584481716156006, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.869683027267456, + "num_tokens": 332233836.0, + "step": 8710 + }, + { + "epoch": 1.1081287368019337, + "ewc_loss": 0.05715462565422058, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002517220564186573, + "grad_norm": 6.683889865875244, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8666927218437195, + "num_tokens": 332274167.0, + "step": 8711 + }, + { + "epoch": 1.108255947080524, + "ewc_loss": 0.05700629949569702, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002502387505955994, + "grad_norm": 6.5725531578063965, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8579238057136536, + "num_tokens": 332315856.0, + "step": 8712 + }, + { + "epoch": 1.1083831573591145, + "ewc_loss": 0.05720216780900955, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025219746748916805, + "grad_norm": 6.681993007659912, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8612745404243469, + "num_tokens": 332366365.0, + "step": 8713 + }, + { + "epoch": 1.108510367637705, + "ewc_loss": 0.057025086134672165, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002504266449250281, + "grad_norm": 6.780299663543701, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8585940599441528, + "num_tokens": 332400411.0, + "step": 8714 + }, + { + "epoch": 1.1086375779162956, + "ewc_loss": 0.0569697767496109, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002498735557310283, + "grad_norm": 6.540687561035156, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8644627332687378, + "num_tokens": 332441959.0, + "step": 8715 + }, + { + "epoch": 1.108764788194886, + "ewc_loss": 0.05718803405761719, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025205613928847015, + "grad_norm": 6.941884994506836, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8486822247505188, + "num_tokens": 332479545.0, + "step": 8716 + }, + { + "epoch": 1.1088919984734766, + "ewc_loss": 0.05683480203151703, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002485238073859364, + "grad_norm": 6.5990800857543945, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8717769980430603, + "num_tokens": 332516240.0, + "step": 8717 + }, + { + "epoch": 1.1090192087520672, + "ewc_loss": 0.05722133070230484, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002523890871088952, + "grad_norm": 6.807941913604736, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8665251731872559, + "num_tokens": 332553488.0, + "step": 8718 + }, + { + "epoch": 1.1091464190306577, + "ewc_loss": 0.056489139795303345, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00024750857846811414, + "grad_norm": 6.5790114402771, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8516125679016113, + "num_tokens": 332588646.0, + "step": 8719 + }, + { + "epoch": 1.1092736293092482, + "ewc_loss": 0.05705268308520317, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.0002531440113671124, + "grad_norm": 6.9075727462768555, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8511231541633606, + "num_tokens": 332629103.0, + "step": 8720 + }, + { + "epoch": 1.1094008395878387, + "ewc_loss": 0.05679503083229065, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00024812607443891466, + "grad_norm": 6.564404010772705, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8666167259216309, + "num_tokens": 332664820.0, + "step": 8721 + }, + { + "epoch": 1.1095280498664293, + "ewc_loss": 0.0569702573120594, + "ewc_loss_diag": 3.170967102050781e-05, + "ewc_loss_parallel": 0.00025231976178474724, + "grad_norm": 6.867753028869629, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8634548187255859, + "num_tokens": 332699772.0, + "step": 8722 + }, + { + "epoch": 1.1096552601450198, + "ewc_loss": 0.056895628571510315, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002491320774424821, + "grad_norm": 6.566636085510254, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8699012994766235, + "num_tokens": 332733478.0, + "step": 8723 + }, + { + "epoch": 1.1097824704236103, + "ewc_loss": 0.05722576379776001, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002524334122426808, + "grad_norm": 6.712307453155518, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8577022552490234, + "num_tokens": 332774733.0, + "step": 8724 + }, + { + "epoch": 1.1099096807022006, + "ewc_loss": 0.05699392408132553, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002501150011084974, + "grad_norm": 6.672059535980225, + "learning_rate": 1e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8373383283615112, + "num_tokens": 332812970.0, + "step": 8725 + }, + { + "epoch": 1.1100368909807912, + "ewc_loss": 0.05711738020181656, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025134955649264157, + "grad_norm": 6.661526203155518, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8579792380332947, + "num_tokens": 332854974.0, + "step": 8726 + }, + { + "epoch": 1.1101641012593817, + "ewc_loss": 0.05714454501867294, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002516212116461247, + "grad_norm": 6.732952117919922, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8474711179733276, + "num_tokens": 332886462.0, + "step": 8727 + }, + { + "epoch": 1.1102913115379722, + "ewc_loss": 0.05703039467334747, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025047973031178117, + "grad_norm": 6.6221699714660645, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8571819067001343, + "num_tokens": 332928614.0, + "step": 8728 + }, + { + "epoch": 1.1104185218165628, + "ewc_loss": 0.05716216564178467, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002517974644433707, + "grad_norm": 6.77716588973999, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8627382516860962, + "num_tokens": 332962447.0, + "step": 8729 + }, + { + "epoch": 1.1105457320951533, + "ewc_loss": 0.0569545216858387, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002497209934517741, + "grad_norm": 6.675859451293945, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8522476553916931, + "num_tokens": 332991395.0, + "step": 8730 + }, + { + "epoch": 1.1106729423737438, + "ewc_loss": 0.057067520916461945, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002508510078769177, + "grad_norm": 6.642663478851318, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8546028137207031, + "num_tokens": 333027852.0, + "step": 8731 + }, + { + "epoch": 1.1108001526523343, + "ewc_loss": 0.05701432377099991, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002503190189599991, + "grad_norm": 6.665792465209961, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8635560274124146, + "num_tokens": 333063337.0, + "step": 8732 + }, + { + "epoch": 1.1109273629309249, + "ewc_loss": 0.05710133910179138, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002511891652829945, + "grad_norm": 6.69549560546875, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8508046865463257, + "num_tokens": 333094632.0, + "step": 8733 + }, + { + "epoch": 1.1110545732095154, + "ewc_loss": 0.056978773325681686, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002499635156709701, + "grad_norm": 6.529013156890869, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8743541240692139, + "num_tokens": 333134071.0, + "step": 8734 + }, + { + "epoch": 1.111181783488106, + "ewc_loss": 0.05721534043550491, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025232916232198477, + "grad_norm": 6.646678447723389, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8663540482521057, + "num_tokens": 333171288.0, + "step": 8735 + }, + { + "epoch": 1.1113089937666965, + "ewc_loss": 0.05700252205133438, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025020100292749703, + "grad_norm": 6.577942848205566, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8673091530799866, + "num_tokens": 333212111.0, + "step": 8736 + }, + { + "epoch": 1.1114362040452868, + "ewc_loss": 0.057149603962898254, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002516717941034585, + "grad_norm": 6.668548583984375, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8658376932144165, + "num_tokens": 333247557.0, + "step": 8737 + }, + { + "epoch": 1.1115634143238773, + "ewc_loss": 0.05707830563187599, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002509588375687599, + "grad_norm": 6.640021324157715, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.861564576625824, + "num_tokens": 333278705.0, + "step": 8738 + }, + { + "epoch": 1.1116906246024678, + "ewc_loss": 0.0571988970041275, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002521647547837347, + "grad_norm": 6.696812629699707, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.863411545753479, + "num_tokens": 333316493.0, + "step": 8739 + }, + { + "epoch": 1.1118178348810583, + "ewc_loss": 0.0573882982134819, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025161736994050443, + "grad_norm": 6.527559757232666, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8635820746421814, + "num_tokens": 333360384.0, + "step": 8740 + }, + { + "epoch": 1.1119450451596489, + "ewc_loss": 0.057265348732471466, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002528292534407228, + "grad_norm": 6.735300064086914, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8616922497749329, + "num_tokens": 333395568.0, + "step": 8741 + }, + { + "epoch": 1.1120722554382394, + "ewc_loss": 0.057061634957790375, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025079213082790375, + "grad_norm": 6.609898567199707, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.866104006767273, + "num_tokens": 333429267.0, + "step": 8742 + }, + { + "epoch": 1.11219946571683, + "ewc_loss": 0.05767747014760971, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025206769350916147, + "grad_norm": 6.5974555015563965, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.867792010307312, + "num_tokens": 333469908.0, + "step": 8743 + }, + { + "epoch": 1.1123266759954205, + "ewc_loss": 0.05718398839235306, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025201565586030483, + "grad_norm": 6.641295433044434, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8647462129592896, + "num_tokens": 333507865.0, + "step": 8744 + }, + { + "epoch": 1.112453886274011, + "ewc_loss": 0.057195812463760376, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025213390472345054, + "grad_norm": 6.680422306060791, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8727951049804688, + "num_tokens": 333544011.0, + "step": 8745 + }, + { + "epoch": 1.1125810965526015, + "ewc_loss": 0.05760568752884865, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025134984753094614, + "grad_norm": 6.59577751159668, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8644272685050964, + "num_tokens": 333581949.0, + "step": 8746 + }, + { + "epoch": 1.112708306831192, + "ewc_loss": 0.05769000202417374, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025219301460310817, + "grad_norm": 6.7059407234191895, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8699246644973755, + "num_tokens": 333615994.0, + "step": 8747 + }, + { + "epoch": 1.1128355171097826, + "ewc_loss": 0.05704290047287941, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025060478947125375, + "grad_norm": 6.593525409698486, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8768880367279053, + "num_tokens": 333654454.0, + "step": 8748 + }, + { + "epoch": 1.112962727388373, + "ewc_loss": 0.05720190331339836, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002521948190405965, + "grad_norm": 6.61569356918335, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.853421688079834, + "num_tokens": 333700206.0, + "step": 8749 + }, + { + "epoch": 1.1130899376669634, + "ewc_loss": 0.05764473229646683, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002517402754165232, + "grad_norm": 6.635336399078369, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8654271364212036, + "num_tokens": 333737538.0, + "step": 8750 + }, + { + "epoch": 1.113217147945554, + "ewc_loss": 0.05707612261176109, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025093700969591737, + "grad_norm": 6.643143177032471, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8629818558692932, + "num_tokens": 333771237.0, + "step": 8751 + }, + { + "epoch": 1.1133443582241445, + "ewc_loss": 0.05717334896326065, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025190928135998547, + "grad_norm": 6.644269943237305, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8593674898147583, + "num_tokens": 333806522.0, + "step": 8752 + }, + { + "epoch": 1.113471568502735, + "ewc_loss": 0.0576498880982399, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025179184740409255, + "grad_norm": 6.574567794799805, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8610802888870239, + "num_tokens": 333846656.0, + "step": 8753 + }, + { + "epoch": 1.1135987787813255, + "ewc_loss": 0.05723007023334503, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025247648591175675, + "grad_norm": 6.651132583618164, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8533064126968384, + "num_tokens": 333886294.0, + "step": 8754 + }, + { + "epoch": 1.113725989059916, + "ewc_loss": 0.05719752609729767, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002521510177757591, + "grad_norm": 6.585600852966309, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8575516939163208, + "num_tokens": 333923868.0, + "step": 8755 + }, + { + "epoch": 1.1138531993385066, + "ewc_loss": 0.05733155459165573, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002534913073759526, + "grad_norm": 6.655013084411621, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.867060124874115, + "num_tokens": 333957187.0, + "step": 8756 + }, + { + "epoch": 1.113980409617097, + "ewc_loss": 0.057204749435186386, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025222328258678317, + "grad_norm": 6.58333158493042, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8763943314552307, + "num_tokens": 333999061.0, + "step": 8757 + }, + { + "epoch": 1.1141076198956876, + "ewc_loss": 0.05736161023378372, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002537918626330793, + "grad_norm": 6.636152744293213, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8564619421958923, + "num_tokens": 334043595.0, + "step": 8758 + }, + { + "epoch": 1.1142348301742782, + "ewc_loss": 0.057272426784038544, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002529000339563936, + "grad_norm": 6.631838321685791, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8669258952140808, + "num_tokens": 334078487.0, + "step": 8759 + }, + { + "epoch": 1.1143620404528687, + "ewc_loss": 0.05717364698648453, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025191224995069206, + "grad_norm": 6.553295612335205, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.869473397731781, + "num_tokens": 334119610.0, + "step": 8760 + }, + { + "epoch": 1.114489250731459, + "ewc_loss": 0.057381413877010345, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025398991419933736, + "grad_norm": 6.672596454620361, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8570426106452942, + "num_tokens": 334160149.0, + "step": 8761 + }, + { + "epoch": 1.1146164610100495, + "ewc_loss": 0.05719674006104469, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025214318884536624, + "grad_norm": 6.591341018676758, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8628169298171997, + "num_tokens": 334198667.0, + "step": 8762 + }, + { + "epoch": 1.11474367128864, + "ewc_loss": 0.05733669549226761, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002535427629482001, + "grad_norm": 6.704701900482178, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8630752563476562, + "num_tokens": 334233838.0, + "step": 8763 + }, + { + "epoch": 1.1148708815672306, + "ewc_loss": 0.05722532421350479, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002524290466681123, + "grad_norm": 6.5710768699646, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.853888988494873, + "num_tokens": 334275703.0, + "step": 8764 + }, + { + "epoch": 1.114998091845821, + "ewc_loss": 0.057398974895477295, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025416555581614375, + "grad_norm": 6.75670051574707, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8738611340522766, + "num_tokens": 334311119.0, + "step": 8765 + }, + { + "epoch": 1.1151253021244116, + "ewc_loss": 0.05709047615528107, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002510805206838995, + "grad_norm": 6.612010478973389, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8551808595657349, + "num_tokens": 334348298.0, + "step": 8766 + }, + { + "epoch": 1.1152525124030022, + "ewc_loss": 0.05734451860189438, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002536209940444678, + "grad_norm": 6.652102470397949, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8486831188201904, + "num_tokens": 334386986.0, + "step": 8767 + }, + { + "epoch": 1.1153797226815927, + "ewc_loss": 0.05716405808925629, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002518163528293371, + "grad_norm": 6.557361602783203, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8655800223350525, + "num_tokens": 334423263.0, + "step": 8768 + }, + { + "epoch": 1.1155069329601832, + "ewc_loss": 0.057283058762550354, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025300635024905205, + "grad_norm": 6.634913921356201, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8785340785980225, + "num_tokens": 334460363.0, + "step": 8769 + }, + { + "epoch": 1.1156341432387737, + "ewc_loss": 0.057224832475185394, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002524241281207651, + "grad_norm": 6.627634525299072, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8656278252601624, + "num_tokens": 334501883.0, + "step": 8770 + }, + { + "epoch": 1.1157613535173643, + "ewc_loss": 0.05725627392530441, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025273850769735873, + "grad_norm": 6.703090190887451, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8608946800231934, + "num_tokens": 334536242.0, + "step": 8771 + }, + { + "epoch": 1.1158885637959548, + "ewc_loss": 0.05721138045191765, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002522895811125636, + "grad_norm": 6.622476100921631, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.874397873878479, + "num_tokens": 334569685.0, + "step": 8772 + }, + { + "epoch": 1.1160157740745453, + "ewc_loss": 0.05753201246261597, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002530544879846275, + "grad_norm": 6.665120601654053, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8658992052078247, + "num_tokens": 334604946.0, + "step": 8773 + }, + { + "epoch": 1.1161429843531356, + "ewc_loss": 0.057215169072151184, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002523274742998183, + "grad_norm": 6.571892261505127, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8678857684135437, + "num_tokens": 334645495.0, + "step": 8774 + }, + { + "epoch": 1.1162701946317262, + "ewc_loss": 0.057270586490631104, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002528816694393754, + "grad_norm": 6.619382381439209, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8559672832489014, + "num_tokens": 334684239.0, + "step": 8775 + }, + { + "epoch": 1.1163974049103167, + "ewc_loss": 0.057277873158454895, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002529545163270086, + "grad_norm": 6.714564323425293, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8675684332847595, + "num_tokens": 334713799.0, + "step": 8776 + }, + { + "epoch": 1.1165246151889072, + "ewc_loss": 0.05739609897136688, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000251695339102298, + "grad_norm": 6.584692478179932, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8830970525741577, + "num_tokens": 334753374.0, + "step": 8777 + }, + { + "epoch": 1.1166518254674977, + "ewc_loss": 0.057327546179294586, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025345126050524414, + "grad_norm": 6.6478400230407715, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8623766899108887, + "num_tokens": 334792867.0, + "step": 8778 + }, + { + "epoch": 1.1167790357460883, + "ewc_loss": 0.05717545375227928, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002519303234294057, + "grad_norm": 6.580906391143799, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8585169315338135, + "num_tokens": 334832664.0, + "step": 8779 + }, + { + "epoch": 1.1169062460246788, + "ewc_loss": 0.05751587077975273, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025289307814091444, + "grad_norm": 6.63795804977417, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8556140661239624, + "num_tokens": 334876763.0, + "step": 8780 + }, + { + "epoch": 1.1170334563032693, + "ewc_loss": 0.05751097574830055, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002528441254980862, + "grad_norm": 6.660263538360596, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8627361059188843, + "num_tokens": 334909431.0, + "step": 8781 + }, + { + "epoch": 1.1171606665818599, + "ewc_loss": 0.05729510262608528, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025312681100331247, + "grad_norm": 6.6841254234313965, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8606232404708862, + "num_tokens": 334941939.0, + "step": 8782 + }, + { + "epoch": 1.1172878768604504, + "ewc_loss": 0.05739806592464447, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025171504239551723, + "grad_norm": 6.602559566497803, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8719761371612549, + "num_tokens": 334981338.0, + "step": 8783 + }, + { + "epoch": 1.117415087139041, + "ewc_loss": 0.057607345283031464, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025380784063600004, + "grad_norm": 6.644556045532227, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8625880479812622, + "num_tokens": 335020660.0, + "step": 8784 + }, + { + "epoch": 1.1175422974176314, + "ewc_loss": 0.05746297910809517, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002523641742300242, + "grad_norm": 6.6419453620910645, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8648678064346313, + "num_tokens": 335057377.0, + "step": 8785 + }, + { + "epoch": 1.1176695076962218, + "ewc_loss": 0.057537805289030075, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025311243371106684, + "grad_norm": 6.674406051635742, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8529344797134399, + "num_tokens": 335092912.0, + "step": 8786 + }, + { + "epoch": 1.1177967179748123, + "ewc_loss": 0.057428061962127686, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002520150155760348, + "grad_norm": 6.609977722167969, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8771743178367615, + "num_tokens": 335129157.0, + "step": 8787 + }, + { + "epoch": 1.1179239282534028, + "ewc_loss": 0.05758184194564819, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002535527746658772, + "grad_norm": 6.689284801483154, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8620102405548096, + "num_tokens": 335168065.0, + "step": 8788 + }, + { + "epoch": 1.1180511385319933, + "ewc_loss": 0.05741610378026962, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025189542793668807, + "grad_norm": 6.529382228851318, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.861093282699585, + "num_tokens": 335218876.0, + "step": 8789 + }, + { + "epoch": 1.1181783488105839, + "ewc_loss": 0.057605668902397156, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025379107682965696, + "grad_norm": 6.624180793762207, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8723186254501343, + "num_tokens": 335256300.0, + "step": 8790 + }, + { + "epoch": 1.1183055590891744, + "ewc_loss": 0.0575103722512722, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025283810100518167, + "grad_norm": 6.636466979980469, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8729234337806702, + "num_tokens": 335291035.0, + "step": 8791 + }, + { + "epoch": 1.118432769367765, + "ewc_loss": 0.05752972885966301, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002530316705815494, + "grad_norm": 6.598526477813721, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8745248317718506, + "num_tokens": 335330092.0, + "step": 8792 + }, + { + "epoch": 1.1185599796463555, + "ewc_loss": 0.05757146328687668, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025344901951029897, + "grad_norm": 6.668705940246582, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8481563329696655, + "num_tokens": 335367146.0, + "step": 8793 + }, + { + "epoch": 1.118687189924946, + "ewc_loss": 0.05751337856054306, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002528681652620435, + "grad_norm": 6.588308811187744, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8622051477432251, + "num_tokens": 335405069.0, + "step": 8794 + }, + { + "epoch": 1.1188144002035365, + "ewc_loss": 0.05766620114445686, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002543963782954961, + "grad_norm": 6.668045520782471, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.852721095085144, + "num_tokens": 335443779.0, + "step": 8795 + }, + { + "epoch": 1.118941610482127, + "ewc_loss": 0.05750630050897598, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002527973847463727, + "grad_norm": 6.613260269165039, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8543086051940918, + "num_tokens": 335485978.0, + "step": 8796 + }, + { + "epoch": 1.1190688207607176, + "ewc_loss": 0.0576048381626606, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025378275313414633, + "grad_norm": 6.636894226074219, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8668797016143799, + "num_tokens": 335530375.0, + "step": 8797 + }, + { + "epoch": 1.119196031039308, + "ewc_loss": 0.057504430413246155, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000252778670983389, + "grad_norm": 6.6044135093688965, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8659297227859497, + "num_tokens": 335569109.0, + "step": 8798 + }, + { + "epoch": 1.1193232413178984, + "ewc_loss": 0.05747738108038902, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025250817998312414, + "grad_norm": 6.625099182128906, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8475430011749268, + "num_tokens": 335599283.0, + "step": 8799 + }, + { + "epoch": 1.119450451596489, + "ewc_loss": 0.05731838941574097, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002533596707507968, + "grad_norm": 6.668440341949463, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8691772222518921, + "num_tokens": 335634792.0, + "step": 8800 + }, + { + "epoch": 1.1195776618750795, + "ewc_loss": 0.05720151960849762, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002521909773349762, + "grad_norm": 6.585788726806641, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8747624754905701, + "num_tokens": 335672869.0, + "step": 8801 + }, + { + "epoch": 1.11970487215367, + "ewc_loss": 0.057572733610868454, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002534617087803781, + "grad_norm": 6.630881309509277, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8679711222648621, + "num_tokens": 335714996.0, + "step": 8802 + }, + { + "epoch": 1.1198320824322605, + "ewc_loss": 0.05726930499076843, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000252868834650144, + "grad_norm": 6.5897417068481445, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8534107208251953, + "num_tokens": 335756884.0, + "step": 8803 + }, + { + "epoch": 1.119959292710851, + "ewc_loss": 0.05757755786180496, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002535099338274449, + "grad_norm": 6.60601282119751, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8719221353530884, + "num_tokens": 335793935.0, + "step": 8804 + }, + { + "epoch": 1.1200865029894416, + "ewc_loss": 0.057544320821762085, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025317759718745947, + "grad_norm": 6.59574556350708, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8752212524414062, + "num_tokens": 335834169.0, + "step": 8805 + }, + { + "epoch": 1.120213713268032, + "ewc_loss": 0.05728710815310478, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002530468627810478, + "grad_norm": 6.610915660858154, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8804389834403992, + "num_tokens": 335872302.0, + "step": 8806 + }, + { + "epoch": 1.1203409235466226, + "ewc_loss": 0.057648371905088425, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025421808823011816, + "grad_norm": 6.676647663116455, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8630268573760986, + "num_tokens": 335908074.0, + "step": 8807 + }, + { + "epoch": 1.1204681338252132, + "ewc_loss": 0.057539984583854675, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025313423248007894, + "grad_norm": 6.616613388061523, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8737109303474426, + "num_tokens": 335946268.0, + "step": 8808 + }, + { + "epoch": 1.1205953441038037, + "ewc_loss": 0.05764045566320419, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025413892581127584, + "grad_norm": 6.655319690704346, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.863815188407898, + "num_tokens": 335985364.0, + "step": 8809 + }, + { + "epoch": 1.120722554382394, + "ewc_loss": 0.057511575520038605, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025285014999099076, + "grad_norm": 6.60234260559082, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8654028177261353, + "num_tokens": 336025300.0, + "step": 8810 + }, + { + "epoch": 1.1208497646609845, + "ewc_loss": 0.05758554860949516, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002535898529458791, + "grad_norm": 6.63216495513916, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8654346466064453, + "num_tokens": 336062105.0, + "step": 8811 + }, + { + "epoch": 1.120976974939575, + "ewc_loss": 0.057597704231739044, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002537114196456969, + "grad_norm": 6.635448455810547, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8525593280792236, + "num_tokens": 336102944.0, + "step": 8812 + }, + { + "epoch": 1.1211041852181656, + "ewc_loss": 0.05723778158426285, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002525536110624671, + "grad_norm": 6.557642936706543, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.889988899230957, + "num_tokens": 336142964.0, + "step": 8813 + }, + { + "epoch": 1.121231395496756, + "ewc_loss": 0.05745835602283478, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002547593612689525, + "grad_norm": 6.639009952545166, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8652526140213013, + "num_tokens": 336186962.0, + "step": 8814 + }, + { + "epoch": 1.1213586057753466, + "ewc_loss": 0.05724581331014633, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002526339085306972, + "grad_norm": 6.623340129852295, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8716663718223572, + "num_tokens": 336220775.0, + "step": 8815 + }, + { + "epoch": 1.1214858160539372, + "ewc_loss": 0.05737992376089096, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025397504214197397, + "grad_norm": 6.636348247528076, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8602942228317261, + "num_tokens": 336263067.0, + "step": 8816 + }, + { + "epoch": 1.1216130263325277, + "ewc_loss": 0.05729245766997337, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002531003556214273, + "grad_norm": 6.588432788848877, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8668810725212097, + "num_tokens": 336308463.0, + "step": 8817 + }, + { + "epoch": 1.1217402366111182, + "ewc_loss": 0.05743061751127243, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025448197266086936, + "grad_norm": 6.682034015655518, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8758336901664734, + "num_tokens": 336346965.0, + "step": 8818 + }, + { + "epoch": 1.1218674468897087, + "ewc_loss": 0.057241711765527725, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002525929012335837, + "grad_norm": 6.603452205657959, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8646811246871948, + "num_tokens": 336382114.0, + "step": 8819 + }, + { + "epoch": 1.1219946571682993, + "ewc_loss": 0.05739615112543106, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025413729599677026, + "grad_norm": 6.6920552253723145, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8675246238708496, + "num_tokens": 336414020.0, + "step": 8820 + }, + { + "epoch": 1.1221218674468898, + "ewc_loss": 0.05732522904872894, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002534280938562006, + "grad_norm": 6.619786739349365, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8606259822845459, + "num_tokens": 336452522.0, + "step": 8821 + }, + { + "epoch": 1.1222490777254803, + "ewc_loss": 0.0574086494743824, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002542622678447515, + "grad_norm": 6.666141033172607, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8739502429962158, + "num_tokens": 336484593.0, + "step": 8822 + }, + { + "epoch": 1.1223762880040706, + "ewc_loss": 0.05736776068806648, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002538533881306648, + "grad_norm": 6.725099563598633, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8441546559333801, + "num_tokens": 336518964.0, + "step": 8823 + }, + { + "epoch": 1.1225034982826612, + "ewc_loss": 0.05729466304183006, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002531224163249135, + "grad_norm": 6.70285177230835, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8497929573059082, + "num_tokens": 336556320.0, + "step": 8824 + }, + { + "epoch": 1.1226307085612517, + "ewc_loss": 0.057226501405239105, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025244077551178634, + "grad_norm": 6.595820903778076, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8528071045875549, + "num_tokens": 336595473.0, + "step": 8825 + }, + { + "epoch": 1.1227579188398422, + "ewc_loss": 0.05736325681209564, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002538083354011178, + "grad_norm": 6.6961164474487305, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8629994988441467, + "num_tokens": 336631793.0, + "step": 8826 + }, + { + "epoch": 1.1228851291184327, + "ewc_loss": 0.05720793455839157, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002522551512811333, + "grad_norm": 6.607127666473389, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8474023342132568, + "num_tokens": 336671230.0, + "step": 8827 + }, + { + "epoch": 1.1230123393970233, + "ewc_loss": 0.05733653903007507, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025354119134135544, + "grad_norm": 6.621787071228027, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8638100028038025, + "num_tokens": 336712015.0, + "step": 8828 + }, + { + "epoch": 1.1231395496756138, + "ewc_loss": 0.057520750910043716, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002529418852645904, + "grad_norm": 6.693180561065674, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8469853401184082, + "num_tokens": 336741206.0, + "step": 8829 + }, + { + "epoch": 1.1232667599542043, + "ewc_loss": 0.05755738914012909, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025330824428237975, + "grad_norm": 6.605282783508301, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8557801842689514, + "num_tokens": 336778838.0, + "step": 8830 + }, + { + "epoch": 1.1233939702327949, + "ewc_loss": 0.057652607560157776, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025426046340726316, + "grad_norm": 6.617969036102295, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8682417869567871, + "num_tokens": 336815471.0, + "step": 8831 + }, + { + "epoch": 1.1235211805113854, + "ewc_loss": 0.05732802301645279, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025345603353343904, + "grad_norm": 6.630688667297363, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8647063374519348, + "num_tokens": 336850870.0, + "step": 8832 + }, + { + "epoch": 1.123648390789976, + "ewc_loss": 0.057899996638298035, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002542929141782224, + "grad_norm": 6.629446029663086, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8673555850982666, + "num_tokens": 336891779.0, + "step": 8833 + }, + { + "epoch": 1.1237756010685664, + "ewc_loss": 0.05758325755596161, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002535669773351401, + "grad_norm": 6.595065593719482, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8730168342590332, + "num_tokens": 336931171.0, + "step": 8834 + }, + { + "epoch": 1.1239028113471567, + "ewc_loss": 0.0579395666718483, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025468860985711217, + "grad_norm": 6.670166015625, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8730090856552124, + "num_tokens": 336963080.0, + "step": 8835 + }, + { + "epoch": 1.1240300216257473, + "ewc_loss": 0.0578400194644928, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002536931715440005, + "grad_norm": 6.6416425704956055, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8480479717254639, + "num_tokens": 337000817.0, + "step": 8836 + }, + { + "epoch": 1.1241572319043378, + "ewc_loss": 0.05765113607048988, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025424573686905205, + "grad_norm": 6.654942512512207, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8559056520462036, + "num_tokens": 337037249.0, + "step": 8837 + }, + { + "epoch": 1.1242844421829283, + "ewc_loss": 0.057978082448244095, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025507379905320704, + "grad_norm": 6.602950096130371, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8710592985153198, + "num_tokens": 337077752.0, + "step": 8838 + }, + { + "epoch": 1.1244116524615189, + "ewc_loss": 0.05790074169635773, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002543003938626498, + "grad_norm": 6.648540496826172, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8590064644813538, + "num_tokens": 337119649.0, + "step": 8839 + }, + { + "epoch": 1.1245388627401094, + "ewc_loss": 0.05800516903400421, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002553446392994374, + "grad_norm": 6.689303874969482, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8469727039337158, + "num_tokens": 337154376.0, + "step": 8840 + }, + { + "epoch": 1.1246660730187, + "ewc_loss": 0.05790429562330246, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025433592963963747, + "grad_norm": 6.662030220031738, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.871259331703186, + "num_tokens": 337193377.0, + "step": 8841 + }, + { + "epoch": 1.1247932832972904, + "ewc_loss": 0.05796965956687927, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002549895434640348, + "grad_norm": 6.681814670562744, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8522733449935913, + "num_tokens": 337233176.0, + "step": 8842 + }, + { + "epoch": 1.124920493575881, + "ewc_loss": 0.05787121504545212, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025400513550266623, + "grad_norm": 6.669895648956299, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8486999273300171, + "num_tokens": 337262995.0, + "step": 8843 + }, + { + "epoch": 1.1250477038544715, + "ewc_loss": 0.05771385878324509, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002548729826230556, + "grad_norm": 6.699592590332031, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8611730337142944, + "num_tokens": 337302466.0, + "step": 8844 + }, + { + "epoch": 1.125174914133062, + "ewc_loss": 0.05765138939023018, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002542482689023018, + "grad_norm": 6.64329719543457, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8744117021560669, + "num_tokens": 337341197.0, + "step": 8845 + }, + { + "epoch": 1.1253021244116526, + "ewc_loss": 0.05775249004364014, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025525924866087735, + "grad_norm": 6.690727233886719, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8661565184593201, + "num_tokens": 337375602.0, + "step": 8846 + }, + { + "epoch": 1.125429334690243, + "ewc_loss": 0.05798046663403511, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002550976350903511, + "grad_norm": 6.76240348815918, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8545698523521423, + "num_tokens": 337406685.0, + "step": 8847 + }, + { + "epoch": 1.1255565449688334, + "ewc_loss": 0.05791322886943817, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025442527839913964, + "grad_norm": 6.639230251312256, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8578641414642334, + "num_tokens": 337447364.0, + "step": 8848 + }, + { + "epoch": 1.125683755247424, + "ewc_loss": 0.05766835808753967, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000254417973337695, + "grad_norm": 6.700578212738037, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8623698949813843, + "num_tokens": 337487840.0, + "step": 8849 + }, + { + "epoch": 1.1258109655260145, + "ewc_loss": 0.057580575346946716, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025354011449962854, + "grad_norm": 6.6707000732421875, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.866924524307251, + "num_tokens": 337525117.0, + "step": 8850 + }, + { + "epoch": 1.125938175804605, + "ewc_loss": 0.05763951689004898, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025412955437786877, + "grad_norm": 6.6695027351379395, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8558418154716492, + "num_tokens": 337567024.0, + "step": 8851 + }, + { + "epoch": 1.1260653860831955, + "ewc_loss": 0.05769263952970505, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025466078659519553, + "grad_norm": 6.724115371704102, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8610235452651978, + "num_tokens": 337600117.0, + "step": 8852 + }, + { + "epoch": 1.126192596361786, + "ewc_loss": 0.057796984910964966, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025326284230686724, + "grad_norm": 6.617125988006592, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8468608856201172, + "num_tokens": 337637188.0, + "step": 8853 + }, + { + "epoch": 1.1263198066403766, + "ewc_loss": 0.05771028250455856, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025483721401542425, + "grad_norm": 6.718134880065918, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8585149049758911, + "num_tokens": 337676492.0, + "step": 8854 + }, + { + "epoch": 1.126447016918967, + "ewc_loss": 0.05759034678339958, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025363784516230226, + "grad_norm": 6.6462273597717285, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8766946792602539, + "num_tokens": 337713468.0, + "step": 8855 + }, + { + "epoch": 1.1265742271975576, + "ewc_loss": 0.05767861753702164, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000254520564340055, + "grad_norm": 6.721246242523193, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8494924306869507, + "num_tokens": 337750944.0, + "step": 8856 + }, + { + "epoch": 1.1267014374761481, + "ewc_loss": 0.05763518065214157, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025408618967048824, + "grad_norm": 6.653532981872559, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8768433332443237, + "num_tokens": 337793802.0, + "step": 8857 + }, + { + "epoch": 1.1268286477547387, + "ewc_loss": 0.05756111815571785, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002533455553930253, + "grad_norm": 6.687186241149902, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8752764463424683, + "num_tokens": 337828550.0, + "step": 8858 + }, + { + "epoch": 1.126955858033329, + "ewc_loss": 0.057546645402908325, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025320082204416394, + "grad_norm": 6.651183605194092, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8713655471801758, + "num_tokens": 337862168.0, + "step": 8859 + }, + { + "epoch": 1.1270830683119195, + "ewc_loss": 0.057626936584711075, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002540037385188043, + "grad_norm": 6.679520130157471, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.858582615852356, + "num_tokens": 337898837.0, + "step": 8860 + }, + { + "epoch": 1.12721027859051, + "ewc_loss": 0.057573508471250534, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002534694503992796, + "grad_norm": 6.681657314300537, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8489774465560913, + "num_tokens": 337939608.0, + "step": 8861 + }, + { + "epoch": 1.1273374888691006, + "ewc_loss": 0.0575779490172863, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025351386284455657, + "grad_norm": 6.657819747924805, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8614515662193298, + "num_tokens": 337983412.0, + "step": 8862 + }, + { + "epoch": 1.127464699147691, + "ewc_loss": 0.057561393827199936, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025334832025691867, + "grad_norm": 6.695738792419434, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8725221157073975, + "num_tokens": 338022171.0, + "step": 8863 + }, + { + "epoch": 1.1275919094262816, + "ewc_loss": 0.05757308006286621, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002534652012400329, + "grad_norm": 6.682974815368652, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8587422370910645, + "num_tokens": 338058838.0, + "step": 8864 + }, + { + "epoch": 1.1277191197048722, + "ewc_loss": 0.05754982307553291, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002532326034270227, + "grad_norm": 6.714224338531494, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8572784662246704, + "num_tokens": 338095080.0, + "step": 8865 + }, + { + "epoch": 1.1278463299834627, + "ewc_loss": 0.05753815174102783, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002531158970668912, + "grad_norm": 6.7432756423950195, + "learning_rate": 1e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8448737263679504, + "num_tokens": 338134206.0, + "step": 8866 + }, + { + "epoch": 1.1279735402620532, + "ewc_loss": 0.05752657353878021, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025300009292550385, + "grad_norm": 6.679467678070068, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8686139583587646, + "num_tokens": 338173794.0, + "step": 8867 + }, + { + "epoch": 1.1281007505406437, + "ewc_loss": 0.05756636708974838, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025339802959933877, + "grad_norm": 6.7260823249816895, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8588205575942993, + "num_tokens": 338206591.0, + "step": 8868 + }, + { + "epoch": 1.1282279608192343, + "ewc_loss": 0.05723445862531662, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002525203744880855, + "grad_norm": 6.680371284484863, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8753692507743835, + "num_tokens": 338238728.0, + "step": 8869 + }, + { + "epoch": 1.1283551710978248, + "ewc_loss": 0.05752667039632797, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002530010533519089, + "grad_norm": 6.661227703094482, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8722622990608215, + "num_tokens": 338282372.0, + "step": 8870 + }, + { + "epoch": 1.1284823813764153, + "ewc_loss": 0.05754069238901138, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025314127560704947, + "grad_norm": 6.6967597007751465, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8570548892021179, + "num_tokens": 338319175.0, + "step": 8871 + }, + { + "epoch": 1.1286095916550056, + "ewc_loss": 0.05754886567592621, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025322302826680243, + "grad_norm": 6.727962017059326, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8593722581863403, + "num_tokens": 338357967.0, + "step": 8872 + }, + { + "epoch": 1.1287368019335962, + "ewc_loss": 0.05751810222864151, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002529153716750443, + "grad_norm": 6.688706398010254, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8692383170127869, + "num_tokens": 338391431.0, + "step": 8873 + }, + { + "epoch": 1.1288640122121867, + "ewc_loss": 0.0575704500079155, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002534388913773, + "grad_norm": 6.706857204437256, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8704572319984436, + "num_tokens": 338426298.0, + "step": 8874 + }, + { + "epoch": 1.1289912224907772, + "ewc_loss": 0.05759250745177269, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025365944020450115, + "grad_norm": 6.700741291046143, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8442486524581909, + "num_tokens": 338468147.0, + "step": 8875 + }, + { + "epoch": 1.1291184327693677, + "ewc_loss": 0.057570114731788635, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002534355444367975, + "grad_norm": 6.704546928405762, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.846116840839386, + "num_tokens": 338517095.0, + "step": 8876 + }, + { + "epoch": 1.1292456430479583, + "ewc_loss": 0.05755864828824997, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002533208462409675, + "grad_norm": 6.683174133300781, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8681763410568237, + "num_tokens": 338549925.0, + "step": 8877 + }, + { + "epoch": 1.1293728533265488, + "ewc_loss": 0.05756358057260513, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002533701772335917, + "grad_norm": 6.784087181091309, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8652388453483582, + "num_tokens": 338583966.0, + "step": 8878 + }, + { + "epoch": 1.1295000636051393, + "ewc_loss": 0.057478707283735275, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002525214513298124, + "grad_norm": 6.671191215515137, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8665485382080078, + "num_tokens": 338622715.0, + "step": 8879 + }, + { + "epoch": 1.1296272738837299, + "ewc_loss": 0.05755581334233284, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025329249911010265, + "grad_norm": 6.731618881225586, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8493586182594299, + "num_tokens": 338659920.0, + "step": 8880 + }, + { + "epoch": 1.1297544841623204, + "ewc_loss": 0.057488344609737396, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025261781411245465, + "grad_norm": 6.671466827392578, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8559366464614868, + "num_tokens": 338698183.0, + "step": 8881 + }, + { + "epoch": 1.129881694440911, + "ewc_loss": 0.05754796415567398, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025321400607936084, + "grad_norm": 6.757785320281982, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8508731126785278, + "num_tokens": 338735853.0, + "step": 8882 + }, + { + "epoch": 1.1300089047195012, + "ewc_loss": 0.057450875639915466, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025224313139915466, + "grad_norm": 6.687108039855957, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.851980447769165, + "num_tokens": 338771714.0, + "step": 8883 + }, + { + "epoch": 1.1301361149980917, + "ewc_loss": 0.05779363214969635, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025322928559035063, + "grad_norm": 6.708069801330566, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8810802698135376, + "num_tokens": 338813823.0, + "step": 8884 + }, + { + "epoch": 1.1302633252766823, + "ewc_loss": 0.057487353682518005, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025260791881009936, + "grad_norm": 6.604846954345703, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8664063215255737, + "num_tokens": 338858137.0, + "step": 8885 + }, + { + "epoch": 1.1303905355552728, + "ewc_loss": 0.05757388472557068, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002534732047934085, + "grad_norm": 6.7697319984436035, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8508422374725342, + "num_tokens": 338893653.0, + "step": 8886 + }, + { + "epoch": 1.1305177458338633, + "ewc_loss": 0.05749284476041794, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025266280863434076, + "grad_norm": 6.7038187980651855, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.843786895275116, + "num_tokens": 338931230.0, + "step": 8887 + }, + { + "epoch": 1.1306449561124539, + "ewc_loss": 0.057539358735084534, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002531279460527003, + "grad_norm": 6.718268394470215, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8623790740966797, + "num_tokens": 338966921.0, + "step": 8888 + }, + { + "epoch": 1.1307721663910444, + "ewc_loss": 0.05750248581171036, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002527592587284744, + "grad_norm": 6.712453842163086, + "learning_rate": 1e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8435556888580322, + "num_tokens": 339000957.0, + "step": 8889 + }, + { + "epoch": 1.130899376669635, + "ewc_loss": 0.05754447728395462, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025317916879430413, + "grad_norm": 6.676713943481445, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8543606996536255, + "num_tokens": 339035876.0, + "step": 8890 + }, + { + "epoch": 1.1310265869482254, + "ewc_loss": 0.05760795623064041, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002538139233365655, + "grad_norm": 6.625734329223633, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.862822413444519, + "num_tokens": 339073700.0, + "step": 8891 + }, + { + "epoch": 1.131153797226816, + "ewc_loss": 0.05783519521355629, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025364491739310324, + "grad_norm": 6.6754350662231445, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.86619633436203, + "num_tokens": 339108222.0, + "step": 8892 + }, + { + "epoch": 1.1312810075054065, + "ewc_loss": 0.057909563183784485, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025438860757276416, + "grad_norm": 6.661352634429932, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.873430609703064, + "num_tokens": 339145749.0, + "step": 8893 + }, + { + "epoch": 1.131408217783997, + "ewc_loss": 0.0576351173222065, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002540855493862182, + "grad_norm": 6.673526287078857, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.847689151763916, + "num_tokens": 339183852.0, + "step": 8894 + }, + { + "epoch": 1.1315354280625876, + "ewc_loss": 0.05799999088048935, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002552928635850549, + "grad_norm": 6.682263374328613, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8762866258621216, + "num_tokens": 339218766.0, + "step": 8895 + }, + { + "epoch": 1.131662638341178, + "ewc_loss": 0.05785783752799034, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025387134519405663, + "grad_norm": 6.640835762023926, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8630772829055786, + "num_tokens": 339259932.0, + "step": 8896 + }, + { + "epoch": 1.1317898486197684, + "ewc_loss": 0.05798186734318733, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002551116340328008, + "grad_norm": 6.640478134155273, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8670085668563843, + "num_tokens": 339302201.0, + "step": 8897 + }, + { + "epoch": 1.131917058898359, + "ewc_loss": 0.0579746812582016, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002550398057792336, + "grad_norm": 6.745983600616455, + "learning_rate": 1e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8334437608718872, + "num_tokens": 339342091.0, + "step": 8898 + }, + { + "epoch": 1.1320442691769494, + "ewc_loss": 0.05787961930036545, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002540891873650253, + "grad_norm": 6.639911651611328, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8572902679443359, + "num_tokens": 339379951.0, + "step": 8899 + }, + { + "epoch": 1.13217147945554, + "ewc_loss": 0.058019183576107025, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025548480334691703, + "grad_norm": 6.680856227874756, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8561278581619263, + "num_tokens": 339422243.0, + "step": 8900 + }, + { + "epoch": 1.1322986897341305, + "ewc_loss": 0.057929664850234985, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002545896277297288, + "grad_norm": 6.580714225769043, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8770320415496826, + "num_tokens": 339464969.0, + "step": 8901 + }, + { + "epoch": 1.132425900012721, + "ewc_loss": 0.05782558023929596, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002559901913627982, + "grad_norm": 6.702358245849609, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8653793334960938, + "num_tokens": 339502382.0, + "step": 8902 + }, + { + "epoch": 1.1325531102913116, + "ewc_loss": 0.05764716863632202, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002542060683481395, + "grad_norm": 6.662415504455566, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8473572731018066, + "num_tokens": 339542875.0, + "step": 8903 + }, + { + "epoch": 1.132680320569902, + "ewc_loss": 0.05774703994393349, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025520476629026234, + "grad_norm": 6.701079368591309, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8481731414794922, + "num_tokens": 339580776.0, + "step": 8904 + }, + { + "epoch": 1.1328075308484926, + "ewc_loss": 0.05776486545801163, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025538302725180984, + "grad_norm": 6.654958248138428, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8755772113800049, + "num_tokens": 339620195.0, + "step": 8905 + }, + { + "epoch": 1.1329347411270831, + "ewc_loss": 0.05770815908908844, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002548159391153604, + "grad_norm": 6.63940954208374, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8699920177459717, + "num_tokens": 339659234.0, + "step": 8906 + }, + { + "epoch": 1.1330619514056737, + "ewc_loss": 0.058071352541446686, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025600651861168444, + "grad_norm": 6.693338394165039, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8525350689888, + "num_tokens": 339703259.0, + "step": 8907 + }, + { + "epoch": 1.133189161684264, + "ewc_loss": 0.05793658643960953, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002546588075347245, + "grad_norm": 6.7249650955200195, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8700196743011475, + "num_tokens": 339745901.0, + "step": 8908 + }, + { + "epoch": 1.1333163719628545, + "ewc_loss": 0.057964276522397995, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002549357304815203, + "grad_norm": 6.622719764709473, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8531204462051392, + "num_tokens": 339790417.0, + "step": 8909 + }, + { + "epoch": 1.133443582241445, + "ewc_loss": 0.057808034121990204, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025581472436897457, + "grad_norm": 6.728997230529785, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8547577857971191, + "num_tokens": 339827224.0, + "step": 8910 + }, + { + "epoch": 1.1335707925200356, + "ewc_loss": 0.05762162804603577, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025395065313205123, + "grad_norm": 6.6377177238464355, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8574394583702087, + "num_tokens": 339859932.0, + "step": 8911 + }, + { + "epoch": 1.133698002798626, + "ewc_loss": 0.057806290686130524, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000255797291174531, + "grad_norm": 6.809699535369873, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8504745960235596, + "num_tokens": 339902416.0, + "step": 8912 + }, + { + "epoch": 1.1338252130772166, + "ewc_loss": 0.05759294331073761, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002536638348829001, + "grad_norm": 6.649112224578857, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8694207668304443, + "num_tokens": 339940495.0, + "step": 8913 + }, + { + "epoch": 1.1339524233558071, + "ewc_loss": 0.05782627314329147, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025599708897061646, + "grad_norm": 6.663135528564453, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8438562154769897, + "num_tokens": 339983010.0, + "step": 8914 + }, + { + "epoch": 1.1340796336343977, + "ewc_loss": 0.0576418936252594, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002541533322073519, + "grad_norm": 6.637986660003662, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8626012802124023, + "num_tokens": 340020683.0, + "step": 8915 + }, + { + "epoch": 1.1342068439129882, + "ewc_loss": 0.05800757557153702, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002553687372710556, + "grad_norm": 6.703996658325195, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8535563349723816, + "num_tokens": 340063179.0, + "step": 8916 + }, + { + "epoch": 1.1343340541915787, + "ewc_loss": 0.057736217975616455, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002550965582486242, + "grad_norm": 6.6519455909729, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8719722032546997, + "num_tokens": 340100166.0, + "step": 8917 + }, + { + "epoch": 1.1344612644701693, + "ewc_loss": 0.05776935815811157, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025542796356603503, + "grad_norm": 6.678966522216797, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8581314086914062, + "num_tokens": 340144325.0, + "step": 8918 + }, + { + "epoch": 1.1345884747487598, + "ewc_loss": 0.05772203579545021, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025495473528280854, + "grad_norm": 6.721124649047852, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8605258464813232, + "num_tokens": 340180563.0, + "step": 8919 + }, + { + "epoch": 1.1347156850273503, + "ewc_loss": 0.057772669941186905, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002554610837250948, + "grad_norm": 6.691721439361572, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8491660356521606, + "num_tokens": 340224621.0, + "step": 8920 + }, + { + "epoch": 1.1348428953059406, + "ewc_loss": 0.05771353468298912, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002548697229940444, + "grad_norm": 6.656781196594238, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8670173287391663, + "num_tokens": 340264810.0, + "step": 8921 + }, + { + "epoch": 1.1349701055845312, + "ewc_loss": 0.057751040905714035, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025524478405714035, + "grad_norm": 6.742010593414307, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8736267685890198, + "num_tokens": 340301158.0, + "step": 8922 + }, + { + "epoch": 1.1350973158631217, + "ewc_loss": 0.05771934241056442, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002549277851358056, + "grad_norm": 6.730448246002197, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8614922165870667, + "num_tokens": 340339806.0, + "step": 8923 + }, + { + "epoch": 1.1352245261417122, + "ewc_loss": 0.05770677328109741, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025480211479589343, + "grad_norm": 6.6989617347717285, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8669183254241943, + "num_tokens": 340386412.0, + "step": 8924 + }, + { + "epoch": 1.1353517364203027, + "ewc_loss": 0.057694800198078156, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025468238163739443, + "grad_norm": 6.647661209106445, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8718923330307007, + "num_tokens": 340431391.0, + "step": 8925 + }, + { + "epoch": 1.1354789466988933, + "ewc_loss": 0.05773376673460007, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002550720237195492, + "grad_norm": 6.696718692779541, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8655372262001038, + "num_tokens": 340465904.0, + "step": 8926 + }, + { + "epoch": 1.1356061569774838, + "ewc_loss": 0.057752206921577454, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002552564546931535, + "grad_norm": 6.705960273742676, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8667728304862976, + "num_tokens": 340504290.0, + "step": 8927 + }, + { + "epoch": 1.1357333672560743, + "ewc_loss": 0.05769357457756996, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025467012892477214, + "grad_norm": 6.7145538330078125, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.856793999671936, + "num_tokens": 340539100.0, + "step": 8928 + }, + { + "epoch": 1.1358605775346649, + "ewc_loss": 0.05772823840379715, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002550167846493423, + "grad_norm": 6.696030616760254, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8767015933990479, + "num_tokens": 340569005.0, + "step": 8929 + }, + { + "epoch": 1.1359877878132554, + "ewc_loss": 0.05767787992954254, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025451320107094944, + "grad_norm": 6.652275562286377, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8612846732139587, + "num_tokens": 340613476.0, + "step": 8930 + }, + { + "epoch": 1.136114998091846, + "ewc_loss": 0.0577431358397007, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025516573805361986, + "grad_norm": 6.710500240325928, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.854137659072876, + "num_tokens": 340647982.0, + "step": 8931 + }, + { + "epoch": 1.1362422083704362, + "ewc_loss": 0.057656727731227875, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025430164532735944, + "grad_norm": 6.695409774780273, + "learning_rate": 1e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8396010994911194, + "num_tokens": 340683270.0, + "step": 8932 + }, + { + "epoch": 1.1363694186490267, + "ewc_loss": 0.057733941823244095, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025507379905320704, + "grad_norm": 6.812841415405273, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8652167320251465, + "num_tokens": 340719955.0, + "step": 8933 + }, + { + "epoch": 1.1364966289276173, + "ewc_loss": 0.057523880153894424, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025297317188233137, + "grad_norm": 6.644357681274414, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8632833957672119, + "num_tokens": 340753934.0, + "step": 8934 + }, + { + "epoch": 1.1366238392062078, + "ewc_loss": 0.05776207894086838, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025535517488606274, + "grad_norm": 6.767299175262451, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8688177466392517, + "num_tokens": 340787545.0, + "step": 8935 + }, + { + "epoch": 1.1367510494847983, + "ewc_loss": 0.05779337137937546, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025322669534944, + "grad_norm": 6.6096696853637695, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8716824054718018, + "num_tokens": 340826362.0, + "step": 8936 + }, + { + "epoch": 1.1368782597633889, + "ewc_loss": 0.057659365236759186, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002543280425015837, + "grad_norm": 6.753829479217529, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8666496276855469, + "num_tokens": 340857871.0, + "step": 8937 + }, + { + "epoch": 1.1370054700419794, + "ewc_loss": 0.05755059793591499, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002532403450459242, + "grad_norm": 6.544698238372803, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8679238557815552, + "num_tokens": 340897736.0, + "step": 8938 + }, + { + "epoch": 1.13713268032057, + "ewc_loss": 0.05786953866481781, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002564297756180167, + "grad_norm": 6.742632865905762, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8582872748374939, + "num_tokens": 340935774.0, + "step": 8939 + }, + { + "epoch": 1.1372598905991604, + "ewc_loss": 0.05763332545757294, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002540676505304873, + "grad_norm": 6.691259860992432, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8638205528259277, + "num_tokens": 340971497.0, + "step": 8940 + }, + { + "epoch": 1.137387100877751, + "ewc_loss": 0.057816144078969955, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000255895807640627, + "grad_norm": 6.720972537994385, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8626816272735596, + "num_tokens": 341016209.0, + "step": 8941 + }, + { + "epoch": 1.1375143111563415, + "ewc_loss": 0.05766839161515236, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025441829347983, + "grad_norm": 6.647736549377441, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8573285937309265, + "num_tokens": 341052137.0, + "step": 8942 + }, + { + "epoch": 1.137641521434932, + "ewc_loss": 0.05774650350213051, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002551994111854583, + "grad_norm": 6.746354579925537, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.865180492401123, + "num_tokens": 341083417.0, + "step": 8943 + }, + { + "epoch": 1.1377687317135226, + "ewc_loss": 0.057635582983493805, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025409020599909127, + "grad_norm": 6.628329753875732, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8652119040489197, + "num_tokens": 341127798.0, + "step": 8944 + }, + { + "epoch": 1.137895941992113, + "ewc_loss": 0.05784443020820618, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000256178667768836, + "grad_norm": 6.704409122467041, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.848482608795166, + "num_tokens": 341173485.0, + "step": 8945 + }, + { + "epoch": 1.1380231522707034, + "ewc_loss": 0.057677119970321655, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025450557586736977, + "grad_norm": 6.71539831161499, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8510634303092957, + "num_tokens": 341212834.0, + "step": 8946 + }, + { + "epoch": 1.138150362549294, + "ewc_loss": 0.05748138204216957, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002549896016716957, + "grad_norm": 6.703459739685059, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8555693030357361, + "num_tokens": 341246497.0, + "step": 8947 + }, + { + "epoch": 1.1382775728278844, + "ewc_loss": 0.05744565278291702, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025463232304900885, + "grad_norm": 6.670485973358154, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8449383974075317, + "num_tokens": 341291498.0, + "step": 8948 + }, + { + "epoch": 1.138404783106475, + "ewc_loss": 0.057481907308101654, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002549948694650084, + "grad_norm": 6.726569175720215, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8551590442657471, + "num_tokens": 341328104.0, + "step": 8949 + }, + { + "epoch": 1.1385319933850655, + "ewc_loss": 0.0574883371591568, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002550591598264873, + "grad_norm": 6.733185291290283, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8741271495819092, + "num_tokens": 341359042.0, + "step": 8950 + }, + { + "epoch": 1.138659203663656, + "ewc_loss": 0.05773108825087547, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000255045248195529, + "grad_norm": 6.699436187744141, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8613868951797485, + "num_tokens": 341403529.0, + "step": 8951 + }, + { + "epoch": 1.1387864139422466, + "ewc_loss": 0.05775681883096695, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025530255516059697, + "grad_norm": 6.692821979522705, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.868055522441864, + "num_tokens": 341442509.0, + "step": 8952 + }, + { + "epoch": 1.138913624220837, + "ewc_loss": 0.05780234932899475, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002557578554842621, + "grad_norm": 6.6379923820495605, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8599271774291992, + "num_tokens": 341483015.0, + "step": 8953 + }, + { + "epoch": 1.1390408344994276, + "ewc_loss": 0.05781465768814087, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002558809646870941, + "grad_norm": 6.720145225524902, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8614761829376221, + "num_tokens": 341519676.0, + "step": 8954 + }, + { + "epoch": 1.1391680447780181, + "ewc_loss": 0.057790301740169525, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002556373947300017, + "grad_norm": 6.682602405548096, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8619451522827148, + "num_tokens": 341563483.0, + "step": 8955 + }, + { + "epoch": 1.1392952550566087, + "ewc_loss": 0.05781202018260956, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025585456751286983, + "grad_norm": 6.722752571105957, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.852985143661499, + "num_tokens": 341598387.0, + "step": 8956 + }, + { + "epoch": 1.139422465335199, + "ewc_loss": 0.057787735015153885, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025561172515153885, + "grad_norm": 6.675610542297363, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8656153678894043, + "num_tokens": 341640253.0, + "step": 8957 + }, + { + "epoch": 1.1395496756137895, + "ewc_loss": 0.05780661106109619, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025580046349205077, + "grad_norm": 6.691200256347656, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8882473707199097, + "num_tokens": 341675788.0, + "step": 8958 + }, + { + "epoch": 1.13967688589238, + "ewc_loss": 0.05753503739833832, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025552615988999605, + "grad_norm": 6.686450958251953, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8621371388435364, + "num_tokens": 341716917.0, + "step": 8959 + }, + { + "epoch": 1.1398040961709706, + "ewc_loss": 0.05785944685339928, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025632884353399277, + "grad_norm": 6.712449550628662, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8614641427993774, + "num_tokens": 341758936.0, + "step": 8960 + }, + { + "epoch": 1.139931306449561, + "ewc_loss": 0.05776531249284744, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025538750924170017, + "grad_norm": 6.708407878875732, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.838790774345398, + "num_tokens": 341797612.0, + "step": 8961 + }, + { + "epoch": 1.1400585167281516, + "ewc_loss": 0.05755101889371872, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000255685969023034, + "grad_norm": 6.716163158416748, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8574309945106506, + "num_tokens": 341836061.0, + "step": 8962 + }, + { + "epoch": 1.1401857270067421, + "ewc_loss": 0.057569555938243866, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002558713313192129, + "grad_norm": 6.728745460510254, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8606896996498108, + "num_tokens": 341875465.0, + "step": 8963 + }, + { + "epoch": 1.1403129372853327, + "ewc_loss": 0.05772348493337631, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000254969228990376, + "grad_norm": 6.7277421951293945, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.85678631067276, + "num_tokens": 341919795.0, + "step": 8964 + }, + { + "epoch": 1.1404401475639232, + "ewc_loss": 0.057509321719408035, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025526899844408035, + "grad_norm": 6.765536785125732, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8599192500114441, + "num_tokens": 341953291.0, + "step": 8965 + }, + { + "epoch": 1.1405673578425137, + "ewc_loss": 0.05771470442414284, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025488142273388803, + "grad_norm": 6.68264627456665, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8799607753753662, + "num_tokens": 341988364.0, + "step": 8966 + }, + { + "epoch": 1.1406945681211043, + "ewc_loss": 0.057839177548885345, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002561261644586921, + "grad_norm": 6.7817816734313965, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8656314015388489, + "num_tokens": 342028478.0, + "step": 8967 + }, + { + "epoch": 1.1408217783996948, + "ewc_loss": 0.05774826928973198, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025521707721054554, + "grad_norm": 6.677340984344482, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8697909712791443, + "num_tokens": 342067348.0, + "step": 8968 + }, + { + "epoch": 1.1409489886782853, + "ewc_loss": 0.057535819709300995, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002555339888203889, + "grad_norm": 6.799281120300293, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8582824468612671, + "num_tokens": 342105845.0, + "step": 8969 + }, + { + "epoch": 1.1410761989568756, + "ewc_loss": 0.05769307538866997, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002546651230659336, + "grad_norm": 6.671333312988281, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.861660361289978, + "num_tokens": 342142072.0, + "step": 8970 + }, + { + "epoch": 1.1412034092354661, + "ewc_loss": 0.057903748005628586, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025677186204120517, + "grad_norm": 6.760311126708984, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8644723892211914, + "num_tokens": 342176592.0, + "step": 8971 + }, + { + "epoch": 1.1413306195140567, + "ewc_loss": 0.05774906277656555, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025522499345242977, + "grad_norm": 6.682703495025635, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8665292263031006, + "num_tokens": 342211896.0, + "step": 8972 + }, + { + "epoch": 1.1414578297926472, + "ewc_loss": 0.057845309376716614, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025618745712563396, + "grad_norm": 6.6824564933776855, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8657045960426331, + "num_tokens": 342256078.0, + "step": 8973 + }, + { + "epoch": 1.1415850400712377, + "ewc_loss": 0.05780273675918579, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025576172629371285, + "grad_norm": 6.690273761749268, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8651748895645142, + "num_tokens": 342296766.0, + "step": 8974 + }, + { + "epoch": 1.1417122503498283, + "ewc_loss": 0.057838015258312225, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025611455203033984, + "grad_norm": 6.734641075134277, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8605144023895264, + "num_tokens": 342333689.0, + "step": 8975 + }, + { + "epoch": 1.1418394606284188, + "ewc_loss": 0.05757526308298111, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025592840393073857, + "grad_norm": 6.714966297149658, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8732848763465881, + "num_tokens": 342369605.0, + "step": 8976 + }, + { + "epoch": 1.1419666709070093, + "ewc_loss": 0.05787187069654465, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002564530586823821, + "grad_norm": 6.724262237548828, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.870452880859375, + "num_tokens": 342410806.0, + "step": 8977 + }, + { + "epoch": 1.1420938811855998, + "ewc_loss": 0.05789784714579582, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002567128394730389, + "grad_norm": 6.766991138458252, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8874748945236206, + "num_tokens": 342446957.0, + "step": 8978 + }, + { + "epoch": 1.1422210914641904, + "ewc_loss": 0.05787725746631622, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002565069298725575, + "grad_norm": 6.727452754974365, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8497759103775024, + "num_tokens": 342480717.0, + "step": 8979 + }, + { + "epoch": 1.142348301742781, + "ewc_loss": 0.057613782584667206, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025631359312683344, + "grad_norm": 6.741348743438721, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8698441982269287, + "num_tokens": 342520431.0, + "step": 8980 + }, + { + "epoch": 1.1424755120213712, + "ewc_loss": 0.05761362984776497, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002563120797276497, + "grad_norm": 6.692081928253174, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8585332036018372, + "num_tokens": 342561735.0, + "step": 8981 + }, + { + "epoch": 1.1426027222999617, + "ewc_loss": 0.057908643037080765, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002568208146840334, + "grad_norm": 6.764494895935059, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8743459582328796, + "num_tokens": 342593899.0, + "step": 8982 + }, + { + "epoch": 1.1427299325785523, + "ewc_loss": 0.05766214430332184, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025679724058136344, + "grad_norm": 6.796189308166504, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8614901304244995, + "num_tokens": 342628044.0, + "step": 8983 + }, + { + "epoch": 1.1428571428571428, + "ewc_loss": 0.05770977586507797, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002572735247667879, + "grad_norm": 6.792637348175049, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8593635559082031, + "num_tokens": 342665018.0, + "step": 8984 + }, + { + "epoch": 1.1429843531357333, + "ewc_loss": 0.05755763128399849, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002557520929258317, + "grad_norm": 6.702601909637451, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.850141167640686, + "num_tokens": 342704832.0, + "step": 8985 + }, + { + "epoch": 1.1431115634143239, + "ewc_loss": 0.05767262727022171, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025690204347483814, + "grad_norm": 6.748768329620361, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8707367181777954, + "num_tokens": 342741598.0, + "step": 8986 + }, + { + "epoch": 1.1432387736929144, + "ewc_loss": 0.057632122188806534, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002564970054663718, + "grad_norm": 6.7736992835998535, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8582435846328735, + "num_tokens": 342777302.0, + "step": 8987 + }, + { + "epoch": 1.143365983971505, + "ewc_loss": 0.05756056308746338, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002557814004831016, + "grad_norm": 6.739104270935059, + "learning_rate": 1e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8428729176521301, + "num_tokens": 342809578.0, + "step": 8988 + }, + { + "epoch": 1.1434931942500954, + "ewc_loss": 0.05763714760541916, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025654726778157055, + "grad_norm": 6.674049377441406, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8624023199081421, + "num_tokens": 342845809.0, + "step": 8989 + }, + { + "epoch": 1.143620404528686, + "ewc_loss": 0.05762391537427902, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002564149326644838, + "grad_norm": 6.7198052406311035, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8488869071006775, + "num_tokens": 342886118.0, + "step": 8990 + }, + { + "epoch": 1.1437476148072765, + "ewc_loss": 0.05790754407644272, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025680981343612075, + "grad_norm": 6.753928184509277, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8566315770149231, + "num_tokens": 342922517.0, + "step": 8991 + }, + { + "epoch": 1.143874825085867, + "ewc_loss": 0.057929977774620056, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002570341748651117, + "grad_norm": 6.731907844543457, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8561275005340576, + "num_tokens": 342959741.0, + "step": 8992 + }, + { + "epoch": 1.1440020353644575, + "ewc_loss": 0.0579652301967144, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025738668045960367, + "grad_norm": 6.757909297943115, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.875232458114624, + "num_tokens": 342991418.0, + "step": 8993 + }, + { + "epoch": 1.144129245643048, + "ewc_loss": 0.05791613459587097, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025689569883979857, + "grad_norm": 6.754300594329834, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8803473711013794, + "num_tokens": 343024263.0, + "step": 8994 + }, + { + "epoch": 1.1442564559216384, + "ewc_loss": 0.057905569672584534, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002567900519352406, + "grad_norm": 6.7102813720703125, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8618701100349426, + "num_tokens": 343066388.0, + "step": 8995 + }, + { + "epoch": 1.144383666200229, + "ewc_loss": 0.05797121673822403, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002574465179350227, + "grad_norm": 6.760580539703369, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8642013072967529, + "num_tokens": 343107631.0, + "step": 8996 + }, + { + "epoch": 1.1445108764788194, + "ewc_loss": 0.05788608640432358, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025659523089416325, + "grad_norm": 6.723520755767822, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8689314126968384, + "num_tokens": 343143645.0, + "step": 8997 + }, + { + "epoch": 1.14463808675741, + "ewc_loss": 0.05803894251585007, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002581238222774118, + "grad_norm": 6.800848007202148, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8486137390136719, + "num_tokens": 343182076.0, + "step": 8998 + }, + { + "epoch": 1.1447652970360005, + "ewc_loss": 0.058092206716537476, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002562150184530765, + "grad_norm": 6.654311180114746, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8808861970901489, + "num_tokens": 343222808.0, + "step": 8999 + }, + { + "epoch": 1.144892507314591, + "ewc_loss": 0.058124370872974396, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025897807790897787, + "grad_norm": 6.756272315979004, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8583550453186035, + "num_tokens": 343268229.0, + "step": 9000 + }, + { + "epoch": 1.1450197175931816, + "ewc_loss": 0.057842426002025604, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002561586443334818, + "grad_norm": 6.68231725692749, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.854878306388855, + "num_tokens": 343304459.0, + "step": 9001 + }, + { + "epoch": 1.145146927871772, + "ewc_loss": 0.05795314908027649, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025726587045937777, + "grad_norm": 6.805334568023682, + "learning_rate": 1e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.8240336179733276, + "num_tokens": 343343211.0, + "step": 9002 + }, + { + "epoch": 1.1452741381503626, + "ewc_loss": 0.057923316955566406, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025696755619719625, + "grad_norm": 6.668316841125488, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8582508563995361, + "num_tokens": 343388498.0, + "step": 9003 + }, + { + "epoch": 1.1454013484289531, + "ewc_loss": 0.058021754026412964, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002579519059509039, + "grad_norm": 6.748476982116699, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8595049381256104, + "num_tokens": 343430101.0, + "step": 9004 + }, + { + "epoch": 1.1455285587075437, + "ewc_loss": 0.05789172276854515, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025665160501375794, + "grad_norm": 6.6934990882873535, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8497888445854187, + "num_tokens": 343466454.0, + "step": 9005 + }, + { + "epoch": 1.145655768986134, + "ewc_loss": 0.05798415467143059, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025757591356523335, + "grad_norm": 6.767412185668945, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.876745879650116, + "num_tokens": 343506960.0, + "step": 9006 + }, + { + "epoch": 1.1457829792647245, + "ewc_loss": 0.05784428119659424, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025617718347348273, + "grad_norm": 6.699091911315918, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8697333335876465, + "num_tokens": 343541133.0, + "step": 9007 + }, + { + "epoch": 1.145910189543315, + "ewc_loss": 0.05819897726178169, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025728275068104267, + "grad_norm": 6.71228551864624, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8704564571380615, + "num_tokens": 343580266.0, + "step": 9008 + }, + { + "epoch": 1.1460373998219056, + "ewc_loss": 0.05807376652956009, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002560306456871331, + "grad_norm": 6.716019630432129, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.861136794090271, + "num_tokens": 343611691.0, + "step": 9009 + }, + { + "epoch": 1.146164610100496, + "ewc_loss": 0.058082401752471924, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002561169967520982, + "grad_norm": 6.696217060089111, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8677709102630615, + "num_tokens": 343649355.0, + "step": 9010 + }, + { + "epoch": 1.1462918203790866, + "ewc_loss": 0.05823216587305069, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025761465076357126, + "grad_norm": 6.728323459625244, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8750242590904236, + "num_tokens": 343681606.0, + "step": 9011 + }, + { + "epoch": 1.1464190306576771, + "ewc_loss": 0.05811484903097153, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025644144625402987, + "grad_norm": 6.692067623138428, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8725626468658447, + "num_tokens": 343721592.0, + "step": 9012 + }, + { + "epoch": 1.1465462409362677, + "ewc_loss": 0.05827903375029564, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025808330974541605, + "grad_norm": 6.709462642669678, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8593025207519531, + "num_tokens": 343764084.0, + "step": 9013 + }, + { + "epoch": 1.1466734512148582, + "ewc_loss": 0.05815102905035019, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002568032359704375, + "grad_norm": 6.750177383422852, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.863580584526062, + "num_tokens": 343801213.0, + "step": 9014 + }, + { + "epoch": 1.1468006614934487, + "ewc_loss": 0.058131471276283264, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025660768733359873, + "grad_norm": 6.731198310852051, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8624875545501709, + "num_tokens": 343839761.0, + "step": 9015 + }, + { + "epoch": 1.1469278717720393, + "ewc_loss": 0.05817987769842148, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025709174224175513, + "grad_norm": 6.928714275360107, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8620657920837402, + "num_tokens": 343877792.0, + "step": 9016 + }, + { + "epoch": 1.1470550820506298, + "ewc_loss": 0.05802633985877037, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025555636966601014, + "grad_norm": 6.650309085845947, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.868412971496582, + "num_tokens": 343915107.0, + "step": 9017 + }, + { + "epoch": 1.1471822923292203, + "ewc_loss": 0.058233343064785004, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002576263796072453, + "grad_norm": 6.783921241760254, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.866127610206604, + "num_tokens": 343949783.0, + "step": 9018 + }, + { + "epoch": 1.1473095026078106, + "ewc_loss": 0.05806436389684677, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025593661121092737, + "grad_norm": 6.705158233642578, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8440399169921875, + "num_tokens": 343991058.0, + "step": 9019 + }, + { + "epoch": 1.1474367128864011, + "ewc_loss": 0.05815737694501877, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002568667405284941, + "grad_norm": 6.723654747009277, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8637245893478394, + "num_tokens": 344030424.0, + "step": 9020 + }, + { + "epoch": 1.1475639231649917, + "ewc_loss": 0.05787214636802673, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025645585265010595, + "grad_norm": 6.638236999511719, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8640743494033813, + "num_tokens": 344071385.0, + "step": 9021 + }, + { + "epoch": 1.1476911334435822, + "ewc_loss": 0.058289676904678345, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025818971334956586, + "grad_norm": 6.76492166519165, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8710274696350098, + "num_tokens": 344109725.0, + "step": 9022 + }, + { + "epoch": 1.1478183437221727, + "ewc_loss": 0.05797649919986725, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025749937049113214, + "grad_norm": 6.761674880981445, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8692305088043213, + "num_tokens": 344143947.0, + "step": 9023 + }, + { + "epoch": 1.1479455540007633, + "ewc_loss": 0.058015644550323486, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002578908170107752, + "grad_norm": 6.801908493041992, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8508883118629456, + "num_tokens": 344179969.0, + "step": 9024 + }, + { + "epoch": 1.1480727642793538, + "ewc_loss": 0.05792316794395447, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002569660427980125, + "grad_norm": 6.733325958251953, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8735369443893433, + "num_tokens": 344212279.0, + "step": 9025 + }, + { + "epoch": 1.1481999745579443, + "ewc_loss": 0.058039598166942596, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002581303706392646, + "grad_norm": 6.71967077255249, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8686450719833374, + "num_tokens": 344248402.0, + "step": 9026 + }, + { + "epoch": 1.1483271848365348, + "ewc_loss": 0.05829249322414398, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025821791496127844, + "grad_norm": 6.782512187957764, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.859256386756897, + "num_tokens": 344286557.0, + "step": 9027 + }, + { + "epoch": 1.1484543951151254, + "ewc_loss": 0.058242328464984894, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002577162522356957, + "grad_norm": 6.644770622253418, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8629121780395508, + "num_tokens": 344331014.0, + "step": 9028 + }, + { + "epoch": 1.148581605393716, + "ewc_loss": 0.05837304890155792, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025902347988449037, + "grad_norm": 6.819295883178711, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8579007387161255, + "num_tokens": 344373239.0, + "step": 9029 + }, + { + "epoch": 1.1487088156723062, + "ewc_loss": 0.058179594576358795, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025708889006637037, + "grad_norm": 6.685710906982422, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8773505687713623, + "num_tokens": 344409871.0, + "step": 9030 + }, + { + "epoch": 1.1488360259508967, + "ewc_loss": 0.058397434651851654, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025926734087988734, + "grad_norm": 6.7860107421875, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8533397912979126, + "num_tokens": 344452108.0, + "step": 9031 + }, + { + "epoch": 1.1489632362294873, + "ewc_loss": 0.058169007301330566, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025698303943499923, + "grad_norm": 6.762205123901367, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8569273948669434, + "num_tokens": 344489798.0, + "step": 9032 + }, + { + "epoch": 1.1490904465080778, + "ewc_loss": 0.05824536830186844, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025774663663469255, + "grad_norm": 6.769279956817627, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8707196116447449, + "num_tokens": 344528194.0, + "step": 9033 + }, + { + "epoch": 1.1492176567866683, + "ewc_loss": 0.05786404758691788, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025637485668994486, + "grad_norm": 6.741340160369873, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8475037813186646, + "num_tokens": 344559699.0, + "step": 9034 + }, + { + "epoch": 1.1493448670652588, + "ewc_loss": 0.05823136121034622, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002576065598987043, + "grad_norm": 6.750485420227051, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8553383946418762, + "num_tokens": 344598234.0, + "step": 9035 + }, + { + "epoch": 1.1494720773438494, + "ewc_loss": 0.0578775554895401, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000256509956670925, + "grad_norm": 6.67624568939209, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8553928136825562, + "num_tokens": 344634780.0, + "step": 9036 + }, + { + "epoch": 1.14959928762244, + "ewc_loss": 0.058112747967243195, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025886186631396413, + "grad_norm": 6.725908279418945, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8660125732421875, + "num_tokens": 344677167.0, + "step": 9037 + }, + { + "epoch": 1.1497264979010304, + "ewc_loss": 0.05800725519657135, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002578069397713989, + "grad_norm": 6.745330333709717, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8490525484085083, + "num_tokens": 344713824.0, + "step": 9038 + }, + { + "epoch": 1.149853708179621, + "ewc_loss": 0.057954445481300354, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000257278821663931, + "grad_norm": 6.684413433074951, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.849639892578125, + "num_tokens": 344755814.0, + "step": 9039 + }, + { + "epoch": 1.1499809184582115, + "ewc_loss": 0.058047130703926086, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002582056913524866, + "grad_norm": 6.702722549438477, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8771651983261108, + "num_tokens": 344795022.0, + "step": 9040 + }, + { + "epoch": 1.150108128736802, + "ewc_loss": 0.058056414127349854, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002582985325716436, + "grad_norm": 6.732354640960693, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8718166351318359, + "num_tokens": 344833906.0, + "step": 9041 + }, + { + "epoch": 1.1502353390153925, + "ewc_loss": 0.05808769911527634, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000258611369645223, + "grad_norm": 6.751445293426514, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8669657111167908, + "num_tokens": 344875975.0, + "step": 9042 + }, + { + "epoch": 1.150362549293983, + "ewc_loss": 0.05825500935316086, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002578430576249957, + "grad_norm": 6.7492828369140625, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.865892767906189, + "num_tokens": 344915433.0, + "step": 9043 + }, + { + "epoch": 1.1504897595725734, + "ewc_loss": 0.05812543258070946, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002589887008070946, + "grad_norm": 6.795882701873779, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8656208515167236, + "num_tokens": 344949038.0, + "step": 9044 + }, + { + "epoch": 1.150616969851164, + "ewc_loss": 0.05799415707588196, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025767594343051314, + "grad_norm": 6.7733988761901855, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8633977174758911, + "num_tokens": 344988260.0, + "step": 9045 + }, + { + "epoch": 1.1507441801297544, + "ewc_loss": 0.058038532733917236, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025811968953348696, + "grad_norm": 6.731870174407959, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8742667436599731, + "num_tokens": 345024355.0, + "step": 9046 + }, + { + "epoch": 1.150871390408345, + "ewc_loss": 0.057963430881500244, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002573686942923814, + "grad_norm": 6.746349811553955, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8714642524719238, + "num_tokens": 345063041.0, + "step": 9047 + }, + { + "epoch": 1.1509986006869355, + "ewc_loss": 0.057968053966760635, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002574149111751467, + "grad_norm": 6.744258403778076, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8811166286468506, + "num_tokens": 345097813.0, + "step": 9048 + }, + { + "epoch": 1.151125810965526, + "ewc_loss": 0.057959914207458496, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002573335077613592, + "grad_norm": 6.737755298614502, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8581376075744629, + "num_tokens": 345134534.0, + "step": 9049 + }, + { + "epoch": 1.1512530212441165, + "ewc_loss": 0.058018412441015244, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025791849475353956, + "grad_norm": 6.8282647132873535, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8661125898361206, + "num_tokens": 345171018.0, + "step": 9050 + }, + { + "epoch": 1.151380231522707, + "ewc_loss": 0.05790906399488449, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025682500563561916, + "grad_norm": 6.728102207183838, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8743975162506104, + "num_tokens": 345208006.0, + "step": 9051 + }, + { + "epoch": 1.1515074418012976, + "ewc_loss": 0.05798272788524628, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025756165268830955, + "grad_norm": 6.717810153961182, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8799006938934326, + "num_tokens": 345246616.0, + "step": 9052 + }, + { + "epoch": 1.1516346520798881, + "ewc_loss": 0.05794667452573776, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000257201143540442, + "grad_norm": 6.7005615234375, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8786170482635498, + "num_tokens": 345288095.0, + "step": 9053 + }, + { + "epoch": 1.1517618623584787, + "ewc_loss": 0.057995617389678955, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025769052444957197, + "grad_norm": 6.747936248779297, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8499472141265869, + "num_tokens": 345333291.0, + "step": 9054 + }, + { + "epoch": 1.151889072637069, + "ewc_loss": 0.05775493383407593, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.000257725128903985, + "grad_norm": 6.752658843994141, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8609635233879089, + "num_tokens": 345375093.0, + "step": 9055 + }, + { + "epoch": 1.1520162829156595, + "ewc_loss": 0.057690490037202835, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025708068278618157, + "grad_norm": 6.763147354125977, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8565404415130615, + "num_tokens": 345415547.0, + "step": 9056 + }, + { + "epoch": 1.15214349319425, + "ewc_loss": 0.05793490260839462, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002570834185462445, + "grad_norm": 6.7343549728393555, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8669930100440979, + "num_tokens": 345457340.0, + "step": 9057 + }, + { + "epoch": 1.1522707034728406, + "ewc_loss": 0.0576469823718071, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002566455805208534, + "grad_norm": 6.777336597442627, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8583724498748779, + "num_tokens": 345494711.0, + "step": 9058 + }, + { + "epoch": 1.152397913751431, + "ewc_loss": 0.05772693455219269, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002574451209511608, + "grad_norm": 6.7058820724487305, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8542169332504272, + "num_tokens": 345534249.0, + "step": 9059 + }, + { + "epoch": 1.1525251240300216, + "ewc_loss": 0.05787472426891327, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002589230425655842, + "grad_norm": 6.791039943695068, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8616699576377869, + "num_tokens": 345570832.0, + "step": 9060 + }, + { + "epoch": 1.1526523343086121, + "ewc_loss": 0.05767223611474037, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025689814356155694, + "grad_norm": 6.750864505767822, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8581662178039551, + "num_tokens": 345609007.0, + "step": 9061 + }, + { + "epoch": 1.1527795445872027, + "ewc_loss": 0.05779080092906952, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002580838045105338, + "grad_norm": 6.769082069396973, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.874468207359314, + "num_tokens": 345647979.0, + "step": 9062 + }, + { + "epoch": 1.1529067548657932, + "ewc_loss": 0.05796560272574425, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025739040574990213, + "grad_norm": 6.808261394500732, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8557935953140259, + "num_tokens": 345681821.0, + "step": 9063 + }, + { + "epoch": 1.1530339651443837, + "ewc_loss": 0.05770805478096008, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025725632440298796, + "grad_norm": 6.708531856536865, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.852357029914856, + "num_tokens": 345724274.0, + "step": 9064 + }, + { + "epoch": 1.1531611754229742, + "ewc_loss": 0.05796048790216446, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002573392412159592, + "grad_norm": 6.741461277008057, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8799247741699219, + "num_tokens": 345763558.0, + "step": 9065 + }, + { + "epoch": 1.1532883857015648, + "ewc_loss": 0.057955238968133926, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002572867670096457, + "grad_norm": 6.754245758056641, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8577295541763306, + "num_tokens": 345798486.0, + "step": 9066 + }, + { + "epoch": 1.1534155959801553, + "ewc_loss": 0.05764990299940109, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.00025667480076663196, + "grad_norm": 6.694845199584961, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8598904609680176, + "num_tokens": 345839379.0, + "step": 9067 + }, + { + "epoch": 1.1535428062587456, + "ewc_loss": 0.05798402428627014, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002575746038928628, + "grad_norm": 6.7369208335876465, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.858853816986084, + "num_tokens": 345876675.0, + "step": 9068 + }, + { + "epoch": 1.1536700165373361, + "ewc_loss": 0.057926930487155914, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002570037031546235, + "grad_norm": 6.688419342041016, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8816434741020203, + "num_tokens": 345913482.0, + "step": 9069 + }, + { + "epoch": 1.1537972268159267, + "ewc_loss": 0.05801425129175186, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025787687627598643, + "grad_norm": 6.739912986755371, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8571983575820923, + "num_tokens": 345947671.0, + "step": 9070 + }, + { + "epoch": 1.1539244370945172, + "ewc_loss": 0.058062948286533356, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025836387067101896, + "grad_norm": 6.743738651275635, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.866336464881897, + "num_tokens": 345981520.0, + "step": 9071 + }, + { + "epoch": 1.1540516473731077, + "ewc_loss": 0.057973310351371765, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000257467501796782, + "grad_norm": 6.745538711547852, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.84548419713974, + "num_tokens": 346020050.0, + "step": 9072 + }, + { + "epoch": 1.1541788576516983, + "ewc_loss": 0.05796723812818527, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002574067621026188, + "grad_norm": 6.760586738586426, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8535920977592468, + "num_tokens": 346050000.0, + "step": 9073 + }, + { + "epoch": 1.1543060679302888, + "ewc_loss": 0.05806111916899681, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025834556436166167, + "grad_norm": 6.813905715942383, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8571294546127319, + "num_tokens": 346085038.0, + "step": 9074 + }, + { + "epoch": 1.1544332782088793, + "ewc_loss": 0.05822396278381348, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025753260706551373, + "grad_norm": 6.652957916259766, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8734235763549805, + "num_tokens": 346125871.0, + "step": 9075 + }, + { + "epoch": 1.1545604884874698, + "ewc_loss": 0.05846825987100601, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002599755534902215, + "grad_norm": 6.773715496063232, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.871086835861206, + "num_tokens": 346160460.0, + "step": 9076 + }, + { + "epoch": 1.1546876987660604, + "ewc_loss": 0.05818261206150055, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025711909984238446, + "grad_norm": 6.744453430175781, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8518226146697998, + "num_tokens": 346201621.0, + "step": 9077 + }, + { + "epoch": 1.154814909044651, + "ewc_loss": 0.05808534473180771, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025858779554255307, + "grad_norm": 6.727840423583984, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8645803928375244, + "num_tokens": 346242680.0, + "step": 9078 + }, + { + "epoch": 1.1549421193232412, + "ewc_loss": 0.058061547577381134, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002583498426247388, + "grad_norm": 6.733937740325928, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8535904884338379, + "num_tokens": 346275774.0, + "step": 9079 + }, + { + "epoch": 1.1550693296018317, + "ewc_loss": 0.05805826187133789, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025831698440015316, + "grad_norm": 6.674989700317383, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8645084500312805, + "num_tokens": 346317648.0, + "step": 9080 + }, + { + "epoch": 1.1551965398804223, + "ewc_loss": 0.05847860872745514, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002600790758151561, + "grad_norm": 6.729493141174316, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8650153875350952, + "num_tokens": 346357736.0, + "step": 9081 + }, + { + "epoch": 1.1553237501590128, + "ewc_loss": 0.05818822979927063, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002596166741568595, + "grad_norm": 6.681362152099609, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8608489036560059, + "num_tokens": 346397961.0, + "step": 9082 + }, + { + "epoch": 1.1554509604376033, + "ewc_loss": 0.05811825022101402, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025891687255352736, + "grad_norm": 6.731251239776611, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8744956851005554, + "num_tokens": 346438268.0, + "step": 9083 + }, + { + "epoch": 1.1555781707161938, + "ewc_loss": 0.0584559440612793, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002598523860797286, + "grad_norm": 6.728797435760498, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8532937169075012, + "num_tokens": 346479024.0, + "step": 9084 + }, + { + "epoch": 1.1557053809947844, + "ewc_loss": 0.05839620903134346, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002592550590634346, + "grad_norm": 6.673603534698486, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8688238859176636, + "num_tokens": 346523526.0, + "step": 9085 + }, + { + "epoch": 1.155832591273375, + "ewc_loss": 0.05850165709853172, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000260309549048543, + "grad_norm": 6.726600646972656, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8709155321121216, + "num_tokens": 346561985.0, + "step": 9086 + }, + { + "epoch": 1.1559598015519654, + "ewc_loss": 0.05842222273349762, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025951521820388734, + "grad_norm": 6.712187767028809, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8501235246658325, + "num_tokens": 346605600.0, + "step": 9087 + }, + { + "epoch": 1.156087011830556, + "ewc_loss": 0.05846653878688812, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025995838223025203, + "grad_norm": 6.75085973739624, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8758372068405151, + "num_tokens": 346640798.0, + "step": 9088 + }, + { + "epoch": 1.1562142221091465, + "ewc_loss": 0.058407075703144073, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00025936373276636004, + "grad_norm": 6.699669361114502, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8580156564712524, + "num_tokens": 346678483.0, + "step": 9089 + }, + { + "epoch": 1.156341432387737, + "ewc_loss": 0.05844484269618988, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002597414131741971, + "grad_norm": 6.7461419105529785, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8792262077331543, + "num_tokens": 346714604.0, + "step": 9090 + }, + { + "epoch": 1.1564686426663275, + "ewc_loss": 0.05814211070537567, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002591554948594421, + "grad_norm": 6.685303688049316, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8748863935470581, + "num_tokens": 346747203.0, + "step": 9091 + }, + { + "epoch": 1.156595852944918, + "ewc_loss": 0.05852285027503967, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002605214831419289, + "grad_norm": 6.715179443359375, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8608095049858093, + "num_tokens": 346790650.0, + "step": 9092 + }, + { + "epoch": 1.1567230632235084, + "ewc_loss": 0.05839978903532028, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002592908567748964, + "grad_norm": 6.74949836730957, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8836466670036316, + "num_tokens": 346822872.0, + "step": 9093 + }, + { + "epoch": 1.156850273502099, + "ewc_loss": 0.05822652205824852, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025999959325417876, + "grad_norm": 6.736344337463379, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8691636323928833, + "num_tokens": 346857999.0, + "step": 9094 + }, + { + "epoch": 1.1569774837806894, + "ewc_loss": 0.05815167352557182, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002592511009424925, + "grad_norm": 6.718733310699463, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8702902793884277, + "num_tokens": 346891804.0, + "step": 9095 + }, + { + "epoch": 1.15710469405928, + "ewc_loss": 0.05789768323302269, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002591526135802269, + "grad_norm": 6.733280181884766, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8614290356636047, + "num_tokens": 346931554.0, + "step": 9096 + }, + { + "epoch": 1.1572319043378705, + "ewc_loss": 0.05822429060935974, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002599772997200489, + "grad_norm": 6.765374183654785, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8500921726226807, + "num_tokens": 346965214.0, + "step": 9097 + }, + { + "epoch": 1.157359114616461, + "ewc_loss": 0.05807642266154289, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002584985923022032, + "grad_norm": 6.662513732910156, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8745547533035278, + "num_tokens": 347001219.0, + "step": 9098 + }, + { + "epoch": 1.1574863248950515, + "ewc_loss": 0.05822474882006645, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002599818690214306, + "grad_norm": 6.736234188079834, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8456076383590698, + "num_tokens": 347043212.0, + "step": 9099 + }, + { + "epoch": 1.157613535173642, + "ewc_loss": 0.05792110413312912, + "ewc_loss_diag": 3.1948089599609375e-05, + "ewc_loss_parallel": 0.0002593868412077427, + "grad_norm": 6.715951919555664, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8755531907081604, + "num_tokens": 347082249.0, + "step": 9100 + }, + { + "epoch": 1.1577407454522326, + "ewc_loss": 0.05818624794483185, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002595968544483185, + "grad_norm": 6.713191986083984, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8595400452613831, + "num_tokens": 347119286.0, + "step": 9101 + }, + { + "epoch": 1.1578679557308231, + "ewc_loss": 0.05816052854061127, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002593396638985723, + "grad_norm": 6.680151462554932, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8749991655349731, + "num_tokens": 347155549.0, + "step": 9102 + }, + { + "epoch": 1.1579951660094137, + "ewc_loss": 0.05822645127773285, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00025999886565841734, + "grad_norm": 6.710250377655029, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8653550744056702, + "num_tokens": 347196846.0, + "step": 9103 + }, + { + "epoch": 1.158122376288004, + "ewc_loss": 0.05823637917637825, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002600981679279357, + "grad_norm": 6.651902675628662, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8771438002586365, + "num_tokens": 347234028.0, + "step": 9104 + }, + { + "epoch": 1.1582495865665945, + "ewc_loss": 0.05859158933162689, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026120885740965605, + "grad_norm": 6.73218297958374, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8737004399299622, + "num_tokens": 347276931.0, + "step": 9105 + }, + { + "epoch": 1.158376796845185, + "ewc_loss": 0.05847521498799324, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002600451116450131, + "grad_norm": 13.256133079528809, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8592750430107117, + "num_tokens": 347317139.0, + "step": 9106 + }, + { + "epoch": 1.1585040071237755, + "ewc_loss": 0.06792967021465302, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00035458969068713486, + "grad_norm": 7.873416423797607, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8641419410705566, + "num_tokens": 347355718.0, + "step": 9107 + }, + { + "epoch": 1.158631217402366, + "ewc_loss": 0.0569189079105854, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00024448204203508794, + "grad_norm": 6.3979387283325195, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8528326153755188, + "num_tokens": 347393403.0, + "step": 9108 + }, + { + "epoch": 1.1587584276809566, + "ewc_loss": 0.06054745614528656, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00028076753369532526, + "grad_norm": 7.218267440795898, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8698091506958008, + "num_tokens": 347433736.0, + "step": 9109 + }, + { + "epoch": 1.1588856379595471, + "ewc_loss": 0.059171855449676514, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670115209184587, + "grad_norm": 6.643831729888916, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8644342422485352, + "num_tokens": 347474574.0, + "step": 9110 + }, + { + "epoch": 1.1590128482381377, + "ewc_loss": 0.05942051112651825, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002719394979067147, + "grad_norm": 7.011052131652832, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8699262142181396, + "num_tokens": 347518785.0, + "step": 9111 + }, + { + "epoch": 1.1591400585167282, + "ewc_loss": 0.05885778367519379, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026631218497641385, + "grad_norm": 6.757633686065674, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8513760566711426, + "num_tokens": 347557518.0, + "step": 9112 + }, + { + "epoch": 1.1592672687953187, + "ewc_loss": 0.05898575484752655, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026759193860925734, + "grad_norm": 6.945576190948486, + "learning_rate": 1e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8343667387962341, + "num_tokens": 347600094.0, + "step": 9113 + }, + { + "epoch": 1.1593944790739092, + "ewc_loss": 0.058862797915935516, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002663623308762908, + "grad_norm": 6.84078311920166, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8621166944503784, + "num_tokens": 347633709.0, + "step": 9114 + }, + { + "epoch": 1.1595216893524998, + "ewc_loss": 0.05869210511445999, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002646554494276643, + "grad_norm": 6.812698841094971, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8717761635780334, + "num_tokens": 347670027.0, + "step": 9115 + }, + { + "epoch": 1.1596488996310903, + "ewc_loss": 0.0586928054690361, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002646624343469739, + "grad_norm": 6.768131732940674, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8681083917617798, + "num_tokens": 347707488.0, + "step": 9116 + }, + { + "epoch": 1.1597761099096806, + "ewc_loss": 0.05864396318793297, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000264174013864249, + "grad_norm": 6.804678440093994, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8590322136878967, + "num_tokens": 347746030.0, + "step": 9117 + }, + { + "epoch": 1.1599033201882711, + "ewc_loss": 0.0585557222366333, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002632915857248008, + "grad_norm": 6.813652515411377, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8561372756958008, + "num_tokens": 347783409.0, + "step": 9118 + }, + { + "epoch": 1.1600305304668617, + "ewc_loss": 0.058457210659980774, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002623064792715013, + "grad_norm": 6.768075466156006, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8701568841934204, + "num_tokens": 347821335.0, + "step": 9119 + }, + { + "epoch": 1.1601577407454522, + "ewc_loss": 0.05880975350737572, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026339050964452326, + "grad_norm": 13.317852020263672, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.873599648475647, + "num_tokens": 347857590.0, + "step": 9120 + }, + { + "epoch": 1.1602849510240427, + "ewc_loss": 0.06841301172971725, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00035942308022640646, + "grad_norm": 8.000383377075195, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8753762245178223, + "num_tokens": 347897000.0, + "step": 9121 + }, + { + "epoch": 1.1604121613026332, + "ewc_loss": 0.057056985795497894, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00024586281506344676, + "grad_norm": 6.464053630828857, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8635138273239136, + "num_tokens": 347928947.0, + "step": 9122 + }, + { + "epoch": 1.1605393715812238, + "ewc_loss": 0.06064271181821823, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002817200729623437, + "grad_norm": 7.326141357421875, + "learning_rate": 1e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8328798413276672, + "num_tokens": 347959294.0, + "step": 9123 + }, + { + "epoch": 1.1606665818598143, + "ewc_loss": 0.059170983731746674, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026944419369101524, + "grad_norm": 6.731132984161377, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8695052266120911, + "num_tokens": 347993103.0, + "step": 9124 + }, + { + "epoch": 1.1607937921384048, + "ewc_loss": 0.05938383564352989, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002715727314352989, + "grad_norm": 7.056757926940918, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8755162358283997, + "num_tokens": 348028412.0, + "step": 9125 + }, + { + "epoch": 1.1609210024169954, + "ewc_loss": 0.05898270756006241, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002675614377949387, + "grad_norm": 6.742534160614014, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8461810350418091, + "num_tokens": 348071429.0, + "step": 9126 + }, + { + "epoch": 1.161048212695586, + "ewc_loss": 0.059139788150787354, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026913225883617997, + "grad_norm": 6.99099588394165, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8681097626686096, + "num_tokens": 348113222.0, + "step": 9127 + }, + { + "epoch": 1.1611754229741762, + "ewc_loss": 0.05889291316270828, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002666634973138571, + "grad_norm": 6.835328578948975, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8616666197776794, + "num_tokens": 348150188.0, + "step": 9128 + }, + { + "epoch": 1.1613026332527667, + "ewc_loss": 0.058731526136398315, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026504966081120074, + "grad_norm": 6.812734603881836, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8665953874588013, + "num_tokens": 348188547.0, + "step": 9129 + }, + { + "epoch": 1.1614298435313573, + "ewc_loss": 0.05877194181084633, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026545379660092294, + "grad_norm": 6.8580121994018555, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8584166765213013, + "num_tokens": 348228624.0, + "step": 9130 + }, + { + "epoch": 1.1615570538099478, + "ewc_loss": 0.0586157850921154, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002638922305777669, + "grad_norm": 6.8300395011901855, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8599332571029663, + "num_tokens": 348264932.0, + "step": 9131 + }, + { + "epoch": 1.1616842640885383, + "ewc_loss": 0.05856875702738762, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026342194178141654, + "grad_norm": 6.787107467651367, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8653680086135864, + "num_tokens": 348307680.0, + "step": 9132 + }, + { + "epoch": 1.1618114743671288, + "ewc_loss": 0.05879562720656395, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002632492396514863, + "grad_norm": 6.781930446624756, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.877800703048706, + "num_tokens": 348346449.0, + "step": 9133 + }, + { + "epoch": 1.1619386846457194, + "ewc_loss": 0.058464013040065765, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026237452402710915, + "grad_norm": 6.8136305809021, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8741497993469238, + "num_tokens": 348386393.0, + "step": 9134 + }, + { + "epoch": 1.16206589492431, + "ewc_loss": 0.05861332267522812, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002614261757116765, + "grad_norm": 6.784045696258545, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.857649564743042, + "num_tokens": 348429458.0, + "step": 9135 + }, + { + "epoch": 1.1621931052029004, + "ewc_loss": 0.058403003960847855, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026176442042924464, + "grad_norm": 6.78160285949707, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8514082431793213, + "num_tokens": 348471365.0, + "step": 9136 + }, + { + "epoch": 1.162320315481491, + "ewc_loss": 0.05842778831720352, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002620122395455837, + "grad_norm": 6.816822052001953, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8471654653549194, + "num_tokens": 348513425.0, + "step": 9137 + }, + { + "epoch": 1.1624475257600815, + "ewc_loss": 0.05838726460933685, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002616070269141346, + "grad_norm": 6.778071403503418, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8608192205429077, + "num_tokens": 348551181.0, + "step": 9138 + }, + { + "epoch": 1.162574736038672, + "ewc_loss": 0.05834154039621353, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026114980573765934, + "grad_norm": 6.751814842224121, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8843370079994202, + "num_tokens": 348583611.0, + "step": 9139 + }, + { + "epoch": 1.1627019463172625, + "ewc_loss": 0.05839964747428894, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026173083460889757, + "grad_norm": 6.740142345428467, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8616573810577393, + "num_tokens": 348624281.0, + "step": 9140 + }, + { + "epoch": 1.162829156595853, + "ewc_loss": 0.05843605101108551, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026209489442408085, + "grad_norm": 6.762727737426758, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8616799116134644, + "num_tokens": 348661416.0, + "step": 9141 + }, + { + "epoch": 1.1629563668744434, + "ewc_loss": 0.05835190787911415, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026125344447791576, + "grad_norm": 6.7902607917785645, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8422501087188721, + "num_tokens": 348703907.0, + "step": 9142 + }, + { + "epoch": 1.163083577153034, + "ewc_loss": 0.05839996412396431, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026173400692641735, + "grad_norm": 6.857538223266602, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8752331137657166, + "num_tokens": 348735617.0, + "step": 9143 + }, + { + "epoch": 1.1632107874316244, + "ewc_loss": 0.05836441367864609, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002613785327412188, + "grad_norm": 6.761415004730225, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.861819863319397, + "num_tokens": 348773552.0, + "step": 9144 + }, + { + "epoch": 1.163337997710215, + "ewc_loss": 0.05844520032405853, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002621863968670368, + "grad_norm": 6.794699192047119, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8642290234565735, + "num_tokens": 348811274.0, + "step": 9145 + }, + { + "epoch": 1.1634652079888055, + "ewc_loss": 0.05831360071897507, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026087037986144423, + "grad_norm": 6.688323497772217, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8519977927207947, + "num_tokens": 348847773.0, + "step": 9146 + }, + { + "epoch": 1.163592418267396, + "ewc_loss": 0.05852430313825607, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026297738077118993, + "grad_norm": 6.7898430824279785, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8546320199966431, + "num_tokens": 348886356.0, + "step": 9147 + }, + { + "epoch": 1.1637196285459865, + "ewc_loss": 0.058439966291189194, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026213403907604516, + "grad_norm": 6.732491493225098, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8748692274093628, + "num_tokens": 348922892.0, + "step": 9148 + }, + { + "epoch": 1.163846838824577, + "ewc_loss": 0.05847501754760742, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026248456561006606, + "grad_norm": 6.746056079864502, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8817712068557739, + "num_tokens": 348954924.0, + "step": 9149 + }, + { + "epoch": 1.1639740491031676, + "ewc_loss": 0.05840909481048584, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002618253056425601, + "grad_norm": 6.748642444610596, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8684948086738586, + "num_tokens": 348997335.0, + "step": 9150 + }, + { + "epoch": 1.1641012593817581, + "ewc_loss": 0.058493249118328094, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002626668429002166, + "grad_norm": 6.784289836883545, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8523958921432495, + "num_tokens": 349038221.0, + "step": 9151 + }, + { + "epoch": 1.1642284696603487, + "ewc_loss": 0.05841228365898132, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026185723254457116, + "grad_norm": 6.7211480140686035, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.858478307723999, + "num_tokens": 349076559.0, + "step": 9152 + }, + { + "epoch": 1.164355679938939, + "ewc_loss": 0.0585516095161438, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002632504911161959, + "grad_norm": 6.8239054679870605, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8414767980575562, + "num_tokens": 349112264.0, + "step": 9153 + }, + { + "epoch": 1.1644828902175295, + "ewc_loss": 0.05840548500418663, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002617892168927938, + "grad_norm": 6.703481197357178, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8747670650482178, + "num_tokens": 349146571.0, + "step": 9154 + }, + { + "epoch": 1.16461010049612, + "ewc_loss": 0.05865945667028427, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026432896265760064, + "grad_norm": 6.801115989685059, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8707901835441589, + "num_tokens": 349183674.0, + "step": 9155 + }, + { + "epoch": 1.1647373107747105, + "ewc_loss": 0.058433808386325836, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002620724553707987, + "grad_norm": 6.672566890716553, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8686873912811279, + "num_tokens": 349227042.0, + "step": 9156 + }, + { + "epoch": 1.164864521053301, + "ewc_loss": 0.058586493134498596, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026359930052421987, + "grad_norm": 6.785762786865234, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8580417037010193, + "num_tokens": 349264072.0, + "step": 9157 + }, + { + "epoch": 1.1649917313318916, + "ewc_loss": 0.058405518531799316, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026178956613875926, + "grad_norm": 6.714595317840576, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8645265698432922, + "num_tokens": 349304427.0, + "step": 9158 + }, + { + "epoch": 1.1651189416104821, + "ewc_loss": 0.058530502021312714, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026303937193006277, + "grad_norm": 6.709099292755127, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8615144491195679, + "num_tokens": 349346708.0, + "step": 9159 + }, + { + "epoch": 1.1652461518890727, + "ewc_loss": 0.05848715454339981, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026260592858307064, + "grad_norm": 6.722646236419678, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8618724942207336, + "num_tokens": 349387716.0, + "step": 9160 + }, + { + "epoch": 1.1653733621676632, + "ewc_loss": 0.05852051451802254, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002629395166877657, + "grad_norm": 6.784744739532471, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8598397970199585, + "num_tokens": 349425757.0, + "step": 9161 + }, + { + "epoch": 1.1655005724462537, + "ewc_loss": 0.05873128026723862, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026260578306391835, + "grad_norm": 6.736293792724609, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.870284914970398, + "num_tokens": 349472097.0, + "step": 9162 + }, + { + "epoch": 1.1656277827248442, + "ewc_loss": 0.058519285172224045, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000262927234871313, + "grad_norm": 6.801213264465332, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.860488772392273, + "num_tokens": 349513809.0, + "step": 9163 + }, + { + "epoch": 1.1657549930034348, + "ewc_loss": 0.0584762766957283, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002624971675686538, + "grad_norm": 6.799738883972168, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8782775402069092, + "num_tokens": 349544906.0, + "step": 9164 + }, + { + "epoch": 1.1658822032820253, + "ewc_loss": 0.058576785027980804, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026350223924964666, + "grad_norm": 6.802241802215576, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8582332134246826, + "num_tokens": 349582529.0, + "step": 9165 + }, + { + "epoch": 1.1660094135606156, + "ewc_loss": 0.05856255441904068, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026091851759701967, + "grad_norm": 6.790985584259033, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8586633205413818, + "num_tokens": 349624871.0, + "step": 9166 + }, + { + "epoch": 1.1661366238392061, + "ewc_loss": 0.05843392759561539, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002620736777316779, + "grad_norm": 6.787073612213135, + "learning_rate": 1e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8279232978820801, + "num_tokens": 349660783.0, + "step": 9167 + }, + { + "epoch": 1.1662638341177967, + "ewc_loss": 0.05837467312812805, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026148109463974833, + "grad_norm": 6.764308929443359, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8736900687217712, + "num_tokens": 349698430.0, + "step": 9168 + }, + { + "epoch": 1.1663910443963872, + "ewc_loss": 0.058417584747076035, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002619102306198329, + "grad_norm": 6.802495956420898, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8550373315811157, + "num_tokens": 349734981.0, + "step": 9169 + }, + { + "epoch": 1.1665182546749777, + "ewc_loss": 0.058378543704748154, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002615198027342558, + "grad_norm": 6.746789455413818, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.857071578502655, + "num_tokens": 349774267.0, + "step": 9170 + }, + { + "epoch": 1.1666454649535682, + "ewc_loss": 0.058471255004405975, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026244690525345504, + "grad_norm": 6.760149955749512, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.861268162727356, + "num_tokens": 349811400.0, + "step": 9171 + }, + { + "epoch": 1.1667726752321588, + "ewc_loss": 0.05839361250400543, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026167050236836076, + "grad_norm": 6.732793807983398, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.854896068572998, + "num_tokens": 349850031.0, + "step": 9172 + }, + { + "epoch": 1.1668998855107493, + "ewc_loss": 0.05852796137332916, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002630139933899045, + "grad_norm": 6.725075721740723, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8654555678367615, + "num_tokens": 349889967.0, + "step": 9173 + }, + { + "epoch": 1.1670270957893398, + "ewc_loss": 0.058742936700582504, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026272234390489757, + "grad_norm": 6.758358955383301, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.857601523399353, + "num_tokens": 349930276.0, + "step": 9174 + }, + { + "epoch": 1.1671543060679304, + "ewc_loss": 0.0585574135184288, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002633084950502962, + "grad_norm": 6.807702541351318, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.858683168888092, + "num_tokens": 349969889.0, + "step": 9175 + }, + { + "epoch": 1.1672815163465209, + "ewc_loss": 0.05843428894877434, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026207725750282407, + "grad_norm": 6.733227729797363, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8616929054260254, + "num_tokens": 350005729.0, + "step": 9176 + }, + { + "epoch": 1.1674087266251112, + "ewc_loss": 0.05860384553670883, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002637728175614029, + "grad_norm": 6.813999176025391, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8701494932174683, + "num_tokens": 350043130.0, + "step": 9177 + }, + { + "epoch": 1.1675359369037017, + "ewc_loss": 0.058450110256671906, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026223549502901733, + "grad_norm": 6.738325595855713, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8814057111740112, + "num_tokens": 350081233.0, + "step": 9178 + }, + { + "epoch": 1.1676631471822922, + "ewc_loss": 0.05849159508943558, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026265031192451715, + "grad_norm": 6.823601245880127, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.849597692489624, + "num_tokens": 350119403.0, + "step": 9179 + }, + { + "epoch": 1.1677903574608828, + "ewc_loss": 0.05847536772489548, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002624880289658904, + "grad_norm": 6.788942813873291, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8518097400665283, + "num_tokens": 350159099.0, + "step": 9180 + }, + { + "epoch": 1.1679175677394733, + "ewc_loss": 0.05843867361545563, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002621210878714919, + "grad_norm": 6.771501064300537, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.862586259841919, + "num_tokens": 350195642.0, + "step": 9181 + }, + { + "epoch": 1.1680447780180638, + "ewc_loss": 0.05857883393764496, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026352269924245775, + "grad_norm": 6.80941104888916, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8710265755653381, + "num_tokens": 350236721.0, + "step": 9182 + }, + { + "epoch": 1.1681719882966544, + "ewc_loss": 0.05873101204633713, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026260310551151633, + "grad_norm": 6.79323673248291, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8567827343940735, + "num_tokens": 350272691.0, + "step": 9183 + }, + { + "epoch": 1.168299198575245, + "ewc_loss": 0.05870576947927475, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002623506588861346, + "grad_norm": 6.721489429473877, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.861361026763916, + "num_tokens": 350318299.0, + "step": 9184 + }, + { + "epoch": 1.1684264088538354, + "ewc_loss": 0.05885426700115204, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026383562362752855, + "grad_norm": 6.806875705718994, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8615771532058716, + "num_tokens": 350360973.0, + "step": 9185 + }, + { + "epoch": 1.168553619132426, + "ewc_loss": 0.058713629841804504, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026242926833219826, + "grad_norm": 6.805826187133789, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8700367212295532, + "num_tokens": 350400516.0, + "step": 9186 + }, + { + "epoch": 1.1686808294110165, + "ewc_loss": 0.058540984988212585, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002631442039273679, + "grad_norm": 6.806704521179199, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.858555793762207, + "num_tokens": 350438147.0, + "step": 9187 + }, + { + "epoch": 1.168808039689607, + "ewc_loss": 0.05847499519586563, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026248430367559195, + "grad_norm": 6.777337074279785, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8666427135467529, + "num_tokens": 350481604.0, + "step": 9188 + }, + { + "epoch": 1.1689352499681975, + "ewc_loss": 0.05848650634288788, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002625994384288788, + "grad_norm": 6.764157295227051, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8825556635856628, + "num_tokens": 350518225.0, + "step": 9189 + }, + { + "epoch": 1.169062460246788, + "ewc_loss": 0.05843552201986313, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002620895975269377, + "grad_norm": 6.83715295791626, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8616096377372742, + "num_tokens": 350553752.0, + "step": 9190 + }, + { + "epoch": 1.1691896705253784, + "ewc_loss": 0.05848190188407898, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002625533670652658, + "grad_norm": 6.8062896728515625, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8691665530204773, + "num_tokens": 350593676.0, + "step": 9191 + }, + { + "epoch": 1.169316880803969, + "ewc_loss": 0.05848501995205879, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002625845663715154, + "grad_norm": 6.78779935836792, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8617610931396484, + "num_tokens": 350633618.0, + "step": 9192 + }, + { + "epoch": 1.1694440910825594, + "ewc_loss": 0.05876506119966507, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026294359122402966, + "grad_norm": 6.86288595199585, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8616635799407959, + "num_tokens": 350678221.0, + "step": 9193 + }, + { + "epoch": 1.16957130136115, + "ewc_loss": 0.05843121185898781, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002620464947540313, + "grad_norm": 6.770712375640869, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8734491467475891, + "num_tokens": 350718258.0, + "step": 9194 + }, + { + "epoch": 1.1696985116397405, + "ewc_loss": 0.05875169858336449, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026280994643457234, + "grad_norm": 13.247698783874512, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8728585243225098, + "num_tokens": 350757370.0, + "step": 9195 + }, + { + "epoch": 1.169825721918331, + "ewc_loss": 0.06838362663984299, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00035912924795411527, + "grad_norm": 8.01705265045166, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8637598752975464, + "num_tokens": 350801394.0, + "step": 9196 + }, + { + "epoch": 1.1699529321969215, + "ewc_loss": 0.057141900062561035, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002467119775246829, + "grad_norm": 6.424838542938232, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8545856475830078, + "num_tokens": 350840635.0, + "step": 9197 + }, + { + "epoch": 1.170080142475512, + "ewc_loss": 0.060861486941576004, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002839078370016068, + "grad_norm": 7.362883567810059, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8779652118682861, + "num_tokens": 350879616.0, + "step": 9198 + }, + { + "epoch": 1.1702073527541026, + "ewc_loss": 0.0594528503715992, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002698214666452259, + "grad_norm": 6.6696457862854, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8640017509460449, + "num_tokens": 350921710.0, + "step": 9199 + }, + { + "epoch": 1.1703345630326931, + "ewc_loss": 0.05976872891187668, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002754216839093715, + "grad_norm": 7.181692600250244, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8505991697311401, + "num_tokens": 350964218.0, + "step": 9200 + }, + { + "epoch": 1.1704617733112836, + "ewc_loss": 0.05911971628665924, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026893155882135034, + "grad_norm": 6.748308181762695, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8724769949913025, + "num_tokens": 351004511.0, + "step": 9201 + }, + { + "epoch": 1.170588983589874, + "ewc_loss": 0.05951901152729988, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002729244879446924, + "grad_norm": 7.090697288513184, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8574235439300537, + "num_tokens": 351044583.0, + "step": 9202 + }, + { + "epoch": 1.1707161938684645, + "ewc_loss": 0.059347279369831085, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002687657542992383, + "grad_norm": 6.824923038482666, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8471692204475403, + "num_tokens": 351083613.0, + "step": 9203 + }, + { + "epoch": 1.170843404147055, + "ewc_loss": 0.05914996564388275, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002692340058274567, + "grad_norm": 7.01621150970459, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8556349873542786, + "num_tokens": 351117412.0, + "step": 9204 + }, + { + "epoch": 1.1709706144256455, + "ewc_loss": 0.05886080488562584, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002663424238562584, + "grad_norm": 6.850471496582031, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8469974398612976, + "num_tokens": 351154779.0, + "step": 9205 + }, + { + "epoch": 1.171097824704236, + "ewc_loss": 0.058959439396858215, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026732878177426755, + "grad_norm": 6.8766398429870605, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8741923570632935, + "num_tokens": 351191412.0, + "step": 9206 + }, + { + "epoch": 1.1712250349828266, + "ewc_loss": 0.05869583040475845, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002646927023306489, + "grad_norm": 6.785839080810547, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8669378757476807, + "num_tokens": 351235465.0, + "step": 9207 + }, + { + "epoch": 1.1713522452614171, + "ewc_loss": 0.058740705251693726, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002651414542924613, + "grad_norm": 6.832396507263184, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8679280281066895, + "num_tokens": 351279326.0, + "step": 9208 + }, + { + "epoch": 1.1714794555400077, + "ewc_loss": 0.058714333921670914, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026487771538086236, + "grad_norm": 6.82348108291626, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.879479169845581, + "num_tokens": 351316678.0, + "step": 9209 + }, + { + "epoch": 1.1716066658185982, + "ewc_loss": 0.058662623167037964, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002643605985213071, + "grad_norm": 6.882474899291992, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8595555424690247, + "num_tokens": 351352641.0, + "step": 9210 + }, + { + "epoch": 1.1717338760971887, + "ewc_loss": 0.058640316128730774, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026413751766085625, + "grad_norm": 6.888474464416504, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8537826538085938, + "num_tokens": 351385845.0, + "step": 9211 + }, + { + "epoch": 1.1718610863757792, + "ewc_loss": 0.05857259780168533, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002634603588376194, + "grad_norm": 6.773690223693848, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8703477382659912, + "num_tokens": 351425220.0, + "step": 9212 + }, + { + "epoch": 1.1719882966543698, + "ewc_loss": 0.05865953490138054, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002643297193571925, + "grad_norm": 6.87030029296875, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8580385446548462, + "num_tokens": 351467515.0, + "step": 9213 + }, + { + "epoch": 1.1721155069329603, + "ewc_loss": 0.05851832777261734, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002629176597110927, + "grad_norm": 6.805346488952637, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8605711460113525, + "num_tokens": 351507571.0, + "step": 9214 + }, + { + "epoch": 1.1722427172115506, + "ewc_loss": 0.05863112211227417, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002640456077642739, + "grad_norm": 6.811685562133789, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8603413105010986, + "num_tokens": 351546150.0, + "step": 9215 + }, + { + "epoch": 1.1723699274901411, + "ewc_loss": 0.05846725404262543, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002624069165904075, + "grad_norm": 6.819381237030029, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8561729788780212, + "num_tokens": 351588574.0, + "step": 9216 + }, + { + "epoch": 1.1724971377687317, + "ewc_loss": 0.05857086926698685, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026344304205849767, + "grad_norm": 6.834784030914307, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8632999658584595, + "num_tokens": 351626914.0, + "step": 9217 + }, + { + "epoch": 1.1726243480473222, + "ewc_loss": 0.05858549475669861, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026358934701420367, + "grad_norm": 6.793739318847656, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8533475399017334, + "num_tokens": 351663574.0, + "step": 9218 + }, + { + "epoch": 1.1727515583259127, + "ewc_loss": 0.058473147451877594, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002624658518470824, + "grad_norm": 6.798701763153076, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8461050391197205, + "num_tokens": 351703275.0, + "step": 9219 + }, + { + "epoch": 1.1728787686045032, + "ewc_loss": 0.058602120727300644, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026375558809377253, + "grad_norm": 6.820096969604492, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8526492714881897, + "num_tokens": 351745713.0, + "step": 9220 + }, + { + "epoch": 1.1730059788830938, + "ewc_loss": 0.058519359678030014, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002629279624670744, + "grad_norm": 6.803285598754883, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8490164875984192, + "num_tokens": 351780789.0, + "step": 9221 + }, + { + "epoch": 1.1731331891616843, + "ewc_loss": 0.058632124215364456, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000264055619481951, + "grad_norm": 6.788327693939209, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8721601366996765, + "num_tokens": 351820352.0, + "step": 9222 + }, + { + "epoch": 1.1732603994402748, + "ewc_loss": 0.05854996293783188, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002632340183481574, + "grad_norm": 6.81459379196167, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8712522983551025, + "num_tokens": 351860011.0, + "step": 9223 + }, + { + "epoch": 1.1733876097188654, + "ewc_loss": 0.05862337350845337, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026396813336759806, + "grad_norm": 6.764698505401611, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.855635404586792, + "num_tokens": 351898742.0, + "step": 9224 + }, + { + "epoch": 1.1735148199974559, + "ewc_loss": 0.05864156037569046, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026414997410029173, + "grad_norm": 6.8276047706604, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8501657247543335, + "num_tokens": 351940201.0, + "step": 9225 + }, + { + "epoch": 1.1736420302760462, + "ewc_loss": 0.05859459191560745, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002636802673805505, + "grad_norm": 6.813292503356934, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8606432676315308, + "num_tokens": 351971041.0, + "step": 9226 + }, + { + "epoch": 1.1737692405546367, + "ewc_loss": 0.058635346591472626, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002640878374222666, + "grad_norm": 6.768346786499023, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8450871109962463, + "num_tokens": 352013002.0, + "step": 9227 + }, + { + "epoch": 1.1738964508332272, + "ewc_loss": 0.058621324598789215, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026394761516712606, + "grad_norm": 6.814635276794434, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8613413572311401, + "num_tokens": 352051435.0, + "step": 9228 + }, + { + "epoch": 1.1740236611118178, + "ewc_loss": 0.05857868492603302, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026352121494710445, + "grad_norm": 6.784460544586182, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8680778741836548, + "num_tokens": 352093295.0, + "step": 9229 + }, + { + "epoch": 1.1741508713904083, + "ewc_loss": 0.05854363739490509, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002631707757245749, + "grad_norm": 6.720655918121338, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8572918176651001, + "num_tokens": 352136401.0, + "step": 9230 + }, + { + "epoch": 1.1742780816689988, + "ewc_loss": 0.05871054530143738, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026483985129743814, + "grad_norm": 6.802276134490967, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8733127117156982, + "num_tokens": 352178696.0, + "step": 9231 + }, + { + "epoch": 1.1744052919475894, + "ewc_loss": 0.0585830882191658, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026356527814641595, + "grad_norm": 6.801436901092529, + "learning_rate": 1e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8437421321868896, + "num_tokens": 352214759.0, + "step": 9232 + }, + { + "epoch": 1.1745325022261799, + "ewc_loss": 0.05868397653102875, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026457413332536817, + "grad_norm": 6.79769229888916, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8755272626876831, + "num_tokens": 352250520.0, + "step": 9233 + }, + { + "epoch": 1.1746597125047704, + "ewc_loss": 0.0588567852973938, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002638608275447041, + "grad_norm": 6.7860589027404785, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8603806495666504, + "num_tokens": 352288773.0, + "step": 9234 + }, + { + "epoch": 1.174786922783361, + "ewc_loss": 0.05855333432555199, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002632677205838263, + "grad_norm": 6.739708423614502, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8701369762420654, + "num_tokens": 352326548.0, + "step": 9235 + }, + { + "epoch": 1.1749141330619515, + "ewc_loss": 0.05865098536014557, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002642442414071411, + "grad_norm": 6.797281265258789, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.871907114982605, + "num_tokens": 352366537.0, + "step": 9236 + }, + { + "epoch": 1.175041343340542, + "ewc_loss": 0.05883130058646202, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026360596530139446, + "grad_norm": 6.821348190307617, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8458667993545532, + "num_tokens": 352402390.0, + "step": 9237 + }, + { + "epoch": 1.1751685536191325, + "ewc_loss": 0.05851909518241882, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026292531401850283, + "grad_norm": 6.773081302642822, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8601720333099365, + "num_tokens": 352442357.0, + "step": 9238 + }, + { + "epoch": 1.175295763897723, + "ewc_loss": 0.05884534493088722, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026374642038717866, + "grad_norm": 13.284771919250488, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8602028489112854, + "num_tokens": 352476884.0, + "step": 9239 + }, + { + "epoch": 1.1754229741763134, + "ewc_loss": 0.06854434311389923, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00036073639057576656, + "grad_norm": 7.991053104400635, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8628971576690674, + "num_tokens": 352513448.0, + "step": 9240 + }, + { + "epoch": 1.175550184454904, + "ewc_loss": 0.05730510875582695, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002483440621290356, + "grad_norm": 6.420427322387695, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8821889162063599, + "num_tokens": 352548420.0, + "step": 9241 + }, + { + "epoch": 1.1756773947334944, + "ewc_loss": 0.06095054745674133, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00028479844331741333, + "grad_norm": 7.231678485870361, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8652801513671875, + "num_tokens": 352592157.0, + "step": 9242 + }, + { + "epoch": 1.175804605012085, + "ewc_loss": 0.05926309525966644, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00027036532992497087, + "grad_norm": 6.7172017097473145, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8543041944503784, + "num_tokens": 352626427.0, + "step": 9243 + }, + { + "epoch": 1.1759318152906755, + "ewc_loss": 0.0600152388215065, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002754453744273633, + "grad_norm": 7.053244113922119, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8732638955116272, + "num_tokens": 352667075.0, + "step": 9244 + }, + { + "epoch": 1.176059025569266, + "ewc_loss": 0.05922907218337059, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00027002510614693165, + "grad_norm": 6.772320747375488, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8605064153671265, + "num_tokens": 352707114.0, + "step": 9245 + }, + { + "epoch": 1.1761862358478565, + "ewc_loss": 0.05943736061453819, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002721079799812287, + "grad_norm": 6.990503787994385, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8689689040184021, + "num_tokens": 352738064.0, + "step": 9246 + }, + { + "epoch": 1.176313446126447, + "ewc_loss": 0.05915912240743637, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002693256246857345, + "grad_norm": 6.831593990325928, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8478989601135254, + "num_tokens": 352779964.0, + "step": 9247 + }, + { + "epoch": 1.1764406564050376, + "ewc_loss": 0.05914418026804924, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002691761765163392, + "grad_norm": 6.8946099281311035, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8708242177963257, + "num_tokens": 352819574.0, + "step": 9248 + }, + { + "epoch": 1.1765678666836281, + "ewc_loss": 0.05905459076166153, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026828030240722, + "grad_norm": 6.808229446411133, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8719371557235718, + "num_tokens": 352859151.0, + "step": 9249 + }, + { + "epoch": 1.1766950769622184, + "ewc_loss": 0.059258490800857544, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002678778546396643, + "grad_norm": 6.9073872566223145, + "learning_rate": 1e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.8404317498207092, + "num_tokens": 352901119.0, + "step": 9250 + }, + { + "epoch": 1.176822287240809, + "ewc_loss": 0.058827027678489685, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000266004673903808, + "grad_norm": 6.757404804229736, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8612579703330994, + "num_tokens": 352945878.0, + "step": 9251 + }, + { + "epoch": 1.1769494975193995, + "ewc_loss": 0.0590151846408844, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026788623654283583, + "grad_norm": 6.92073392868042, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8565714359283447, + "num_tokens": 352979819.0, + "step": 9252 + }, + { + "epoch": 1.17707670779799, + "ewc_loss": 0.05865760147571564, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026431039441376925, + "grad_norm": 6.813920021057129, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.852654218673706, + "num_tokens": 353023552.0, + "step": 9253 + }, + { + "epoch": 1.1772039180765805, + "ewc_loss": 0.058936551213264465, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026709988014772534, + "grad_norm": 6.873730182647705, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8632293939590454, + "num_tokens": 353058178.0, + "step": 9254 + }, + { + "epoch": 1.177331128355171, + "ewc_loss": 0.05867215245962143, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026445588446222246, + "grad_norm": 6.833528518676758, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8719273805618286, + "num_tokens": 353091428.0, + "step": 9255 + }, + { + "epoch": 1.1774583386337616, + "ewc_loss": 0.0587298758327961, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002650331298355013, + "grad_norm": 6.803892612457275, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8669886589050293, + "num_tokens": 353131943.0, + "step": 9256 + }, + { + "epoch": 1.1775855489123521, + "ewc_loss": 0.05874807760119438, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002652151451911777, + "grad_norm": 6.869282245635986, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8550223112106323, + "num_tokens": 353171213.0, + "step": 9257 + }, + { + "epoch": 1.1777127591909426, + "ewc_loss": 0.05866466462612152, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026438102941028774, + "grad_norm": 6.8265228271484375, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.870406448841095, + "num_tokens": 353205057.0, + "step": 9258 + }, + { + "epoch": 1.1778399694695332, + "ewc_loss": 0.05868253856897354, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.000264559785136953, + "grad_norm": 6.752169609069824, + "learning_rate": 1e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8487930297851562, + "num_tokens": 353251164.0, + "step": 9259 + }, + { + "epoch": 1.1779671797481237, + "ewc_loss": 0.05877985805273056, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026553295901976526, + "grad_norm": 6.904808521270752, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8680586218833923, + "num_tokens": 353288202.0, + "step": 9260 + }, + { + "epoch": 1.1780943900267142, + "ewc_loss": 0.058920007199048996, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002644930500537157, + "grad_norm": 6.815746307373047, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8616744875907898, + "num_tokens": 353327396.0, + "step": 9261 + }, + { + "epoch": 1.1782216003053048, + "ewc_loss": 0.05873320996761322, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002650664537213743, + "grad_norm": 6.854345798492432, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8503425717353821, + "num_tokens": 353366512.0, + "step": 9262 + }, + { + "epoch": 1.1783488105838953, + "ewc_loss": 0.05868843197822571, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002646186912897974, + "grad_norm": 6.783479690551758, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8541083335876465, + "num_tokens": 353408546.0, + "step": 9263 + }, + { + "epoch": 1.1784760208624856, + "ewc_loss": 0.058813657611608505, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002658709418028593, + "grad_norm": 6.859132766723633, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8719789981842041, + "num_tokens": 353444950.0, + "step": 9264 + }, + { + "epoch": 1.1786032311410761, + "ewc_loss": 0.058578211814165115, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026351650012657046, + "grad_norm": 6.726106643676758, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8692214488983154, + "num_tokens": 353483306.0, + "step": 9265 + }, + { + "epoch": 1.1787304414196667, + "ewc_loss": 0.05882905423641205, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026602490106597543, + "grad_norm": 6.805044651031494, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8613874316215515, + "num_tokens": 353523038.0, + "step": 9266 + }, + { + "epoch": 1.1788576516982572, + "ewc_loss": 0.058649204671382904, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002642264007590711, + "grad_norm": 6.743838787078857, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8729925155639648, + "num_tokens": 353564266.0, + "step": 9267 + }, + { + "epoch": 1.1789848619768477, + "ewc_loss": 0.058816712349653244, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002659015008248389, + "grad_norm": 6.787212371826172, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8657702207565308, + "num_tokens": 353603558.0, + "step": 9268 + }, + { + "epoch": 1.1791120722554382, + "ewc_loss": 0.058792777359485626, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026566212181933224, + "grad_norm": 6.79959774017334, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8703667521476746, + "num_tokens": 353648097.0, + "step": 9269 + }, + { + "epoch": 1.1792392825340288, + "ewc_loss": 0.05873319134116173, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026506627909839153, + "grad_norm": 6.811060428619385, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8475741147994995, + "num_tokens": 353688970.0, + "step": 9270 + }, + { + "epoch": 1.1793664928126193, + "ewc_loss": 0.05899769067764282, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002652698603924364, + "grad_norm": 6.92068338394165, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8519289493560791, + "num_tokens": 353732505.0, + "step": 9271 + }, + { + "epoch": 1.1794937030912098, + "ewc_loss": 0.058628614991903305, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.0002640205202624202, + "grad_norm": 6.767330169677734, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8703567981719971, + "num_tokens": 353767460.0, + "step": 9272 + }, + { + "epoch": 1.1796209133698004, + "ewc_loss": 0.058817654848098755, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026591093046590686, + "grad_norm": 6.871084213256836, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8713371753692627, + "num_tokens": 353801951.0, + "step": 9273 + }, + { + "epoch": 1.1797481236483909, + "ewc_loss": 0.05864100903272629, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026414444437250495, + "grad_norm": 6.794370174407959, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8693675994873047, + "num_tokens": 353836815.0, + "step": 9274 + }, + { + "epoch": 1.1798753339269812, + "ewc_loss": 0.05875451862812042, + "ewc_loss_diag": 3.218650817871094e-05, + "ewc_loss_parallel": 0.00026527958107180893, + "grad_norm": 6.802346229553223, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8553584814071655, + "num_tokens": 353876296.0, + "step": 9275 + }, + { + "epoch": 1.1800025442055717, + "ewc_loss": 0.05904224142432213, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026571538182906806, + "grad_norm": 6.865863800048828, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.845178484916687, + "num_tokens": 353911792.0, + "step": 9276 + }, + { + "epoch": 1.1801297544841622, + "ewc_loss": 0.05891077220439911, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002644007035996765, + "grad_norm": 6.77904748916626, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8522783517837524, + "num_tokens": 353949532.0, + "step": 9277 + }, + { + "epoch": 1.1802569647627528, + "ewc_loss": 0.05904843658208847, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026577731478028, + "grad_norm": 6.8056559562683105, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8497829437255859, + "num_tokens": 353988622.0, + "step": 9278 + }, + { + "epoch": 1.1803841750413433, + "ewc_loss": 0.05896575748920441, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002649505331646651, + "grad_norm": 6.724145889282227, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.866270899772644, + "num_tokens": 354028136.0, + "step": 9279 + }, + { + "epoch": 1.1805113853199338, + "ewc_loss": 0.059059493243694305, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002658879093360156, + "grad_norm": 6.779479026794434, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8629854321479797, + "num_tokens": 354065029.0, + "step": 9280 + }, + { + "epoch": 1.1806385955985244, + "ewc_loss": 0.05905890464782715, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026588200125843287, + "grad_norm": 6.806665897369385, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8732624053955078, + "num_tokens": 354099279.0, + "step": 9281 + }, + { + "epoch": 1.1807658058771149, + "ewc_loss": 0.05904970318078995, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026579000405035913, + "grad_norm": 6.772310733795166, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8576339483261108, + "num_tokens": 354138870.0, + "step": 9282 + }, + { + "epoch": 1.1808930161557054, + "ewc_loss": 0.05912071466445923, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026650013751350343, + "grad_norm": 6.824613571166992, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8498132824897766, + "num_tokens": 354176498.0, + "step": 9283 + }, + { + "epoch": 1.181020226434296, + "ewc_loss": 0.05899553745985031, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002652483235578984, + "grad_norm": 6.811601638793945, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8440216779708862, + "num_tokens": 354210063.0, + "step": 9284 + }, + { + "epoch": 1.1811474367128865, + "ewc_loss": 0.05904647335410118, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026575769879855216, + "grad_norm": 6.812047004699707, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8556671142578125, + "num_tokens": 354248095.0, + "step": 9285 + }, + { + "epoch": 1.181274646991477, + "ewc_loss": 0.05901399627327919, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026543292915448546, + "grad_norm": 6.763488292694092, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.858524739742279, + "num_tokens": 354286897.0, + "step": 9286 + }, + { + "epoch": 1.1814018572700675, + "ewc_loss": 0.059091996401548386, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002662129409145564, + "grad_norm": 6.7212748527526855, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8704511523246765, + "num_tokens": 354330685.0, + "step": 9287 + }, + { + "epoch": 1.181529067548658, + "ewc_loss": 0.059114180505275726, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026643479941412807, + "grad_norm": 6.792019844055176, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8540000915527344, + "num_tokens": 354371120.0, + "step": 9288 + }, + { + "epoch": 1.1816562778272484, + "ewc_loss": 0.05908576399087906, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002661506296135485, + "grad_norm": 6.759610176086426, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8593024611473083, + "num_tokens": 354410825.0, + "step": 9289 + }, + { + "epoch": 1.1817834881058389, + "ewc_loss": 0.05918942019343376, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002671871625352651, + "grad_norm": 6.74820613861084, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8595241904258728, + "num_tokens": 354453623.0, + "step": 9290 + }, + { + "epoch": 1.1819106983844294, + "ewc_loss": 0.05916554480791092, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002669484238140285, + "grad_norm": 6.766391754150391, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8603237867355347, + "num_tokens": 354491854.0, + "step": 9291 + }, + { + "epoch": 1.18203790866302, + "ewc_loss": 0.05911997705698013, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026649271603673697, + "grad_norm": 6.947039604187012, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8481777906417847, + "num_tokens": 354530788.0, + "step": 9292 + }, + { + "epoch": 1.1821651189416105, + "ewc_loss": 0.05904657393693924, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002657586883287877, + "grad_norm": 6.702795505523682, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8809173703193665, + "num_tokens": 354568576.0, + "step": 9293 + }, + { + "epoch": 1.182292329220201, + "ewc_loss": 0.05923938751220703, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026768684620037675, + "grad_norm": 6.845227241516113, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8631762266159058, + "num_tokens": 354611307.0, + "step": 9294 + }, + { + "epoch": 1.1824195394987915, + "ewc_loss": 0.05894622206687927, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002647551882546395, + "grad_norm": 6.746908187866211, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8693405389785767, + "num_tokens": 354648810.0, + "step": 9295 + }, + { + "epoch": 1.182546749777382, + "ewc_loss": 0.05910402536392212, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002663331979420036, + "grad_norm": 6.778463363647461, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8652641773223877, + "num_tokens": 354688411.0, + "step": 9296 + }, + { + "epoch": 1.1826739600559726, + "ewc_loss": 0.0590377002954483, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026566997985355556, + "grad_norm": 6.749536991119385, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8607962131500244, + "num_tokens": 354734700.0, + "step": 9297 + }, + { + "epoch": 1.1828011703345631, + "ewc_loss": 0.05909201502799988, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002662131446413696, + "grad_norm": 6.816793918609619, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8593925833702087, + "num_tokens": 354765517.0, + "step": 9298 + }, + { + "epoch": 1.1829283806131534, + "ewc_loss": 0.05909433960914612, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026623636949807405, + "grad_norm": 6.776716232299805, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8658574819564819, + "num_tokens": 354808563.0, + "step": 9299 + }, + { + "epoch": 1.183055590891744, + "ewc_loss": 0.05906647443771362, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002659577294252813, + "grad_norm": 6.817420482635498, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8424187898635864, + "num_tokens": 354846618.0, + "step": 9300 + }, + { + "epoch": 1.1831828011703345, + "ewc_loss": 0.05904596298933029, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002657526056282222, + "grad_norm": 6.808980464935303, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8644927144050598, + "num_tokens": 354881293.0, + "step": 9301 + }, + { + "epoch": 1.183310011448925, + "ewc_loss": 0.058996934443712234, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002652623225003481, + "grad_norm": 6.755342483520508, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8600860834121704, + "num_tokens": 354925239.0, + "step": 9302 + }, + { + "epoch": 1.1834372217275155, + "ewc_loss": 0.05919060483574867, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000267199007794261, + "grad_norm": 6.810700416564941, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8485239744186401, + "num_tokens": 354967836.0, + "step": 9303 + }, + { + "epoch": 1.183564432006106, + "ewc_loss": 0.05902135372161865, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002655065036378801, + "grad_norm": 6.816307544708252, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8463745713233948, + "num_tokens": 355003386.0, + "step": 9304 + }, + { + "epoch": 1.1836916422846966, + "ewc_loss": 0.059053000062704086, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002658229786902666, + "grad_norm": 6.816669940948486, + "learning_rate": 1e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8402982354164124, + "num_tokens": 355043391.0, + "step": 9305 + }, + { + "epoch": 1.1838188525632871, + "ewc_loss": 0.059095799922943115, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002662509505171329, + "grad_norm": 6.7899489402771, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.862929105758667, + "num_tokens": 355082820.0, + "step": 9306 + }, + { + "epoch": 1.1839460628418776, + "ewc_loss": 0.05909261107444763, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002662190527189523, + "grad_norm": 6.840484142303467, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8830189108848572, + "num_tokens": 355117531.0, + "step": 9307 + }, + { + "epoch": 1.1840732731204682, + "ewc_loss": 0.059048138558864594, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002657743461895734, + "grad_norm": 6.789931774139404, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8502269387245178, + "num_tokens": 355159485.0, + "step": 9308 + }, + { + "epoch": 1.1842004833990587, + "ewc_loss": 0.05903496593236923, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002656426513567567, + "grad_norm": 6.791109561920166, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8565554618835449, + "num_tokens": 355206051.0, + "step": 9309 + }, + { + "epoch": 1.1843276936776492, + "ewc_loss": 0.05910239368677139, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026631689979694784, + "grad_norm": 6.820895195007324, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.853514552116394, + "num_tokens": 355242933.0, + "step": 9310 + }, + { + "epoch": 1.1844549039562398, + "ewc_loss": 0.05904025584459305, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002656955039128661, + "grad_norm": 6.76045560836792, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.87400221824646, + "num_tokens": 355286311.0, + "step": 9311 + }, + { + "epoch": 1.1845821142348303, + "ewc_loss": 0.059130385518074036, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002665968204382807, + "grad_norm": 6.83730936050415, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8613699674606323, + "num_tokens": 355327844.0, + "step": 9312 + }, + { + "epoch": 1.1847093245134206, + "ewc_loss": 0.058976154774427414, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000265054521150887, + "grad_norm": 6.758481025695801, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8702391386032104, + "num_tokens": 355368977.0, + "step": 9313 + }, + { + "epoch": 1.1848365347920111, + "ewc_loss": 0.05915876105427742, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026688058278523386, + "grad_norm": 6.817348003387451, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8510779142379761, + "num_tokens": 355411479.0, + "step": 9314 + }, + { + "epoch": 1.1849637450706016, + "ewc_loss": 0.05904287099838257, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002657216682564467, + "grad_norm": 6.802029609680176, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8804738521575928, + "num_tokens": 355447370.0, + "step": 9315 + }, + { + "epoch": 1.1850909553491922, + "ewc_loss": 0.05912477523088455, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002665407373569906, + "grad_norm": 6.812097549438477, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8588524460792542, + "num_tokens": 355490335.0, + "step": 9316 + }, + { + "epoch": 1.1852181656277827, + "ewc_loss": 0.059118881821632385, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002664818021003157, + "grad_norm": 6.817811965942383, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8625073432922363, + "num_tokens": 355527843.0, + "step": 9317 + }, + { + "epoch": 1.1853453759063732, + "ewc_loss": 0.059141725301742554, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002667102380655706, + "grad_norm": 6.8447489738464355, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8593473434448242, + "num_tokens": 355566094.0, + "step": 9318 + }, + { + "epoch": 1.1854725861849638, + "ewc_loss": 0.058997925370931625, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002652722178027034, + "grad_norm": 6.834816932678223, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8694168329238892, + "num_tokens": 355609505.0, + "step": 9319 + }, + { + "epoch": 1.1855997964635543, + "ewc_loss": 0.05910911411046982, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026638410054147243, + "grad_norm": 6.85216760635376, + "learning_rate": 1e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.8369364142417908, + "num_tokens": 355650930.0, + "step": 9320 + }, + { + "epoch": 1.1857270067421448, + "ewc_loss": 0.05898602306842804, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002651532122399658, + "grad_norm": 6.79991340637207, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8720976114273071, + "num_tokens": 355684075.0, + "step": 9321 + }, + { + "epoch": 1.1858542170207353, + "ewc_loss": 0.059102170169353485, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026631465880200267, + "grad_norm": 6.793907165527344, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8681915402412415, + "num_tokens": 355723567.0, + "step": 9322 + }, + { + "epoch": 1.1859814272993259, + "ewc_loss": 0.05909756198525429, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026626858743838966, + "grad_norm": 6.845122337341309, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8466519713401794, + "num_tokens": 355764915.0, + "step": 9323 + }, + { + "epoch": 1.1861086375779162, + "ewc_loss": 0.059036750346422195, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002656604629009962, + "grad_norm": 6.803761959075928, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8627837300300598, + "num_tokens": 355804950.0, + "step": 9324 + }, + { + "epoch": 1.1862358478565067, + "ewc_loss": 0.059126511216163635, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026655805413611233, + "grad_norm": 6.840615272521973, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.869082510471344, + "num_tokens": 355844155.0, + "step": 9325 + }, + { + "epoch": 1.1863630581350972, + "ewc_loss": 0.059074766933918, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002660406462382525, + "grad_norm": 6.765605449676514, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8781875371932983, + "num_tokens": 355883971.0, + "step": 9326 + }, + { + "epoch": 1.1864902684136878, + "ewc_loss": 0.05921751260757446, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002674680727068335, + "grad_norm": 6.89503812789917, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8571376800537109, + "num_tokens": 355922271.0, + "step": 9327 + }, + { + "epoch": 1.1866174786922783, + "ewc_loss": 0.05908579379320145, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002661509206518531, + "grad_norm": 6.789548397064209, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8595422506332397, + "num_tokens": 355961087.0, + "step": 9328 + }, + { + "epoch": 1.1867446889708688, + "ewc_loss": 0.05922774598002434, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002675704308785498, + "grad_norm": 6.877939224243164, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8657269477844238, + "num_tokens": 355999553.0, + "step": 9329 + }, + { + "epoch": 1.1868718992494594, + "ewc_loss": 0.05912325531244278, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026652554515749216, + "grad_norm": 6.841606616973877, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8664110898971558, + "num_tokens": 356038503.0, + "step": 9330 + }, + { + "epoch": 1.1869991095280499, + "ewc_loss": 0.059179678559303284, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670897520147264, + "grad_norm": 6.856569290161133, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8729805946350098, + "num_tokens": 356078538.0, + "step": 9331 + }, + { + "epoch": 1.1871263198066404, + "ewc_loss": 0.059157662093639374, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002668695815373212, + "grad_norm": 6.850264549255371, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8741140365600586, + "num_tokens": 356115031.0, + "step": 9332 + }, + { + "epoch": 1.187253530085231, + "ewc_loss": 0.05913986265659332, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002666916116140783, + "grad_norm": 6.886144161224365, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8760100603103638, + "num_tokens": 356154751.0, + "step": 9333 + }, + { + "epoch": 1.1873807403638215, + "ewc_loss": 0.05910872668027878, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002663802297320217, + "grad_norm": 6.929981231689453, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.847040057182312, + "num_tokens": 356192105.0, + "step": 9334 + }, + { + "epoch": 1.187507950642412, + "ewc_loss": 0.059033047407865524, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026562344282865524, + "grad_norm": 6.8671417236328125, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8517465591430664, + "num_tokens": 356224964.0, + "step": 9335 + }, + { + "epoch": 1.1876351609210025, + "ewc_loss": 0.05904282256960869, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002657212025951594, + "grad_norm": 6.915516376495361, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8674989938735962, + "num_tokens": 356261565.0, + "step": 9336 + }, + { + "epoch": 1.187762371199593, + "ewc_loss": 0.0589040145277977, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002643330954015255, + "grad_norm": 6.768935203552246, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8646771311759949, + "num_tokens": 356301886.0, + "step": 9337 + }, + { + "epoch": 1.1878895814781834, + "ewc_loss": 0.05913487821817398, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002666417567525059, + "grad_norm": 6.852646827697754, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8702633380889893, + "num_tokens": 356339481.0, + "step": 9338 + }, + { + "epoch": 1.1880167917567739, + "ewc_loss": 0.05894990637898445, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026479203370399773, + "grad_norm": 6.830578804016113, + "learning_rate": 1e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8387695550918579, + "num_tokens": 356387038.0, + "step": 9339 + }, + { + "epoch": 1.1881440020353644, + "ewc_loss": 0.05913032591342926, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026659623836167157, + "grad_norm": 6.867228984832764, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8578377962112427, + "num_tokens": 356433256.0, + "step": 9340 + }, + { + "epoch": 1.188271212313955, + "ewc_loss": 0.05898744985461235, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002651674731168896, + "grad_norm": 6.810105323791504, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8525435328483582, + "num_tokens": 356471282.0, + "step": 9341 + }, + { + "epoch": 1.1883984225925455, + "ewc_loss": 0.05907561630010605, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002660491445567459, + "grad_norm": 6.8610520362854, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8680353164672852, + "num_tokens": 356508951.0, + "step": 9342 + }, + { + "epoch": 1.188525632871136, + "ewc_loss": 0.05906718969345093, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026596488896757364, + "grad_norm": 6.873164653778076, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8645367622375488, + "num_tokens": 356544487.0, + "step": 9343 + }, + { + "epoch": 1.1886528431497265, + "ewc_loss": 0.05902998894453049, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026559288380667567, + "grad_norm": 6.841073036193848, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8688311576843262, + "num_tokens": 356580873.0, + "step": 9344 + }, + { + "epoch": 1.188780053428317, + "ewc_loss": 0.059073854237794876, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002660315076354891, + "grad_norm": 6.881475448608398, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8600075840950012, + "num_tokens": 356613622.0, + "step": 9345 + }, + { + "epoch": 1.1889072637069076, + "ewc_loss": 0.0590553879737854, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026584684383124113, + "grad_norm": 6.846684455871582, + "learning_rate": 1e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8395446538925171, + "num_tokens": 356660058.0, + "step": 9346 + }, + { + "epoch": 1.189034473985498, + "ewc_loss": 0.05906764045357704, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000265969370957464, + "grad_norm": 6.819132328033447, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8606582880020142, + "num_tokens": 356701614.0, + "step": 9347 + }, + { + "epoch": 1.1891616842640884, + "ewc_loss": 0.05908110365271568, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026610400527715683, + "grad_norm": 6.878235340118408, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8769959807395935, + "num_tokens": 356737105.0, + "step": 9348 + }, + { + "epoch": 1.189288894542679, + "ewc_loss": 0.058966487646102905, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002649578673299402, + "grad_norm": 6.797886848449707, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8516508340835571, + "num_tokens": 356779382.0, + "step": 9349 + }, + { + "epoch": 1.1894161048212695, + "ewc_loss": 0.05920369178056717, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002673298877198249, + "grad_norm": 6.875495910644531, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8579272627830505, + "num_tokens": 356813836.0, + "step": 9350 + }, + { + "epoch": 1.18954331509986, + "ewc_loss": 0.05900883674621582, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026538135716691613, + "grad_norm": 6.819615840911865, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8522700667381287, + "num_tokens": 356852591.0, + "step": 9351 + }, + { + "epoch": 1.1896705253784505, + "ewc_loss": 0.05920708179473877, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000267363793682307, + "grad_norm": 6.854574203491211, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.861152172088623, + "num_tokens": 356889940.0, + "step": 9352 + }, + { + "epoch": 1.189797735657041, + "ewc_loss": 0.059105999767780304, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002663529885467142, + "grad_norm": 6.815521240234375, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8649542331695557, + "num_tokens": 356927878.0, + "step": 9353 + }, + { + "epoch": 1.1899249459356316, + "ewc_loss": 0.059172000735998154, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026701297610998154, + "grad_norm": 6.896871089935303, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.87043297290802, + "num_tokens": 356963339.0, + "step": 9354 + }, + { + "epoch": 1.1900521562142221, + "ewc_loss": 0.05912270396947861, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002665199863258749, + "grad_norm": 6.769741535186768, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8656675815582275, + "num_tokens": 357002999.0, + "step": 9355 + }, + { + "epoch": 1.1901793664928126, + "ewc_loss": 0.05925025790929794, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026779554900713265, + "grad_norm": 6.905407428741455, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8650897145271301, + "num_tokens": 357044305.0, + "step": 9356 + }, + { + "epoch": 1.1903065767714032, + "ewc_loss": 0.05907373130321503, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002660302852746099, + "grad_norm": 6.780529022216797, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8637476563453674, + "num_tokens": 357082197.0, + "step": 9357 + }, + { + "epoch": 1.1904337870499937, + "ewc_loss": 0.059222884476184845, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002675217983778566, + "grad_norm": 6.8892388343811035, + "learning_rate": 1e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8436039686203003, + "num_tokens": 357119541.0, + "step": 9358 + }, + { + "epoch": 1.1905609973285842, + "ewc_loss": 0.05909625440835953, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002662555198185146, + "grad_norm": 6.7995924949646, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8829951882362366, + "num_tokens": 357159176.0, + "step": 9359 + }, + { + "epoch": 1.1906882076071748, + "ewc_loss": 0.059293631464242935, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026822928339242935, + "grad_norm": 6.856896877288818, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8606088161468506, + "num_tokens": 357199204.0, + "step": 9360 + }, + { + "epoch": 1.1908154178857653, + "ewc_loss": 0.05909397453069687, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026623273151926696, + "grad_norm": 6.762790203094482, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8771325945854187, + "num_tokens": 357236900.0, + "step": 9361 + }, + { + "epoch": 1.1909426281643556, + "ewc_loss": 0.059319283813238144, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000268485804554075, + "grad_norm": 6.8221049308776855, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8627308011054993, + "num_tokens": 357271622.0, + "step": 9362 + }, + { + "epoch": 1.1910698384429461, + "ewc_loss": 0.05914535000920296, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002667464723344892, + "grad_norm": 6.808581829071045, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8737161755561829, + "num_tokens": 357311145.0, + "step": 9363 + }, + { + "epoch": 1.1911970487215366, + "ewc_loss": 0.05917271599173546, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670201356522739, + "grad_norm": 6.851802349090576, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8791784048080444, + "num_tokens": 357344609.0, + "step": 9364 + }, + { + "epoch": 1.1913242590001272, + "ewc_loss": 0.05918732285499573, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002671661786735058, + "grad_norm": 6.792566776275635, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.858603298664093, + "num_tokens": 357378896.0, + "step": 9365 + }, + { + "epoch": 1.1914514692787177, + "ewc_loss": 0.05918402969837189, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002671332622412592, + "grad_norm": 6.828258037567139, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8658588528633118, + "num_tokens": 357424730.0, + "step": 9366 + }, + { + "epoch": 1.1915786795573082, + "ewc_loss": 0.05906021595001221, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026589512708596885, + "grad_norm": 6.793764591217041, + "learning_rate": 1e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8441294431686401, + "num_tokens": 357461152.0, + "step": 9367 + }, + { + "epoch": 1.1917058898358988, + "ewc_loss": 0.059233710169792175, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026763006462715566, + "grad_norm": 6.873271465301514, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8546543121337891, + "num_tokens": 357502305.0, + "step": 9368 + }, + { + "epoch": 1.1918331001144893, + "ewc_loss": 0.059177882969379425, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026707182405516505, + "grad_norm": 6.903702259063721, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8561222553253174, + "num_tokens": 357539187.0, + "step": 9369 + }, + { + "epoch": 1.1919603103930798, + "ewc_loss": 0.05914558097720146, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002667487715370953, + "grad_norm": 6.881224632263184, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8514590859413147, + "num_tokens": 357569520.0, + "step": 9370 + }, + { + "epoch": 1.1920875206716703, + "ewc_loss": 0.05919395759701729, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026723253540694714, + "grad_norm": 6.865557670593262, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8611159324645996, + "num_tokens": 357605253.0, + "step": 9371 + }, + { + "epoch": 1.1922147309502609, + "ewc_loss": 0.059104159474372864, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002663345658220351, + "grad_norm": 6.839933395385742, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8724859952926636, + "num_tokens": 357639701.0, + "step": 9372 + }, + { + "epoch": 1.1923419412288512, + "ewc_loss": 0.05912385508418083, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002665315114427358, + "grad_norm": 6.793375015258789, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8706823587417603, + "num_tokens": 357675611.0, + "step": 9373 + }, + { + "epoch": 1.1924691515074417, + "ewc_loss": 0.059242576360702515, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026771874399855733, + "grad_norm": 6.868422508239746, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8744677305221558, + "num_tokens": 357708521.0, + "step": 9374 + }, + { + "epoch": 1.1925963617860322, + "ewc_loss": 0.05908804386854172, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026617341791279614, + "grad_norm": 6.793700218200684, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8523911237716675, + "num_tokens": 357749970.0, + "step": 9375 + }, + { + "epoch": 1.1927235720646228, + "ewc_loss": 0.059238139539957047, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002676743606571108, + "grad_norm": 6.913270950317383, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8724833726882935, + "num_tokens": 357783287.0, + "step": 9376 + }, + { + "epoch": 1.1928507823432133, + "ewc_loss": 0.05909569561481476, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002662499318830669, + "grad_norm": 6.843535423278809, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8632480502128601, + "num_tokens": 357818883.0, + "step": 9377 + }, + { + "epoch": 1.1929779926218038, + "ewc_loss": 0.05914025381207466, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002666955115273595, + "grad_norm": 6.7811126708984375, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8751819729804993, + "num_tokens": 357862862.0, + "step": 9378 + }, + { + "epoch": 1.1931052029003943, + "ewc_loss": 0.05921970307826996, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026749001699499786, + "grad_norm": 6.913559436798096, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8515563607215881, + "num_tokens": 357901574.0, + "step": 9379 + }, + { + "epoch": 1.1932324131789849, + "ewc_loss": 0.059052418917417526, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026581715792417526, + "grad_norm": 6.793666362762451, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.862675666809082, + "num_tokens": 357941822.0, + "step": 9380 + }, + { + "epoch": 1.1933596234575754, + "ewc_loss": 0.05927550047636032, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026804799563251436, + "grad_norm": 6.849367618560791, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8547719717025757, + "num_tokens": 357982820.0, + "step": 9381 + }, + { + "epoch": 1.193486833736166, + "ewc_loss": 0.059107549488544464, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026636847178451717, + "grad_norm": 6.833548545837402, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8582871556282043, + "num_tokens": 358020491.0, + "step": 9382 + }, + { + "epoch": 1.1936140440147565, + "ewc_loss": 0.05921529233455658, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026744589558802545, + "grad_norm": 6.819124221801758, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8632633090019226, + "num_tokens": 358059775.0, + "step": 9383 + }, + { + "epoch": 1.193741254293347, + "ewc_loss": 0.05921812355518341, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002674741845112294, + "grad_norm": 6.838761806488037, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8700488805770874, + "num_tokens": 358098672.0, + "step": 9384 + }, + { + "epoch": 1.1938684645719375, + "ewc_loss": 0.05918329209089279, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026712589897215366, + "grad_norm": 6.791396617889404, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8558061122894287, + "num_tokens": 358140850.0, + "step": 9385 + }, + { + "epoch": 1.193995674850528, + "ewc_loss": 0.05919615924358368, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002672545379027724, + "grad_norm": 6.903677940368652, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8684632182121277, + "num_tokens": 358175503.0, + "step": 9386 + }, + { + "epoch": 1.1941228851291183, + "ewc_loss": 0.05916431546211243, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026693614199757576, + "grad_norm": 6.848165512084961, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8642299175262451, + "num_tokens": 358210615.0, + "step": 9387 + }, + { + "epoch": 1.1942500954077089, + "ewc_loss": 0.05910150334239006, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002663079940248281, + "grad_norm": 6.819348335266113, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8642171025276184, + "num_tokens": 358246605.0, + "step": 9388 + }, + { + "epoch": 1.1943773056862994, + "ewc_loss": 0.05905827507376671, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002658757148310542, + "grad_norm": 6.828589916229248, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8587552309036255, + "num_tokens": 358283136.0, + "step": 9389 + }, + { + "epoch": 1.19450451596489, + "ewc_loss": 0.05909545719623566, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000266247516265139, + "grad_norm": 6.807439804077148, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8713526725769043, + "num_tokens": 358325437.0, + "step": 9390 + }, + { + "epoch": 1.1946317262434805, + "ewc_loss": 0.059126805514097214, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002665610227268189, + "grad_norm": 6.847663402557373, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8625295162200928, + "num_tokens": 358365360.0, + "step": 9391 + }, + { + "epoch": 1.194758936522071, + "ewc_loss": 0.0593307763338089, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002661593025550246, + "grad_norm": 6.7930755615234375, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8783425092697144, + "num_tokens": 358406502.0, + "step": 9392 + }, + { + "epoch": 1.1948861468006615, + "ewc_loss": 0.05924360454082489, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026772901765070856, + "grad_norm": 6.9138360023498535, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.859539270401001, + "num_tokens": 358437220.0, + "step": 9393 + }, + { + "epoch": 1.195013357079252, + "ewc_loss": 0.05904879420995712, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026578089455142617, + "grad_norm": 6.810568809509277, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8505560755729675, + "num_tokens": 358475314.0, + "step": 9394 + }, + { + "epoch": 1.1951405673578426, + "ewc_loss": 0.05921278893947601, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026742086629383266, + "grad_norm": 6.828246593475342, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8718898296356201, + "num_tokens": 358520777.0, + "step": 9395 + }, + { + "epoch": 1.195267777636433, + "ewc_loss": 0.05919107794761658, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026720372261479497, + "grad_norm": 6.833374500274658, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.88196861743927, + "num_tokens": 358557973.0, + "step": 9396 + }, + { + "epoch": 1.1953949879150234, + "ewc_loss": 0.059160567820072174, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026689862716011703, + "grad_norm": 6.8382463455200195, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8698687553405762, + "num_tokens": 358596795.0, + "step": 9397 + }, + { + "epoch": 1.195522198193614, + "ewc_loss": 0.059171319007873535, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670061367098242, + "grad_norm": 6.840892791748047, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8629791140556335, + "num_tokens": 358632146.0, + "step": 9398 + }, + { + "epoch": 1.1956494084722045, + "ewc_loss": 0.059125471860170364, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026654769317246974, + "grad_norm": 6.856576442718506, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8703277111053467, + "num_tokens": 358672194.0, + "step": 9399 + }, + { + "epoch": 1.195776618750795, + "ewc_loss": 0.05910941958427429, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026638718554750085, + "grad_norm": 6.836101531982422, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8702021837234497, + "num_tokens": 358710192.0, + "step": 9400 + }, + { + "epoch": 1.1959038290293855, + "ewc_loss": 0.05911194160580635, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002664123894646764, + "grad_norm": 6.843316078186035, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8490618467330933, + "num_tokens": 358749284.0, + "step": 9401 + }, + { + "epoch": 1.196031039307976, + "ewc_loss": 0.05904968082904816, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026578980032354593, + "grad_norm": 6.875524520874023, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8578706979751587, + "num_tokens": 358782830.0, + "step": 9402 + }, + { + "epoch": 1.1961582495865666, + "ewc_loss": 0.05909070372581482, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026619998971000314, + "grad_norm": 6.829151153564453, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8610896468162537, + "num_tokens": 358820696.0, + "step": 9403 + }, + { + "epoch": 1.196285459865157, + "ewc_loss": 0.05908658355474472, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002661587786860764, + "grad_norm": 6.924637794494629, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8694882988929749, + "num_tokens": 358853616.0, + "step": 9404 + }, + { + "epoch": 1.1964126701437476, + "ewc_loss": 0.05903557687997818, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026564873405732214, + "grad_norm": 6.7387919425964355, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8719321489334106, + "num_tokens": 358893420.0, + "step": 9405 + }, + { + "epoch": 1.1965398804223382, + "ewc_loss": 0.05925057828426361, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002677987504284829, + "grad_norm": 6.948585033416748, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8647574782371521, + "num_tokens": 358930704.0, + "step": 9406 + }, + { + "epoch": 1.1966670907009287, + "ewc_loss": 0.05899583548307419, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002652513503562659, + "grad_norm": 6.7706170082092285, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.876176655292511, + "num_tokens": 358967730.0, + "step": 9407 + }, + { + "epoch": 1.1967943009795192, + "ewc_loss": 0.05929877609014511, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002682807098608464, + "grad_norm": 6.86684513092041, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8756768703460693, + "num_tokens": 359007241.0, + "step": 9408 + }, + { + "epoch": 1.1969215112581097, + "ewc_loss": 0.058992065489292145, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002652136026881635, + "grad_norm": 6.809850692749023, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8496702313423157, + "num_tokens": 359045573.0, + "step": 9409 + }, + { + "epoch": 1.1970487215367003, + "ewc_loss": 0.05915794149041176, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026687237550504506, + "grad_norm": 6.8648457527160645, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8658883571624756, + "num_tokens": 359083420.0, + "step": 9410 + }, + { + "epoch": 1.1971759318152906, + "ewc_loss": 0.059097640216350555, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026626934413798153, + "grad_norm": 6.859078407287598, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8765119314193726, + "num_tokens": 359117555.0, + "step": 9411 + }, + { + "epoch": 1.1973031420938811, + "ewc_loss": 0.05909361690282822, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026622912264429033, + "grad_norm": 6.829917907714844, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8603717684745789, + "num_tokens": 359154597.0, + "step": 9412 + }, + { + "epoch": 1.1974303523724716, + "ewc_loss": 0.05917948856949806, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670878602657467, + "grad_norm": 6.8561882972717285, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8566742539405823, + "num_tokens": 359190074.0, + "step": 9413 + }, + { + "epoch": 1.1975575626510622, + "ewc_loss": 0.05907060205936432, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026599899865686893, + "grad_norm": 6.807527542114258, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8505460619926453, + "num_tokens": 359232711.0, + "step": 9414 + }, + { + "epoch": 1.1976847729296527, + "ewc_loss": 0.059267718344926834, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002679701428860426, + "grad_norm": 6.942170143127441, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8620212078094482, + "num_tokens": 359265582.0, + "step": 9415 + }, + { + "epoch": 1.1978119832082432, + "ewc_loss": 0.05910981446504593, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002663911145646125, + "grad_norm": 6.8019819259643555, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.871442437171936, + "num_tokens": 359303177.0, + "step": 9416 + }, + { + "epoch": 1.1979391934868338, + "ewc_loss": 0.0592469796538353, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002677627489902079, + "grad_norm": 6.859700679779053, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8730650544166565, + "num_tokens": 359338712.0, + "step": 9417 + }, + { + "epoch": 1.1980664037654243, + "ewc_loss": 0.05907847359776497, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026607769541442394, + "grad_norm": 6.8667521476745605, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8700859546661377, + "num_tokens": 359372849.0, + "step": 9418 + }, + { + "epoch": 1.1981936140440148, + "ewc_loss": 0.05915575847029686, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002668505476322025, + "grad_norm": 6.875367641448975, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8672360777854919, + "num_tokens": 359407981.0, + "step": 9419 + }, + { + "epoch": 1.1983208243226053, + "ewc_loss": 0.05918681621551514, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026716114371083677, + "grad_norm": 6.828943729400635, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8434270620346069, + "num_tokens": 359450180.0, + "step": 9420 + }, + { + "epoch": 1.1984480346011959, + "ewc_loss": 0.0591629296541214, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002669222594704479, + "grad_norm": 6.804375648498535, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8625996112823486, + "num_tokens": 359487909.0, + "step": 9421 + }, + { + "epoch": 1.1985752448797862, + "ewc_loss": 0.05923142284154892, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026760718901641667, + "grad_norm": 6.90458869934082, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8746142387390137, + "num_tokens": 359522609.0, + "step": 9422 + }, + { + "epoch": 1.1987024551583767, + "ewc_loss": 0.05935361236333847, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026638770941644907, + "grad_norm": 6.803892135620117, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8601821064949036, + "num_tokens": 359560735.0, + "step": 9423 + }, + { + "epoch": 1.1988296654369672, + "ewc_loss": 0.05923382192850113, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000267631170572713, + "grad_norm": 6.818810939788818, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8565306663513184, + "num_tokens": 359602994.0, + "step": 9424 + }, + { + "epoch": 1.1989568757155578, + "ewc_loss": 0.05918607488274574, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002671537222340703, + "grad_norm": 6.847595691680908, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8627574443817139, + "num_tokens": 359638864.0, + "step": 9425 + }, + { + "epoch": 1.1990840859941483, + "ewc_loss": 0.059147678315639496, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002667697553988546, + "grad_norm": 6.861629486083984, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8709739446640015, + "num_tokens": 359674202.0, + "step": 9426 + }, + { + "epoch": 1.1992112962727388, + "ewc_loss": 0.05918332189321518, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026712619001045823, + "grad_norm": 6.827798366546631, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8678775429725647, + "num_tokens": 359710408.0, + "step": 9427 + }, + { + "epoch": 1.1993385065513293, + "ewc_loss": 0.05925234407186508, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026781641645357013, + "grad_norm": 6.803841590881348, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8714040517807007, + "num_tokens": 359749068.0, + "step": 9428 + }, + { + "epoch": 1.1994657168299199, + "ewc_loss": 0.05924437195062637, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026773670106194913, + "grad_norm": 6.847628116607666, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8634142875671387, + "num_tokens": 359788449.0, + "step": 9429 + }, + { + "epoch": 1.1995929271085104, + "ewc_loss": 0.05919012427330017, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002671942056622356, + "grad_norm": 6.856780052185059, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8576790690422058, + "num_tokens": 359824167.0, + "step": 9430 + }, + { + "epoch": 1.199720137387101, + "ewc_loss": 0.05917344242334366, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670273825060576, + "grad_norm": 6.828784942626953, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8578439950942993, + "num_tokens": 359861476.0, + "step": 9431 + }, + { + "epoch": 1.1998473476656915, + "ewc_loss": 0.059228844940662384, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000267581403022632, + "grad_norm": 6.850601673126221, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8608778715133667, + "num_tokens": 359906463.0, + "step": 9432 + }, + { + "epoch": 1.199974557944282, + "ewc_loss": 0.0591396689414978, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002666896616574377, + "grad_norm": 6.81626033782959, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8662075996398926, + "num_tokens": 359940576.0, + "step": 9433 + }, + { + "epoch": 1.2001017682228725, + "ewc_loss": 0.05922635644674301, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002675565192475915, + "grad_norm": 6.855737686157227, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8739591240882874, + "num_tokens": 359973616.0, + "step": 9434 + }, + { + "epoch": 1.200228978501463, + "ewc_loss": 0.05923881381750107, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026768111274577677, + "grad_norm": 6.838042259216309, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8759634494781494, + "num_tokens": 360011059.0, + "step": 9435 + }, + { + "epoch": 1.2003561887800533, + "ewc_loss": 0.059289999306201935, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026819296181201935, + "grad_norm": 6.883696556091309, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8721833229064941, + "num_tokens": 360048307.0, + "step": 9436 + }, + { + "epoch": 1.2004833990586439, + "ewc_loss": 0.05906974524259567, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002659904130268842, + "grad_norm": 6.780501365661621, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8624179363250732, + "num_tokens": 360086946.0, + "step": 9437 + }, + { + "epoch": 1.2006106093372344, + "ewc_loss": 0.05926872417330742, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002679802128113806, + "grad_norm": 6.842954635620117, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8675412535667419, + "num_tokens": 360121908.0, + "step": 9438 + }, + { + "epoch": 1.200737819615825, + "ewc_loss": 0.05921278893947601, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002674208371900022, + "grad_norm": 6.863078594207764, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8547511696815491, + "num_tokens": 360164349.0, + "step": 9439 + }, + { + "epoch": 1.2008650298944155, + "ewc_loss": 0.05916595086455345, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000266952469246462, + "grad_norm": 6.864579200744629, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8785248398780823, + "num_tokens": 360197660.0, + "step": 9440 + }, + { + "epoch": 1.200992240173006, + "ewc_loss": 0.05922957509756088, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026758870808407664, + "grad_norm": 6.872281551361084, + "learning_rate": 1e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8420329093933105, + "num_tokens": 360240297.0, + "step": 9441 + }, + { + "epoch": 1.2011194504515965, + "ewc_loss": 0.05915047973394394, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000266797753283754, + "grad_norm": 6.827700614929199, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.853205680847168, + "num_tokens": 360281566.0, + "step": 9442 + }, + { + "epoch": 1.201246660730187, + "ewc_loss": 0.05921965837478638, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026748955133371055, + "grad_norm": 6.887569904327393, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.868213415145874, + "num_tokens": 360313927.0, + "step": 9443 + }, + { + "epoch": 1.2013738710087776, + "ewc_loss": 0.05917045474052429, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000266997521976009, + "grad_norm": 6.869374752044678, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8656071424484253, + "num_tokens": 360349369.0, + "step": 9444 + }, + { + "epoch": 1.201501081287368, + "ewc_loss": 0.05916386842727661, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002669316600076854, + "grad_norm": 6.8541951179504395, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8627920150756836, + "num_tokens": 360386996.0, + "step": 9445 + }, + { + "epoch": 1.2016282915659584, + "ewc_loss": 0.05917320400476456, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670250250957906, + "grad_norm": 6.800470352172852, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8681015968322754, + "num_tokens": 360424883.0, + "step": 9446 + }, + { + "epoch": 1.201755501844549, + "ewc_loss": 0.059274882078170776, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002680417674127966, + "grad_norm": 6.909257888793945, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8677600026130676, + "num_tokens": 360457992.0, + "step": 9447 + }, + { + "epoch": 1.2018827121231395, + "ewc_loss": 0.05919753760099411, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002672683331184089, + "grad_norm": 6.898073673248291, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8483506441116333, + "num_tokens": 360497934.0, + "step": 9448 + }, + { + "epoch": 1.20200992240173, + "ewc_loss": 0.059179652482271194, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670894900802523, + "grad_norm": 6.822767734527588, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8598247766494751, + "num_tokens": 360533928.0, + "step": 9449 + }, + { + "epoch": 1.2021371326803205, + "ewc_loss": 0.05953604727983475, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026821205392479897, + "grad_norm": 6.942612171173096, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.865788459777832, + "num_tokens": 360571076.0, + "step": 9450 + }, + { + "epoch": 1.202264342958911, + "ewc_loss": 0.05914048105478287, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002666977816261351, + "grad_norm": 6.802506923675537, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8558396697044373, + "num_tokens": 360611957.0, + "step": 9451 + }, + { + "epoch": 1.2023915532375016, + "ewc_loss": 0.059460610151290894, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000269899086561054, + "grad_norm": 6.95720100402832, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8568356037139893, + "num_tokens": 360647823.0, + "step": 9452 + }, + { + "epoch": 1.202518763516092, + "ewc_loss": 0.05915438383817673, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026683681062422693, + "grad_norm": 6.873442649841309, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8550180792808533, + "num_tokens": 360688341.0, + "step": 9453 + }, + { + "epoch": 1.2026459737946826, + "ewc_loss": 0.05927261337637901, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002680190955288708, + "grad_norm": 6.855346202850342, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8554227352142334, + "num_tokens": 360730417.0, + "step": 9454 + }, + { + "epoch": 1.2027731840732732, + "ewc_loss": 0.059291768819093704, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026821065694093704, + "grad_norm": 6.881191253662109, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8428323268890381, + "num_tokens": 360771931.0, + "step": 9455 + }, + { + "epoch": 1.2029003943518637, + "ewc_loss": 0.05914977192878723, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000266790681052953, + "grad_norm": 6.798677444458008, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8614832758903503, + "num_tokens": 360811460.0, + "step": 9456 + }, + { + "epoch": 1.2030276046304542, + "ewc_loss": 0.05937357246875763, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002690287074074149, + "grad_norm": 6.906983375549316, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.876105546951294, + "num_tokens": 360849476.0, + "step": 9457 + }, + { + "epoch": 1.2031548149090447, + "ewc_loss": 0.05915217846632004, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002668147499207407, + "grad_norm": 6.801327228546143, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8526091575622559, + "num_tokens": 360891178.0, + "step": 9458 + }, + { + "epoch": 1.2032820251876353, + "ewc_loss": 0.05938859283924103, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026917888317257166, + "grad_norm": 6.906374931335449, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8594217896461487, + "num_tokens": 360930503.0, + "step": 9459 + }, + { + "epoch": 1.2034092354662256, + "ewc_loss": 0.059153519570827484, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002668281667865813, + "grad_norm": 6.827098846435547, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.859673023223877, + "num_tokens": 360963730.0, + "step": 9460 + }, + { + "epoch": 1.203536445744816, + "ewc_loss": 0.05932557210326195, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000268548697931692, + "grad_norm": 6.869604587554932, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8500880002975464, + "num_tokens": 361003796.0, + "step": 9461 + }, + { + "epoch": 1.2036636560234066, + "ewc_loss": 0.05922556668519974, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002675486321095377, + "grad_norm": 6.830203056335449, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8625841736793518, + "num_tokens": 361044818.0, + "step": 9462 + }, + { + "epoch": 1.2037908663019972, + "ewc_loss": 0.05935109406709671, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002688039094209671, + "grad_norm": 6.8654398918151855, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8742282390594482, + "num_tokens": 361080692.0, + "step": 9463 + }, + { + "epoch": 1.2039180765805877, + "ewc_loss": 0.05928778648376465, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002681708137970418, + "grad_norm": 6.923684120178223, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8535891771316528, + "num_tokens": 361116609.0, + "step": 9464 + }, + { + "epoch": 1.2040452868591782, + "ewc_loss": 0.05923939496278763, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002676869335118681, + "grad_norm": 6.831959247589111, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8561314940452576, + "num_tokens": 361154559.0, + "step": 9465 + }, + { + "epoch": 1.2041724971377687, + "ewc_loss": 0.05936575308442116, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026895050541497767, + "grad_norm": 6.8737874031066895, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8719382882118225, + "num_tokens": 361191005.0, + "step": 9466 + }, + { + "epoch": 1.2042997074163593, + "ewc_loss": 0.059237122535705566, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026766417431645095, + "grad_norm": 6.83660888671875, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8610657453536987, + "num_tokens": 361230191.0, + "step": 9467 + }, + { + "epoch": 1.2044269176949498, + "ewc_loss": 0.059418532997369766, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026947830338031054, + "grad_norm": 6.921859264373779, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8558995723724365, + "num_tokens": 361269132.0, + "step": 9468 + }, + { + "epoch": 1.2045541279735403, + "ewc_loss": 0.05926969274878502, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002679899043869227, + "grad_norm": 6.926261901855469, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8552215099334717, + "num_tokens": 361306674.0, + "step": 9469 + }, + { + "epoch": 1.2046813382521309, + "ewc_loss": 0.059215012937784195, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002674431016203016, + "grad_norm": 6.8298139572143555, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8641809225082397, + "num_tokens": 361348309.0, + "step": 9470 + }, + { + "epoch": 1.2048085485307212, + "ewc_loss": 0.059294819831848145, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026824118685908616, + "grad_norm": 6.849823951721191, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.857131838798523, + "num_tokens": 361386824.0, + "step": 9471 + }, + { + "epoch": 1.2049357588093117, + "ewc_loss": 0.059232354164123535, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002676165022421628, + "grad_norm": 6.849388122558594, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8429715633392334, + "num_tokens": 361426590.0, + "step": 9472 + }, + { + "epoch": 1.2050629690879022, + "ewc_loss": 0.059355899691581726, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002688519889488816, + "grad_norm": 6.874788761138916, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8490861058235168, + "num_tokens": 361463259.0, + "step": 9473 + }, + { + "epoch": 1.2051901793664928, + "ewc_loss": 0.05954662337899208, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002683177881408483, + "grad_norm": 6.915412425994873, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8713878393173218, + "num_tokens": 361498750.0, + "step": 9474 + }, + { + "epoch": 1.2053173896450833, + "ewc_loss": 0.05924475938081741, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002677405718713999, + "grad_norm": 6.883549690246582, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8526456356048584, + "num_tokens": 361537101.0, + "step": 9475 + }, + { + "epoch": 1.2054445999236738, + "ewc_loss": 0.059269845485687256, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002679914468899369, + "grad_norm": 6.869589805603027, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8645979166030884, + "num_tokens": 361572130.0, + "step": 9476 + }, + { + "epoch": 1.2055718102022643, + "ewc_loss": 0.05922139436006546, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026750689721666276, + "grad_norm": 6.887427806854248, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8775926828384399, + "num_tokens": 361608484.0, + "step": 9477 + }, + { + "epoch": 1.2056990204808549, + "ewc_loss": 0.059341005980968475, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002687030064407736, + "grad_norm": 6.8351359367370605, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8803927302360535, + "num_tokens": 361650361.0, + "step": 9478 + }, + { + "epoch": 1.2058262307594454, + "ewc_loss": 0.05932916700839996, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026858464116230607, + "grad_norm": 6.881039619445801, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8707900047302246, + "num_tokens": 361680670.0, + "step": 9479 + }, + { + "epoch": 1.205953441038036, + "ewc_loss": 0.059521786868572235, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026806944515556097, + "grad_norm": 6.851192474365234, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8682107925415039, + "num_tokens": 361723504.0, + "step": 9480 + }, + { + "epoch": 1.2060806513166265, + "ewc_loss": 0.059609413146972656, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026894567417912185, + "grad_norm": 6.931999206542969, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.869242787361145, + "num_tokens": 361765850.0, + "step": 9481 + }, + { + "epoch": 1.206207861595217, + "ewc_loss": 0.05917561054229736, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026704909396357834, + "grad_norm": 6.8697004318237305, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8544278144836426, + "num_tokens": 361804375.0, + "step": 9482 + }, + { + "epoch": 1.2063350718738075, + "ewc_loss": 0.05956588312983513, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026851039729081094, + "grad_norm": 6.925934314727783, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8599793910980225, + "num_tokens": 361834714.0, + "step": 9483 + }, + { + "epoch": 1.206462282152398, + "ewc_loss": 0.05934719741344452, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026632353547029197, + "grad_norm": 6.869072437286377, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8624485731124878, + "num_tokens": 361877568.0, + "step": 9484 + }, + { + "epoch": 1.2065894924309883, + "ewc_loss": 0.05955490469932556, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002684005885384977, + "grad_norm": 6.867377281188965, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8585517406463623, + "num_tokens": 361914785.0, + "step": 9485 + }, + { + "epoch": 1.2067167027095789, + "ewc_loss": 0.05942763760685921, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026712793624028563, + "grad_norm": 6.863027572631836, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8702474236488342, + "num_tokens": 361956781.0, + "step": 9486 + }, + { + "epoch": 1.2068439129881694, + "ewc_loss": 0.05918021500110626, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002670951362233609, + "grad_norm": 6.854092121124268, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8731153607368469, + "num_tokens": 361997408.0, + "step": 9487 + }, + { + "epoch": 1.20697112326676, + "ewc_loss": 0.05945531278848648, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026740471366792917, + "grad_norm": 6.817395210266113, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8628722429275513, + "num_tokens": 362030226.0, + "step": 9488 + }, + { + "epoch": 1.2070983335453505, + "ewc_loss": 0.059182047843933105, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026711347163654864, + "grad_norm": 6.884571552276611, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8733834028244019, + "num_tokens": 362068370.0, + "step": 9489 + }, + { + "epoch": 1.207225543823941, + "ewc_loss": 0.0591784231364727, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026707720826379955, + "grad_norm": 6.801769733428955, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8708336353302002, + "num_tokens": 362108569.0, + "step": 9490 + }, + { + "epoch": 1.2073527541025315, + "ewc_loss": 0.05928754806518555, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002681684272829443, + "grad_norm": 6.986264228820801, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8546407222747803, + "num_tokens": 362140476.0, + "step": 9491 + }, + { + "epoch": 1.207479964381122, + "ewc_loss": 0.05918123573064804, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026710532256402075, + "grad_norm": 6.903810501098633, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8693082332611084, + "num_tokens": 362171132.0, + "step": 9492 + }, + { + "epoch": 1.2076071746597126, + "ewc_loss": 0.05912337079644203, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026652668020688, + "grad_norm": 6.826966285705566, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8616663217544556, + "num_tokens": 362204791.0, + "step": 9493 + }, + { + "epoch": 1.207734384938303, + "ewc_loss": 0.05924932658672333, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002677862357813865, + "grad_norm": 6.8674845695495605, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8656802177429199, + "num_tokens": 362242360.0, + "step": 9494 + }, + { + "epoch": 1.2078615952168934, + "ewc_loss": 0.059157975018024445, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002668726956471801, + "grad_norm": 6.823836326599121, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8655816316604614, + "num_tokens": 362276558.0, + "step": 9495 + }, + { + "epoch": 1.207988805495484, + "ewc_loss": 0.05927419662475586, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002680349280126393, + "grad_norm": 6.924738883972168, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8482365608215332, + "num_tokens": 362307055.0, + "step": 9496 + }, + { + "epoch": 1.2081160157740745, + "ewc_loss": 0.05911189317703247, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026641192380338907, + "grad_norm": 6.817495822906494, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.873262882232666, + "num_tokens": 362342433.0, + "step": 9497 + }, + { + "epoch": 1.208243226052665, + "ewc_loss": 0.05932648479938507, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002685578365344554, + "grad_norm": 6.903631210327148, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8845479488372803, + "num_tokens": 362377008.0, + "step": 9498 + }, + { + "epoch": 1.2083704363312555, + "ewc_loss": 0.05904638022184372, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000265756796579808, + "grad_norm": 6.876667499542236, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8774514198303223, + "num_tokens": 362412089.0, + "step": 9499 + }, + { + "epoch": 1.208497646609846, + "ewc_loss": 0.05918431282043457, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002671360853128135, + "grad_norm": 6.829912185668945, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8546193838119507, + "num_tokens": 362455817.0, + "step": 9500 + }, + { + "epoch": 1.2086248568884366, + "ewc_loss": 0.059421107172966, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002670626563485712, + "grad_norm": 6.785963535308838, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8644704818725586, + "num_tokens": 362498859.0, + "step": 9501 + }, + { + "epoch": 1.208752067167027, + "ewc_loss": 0.05951499938964844, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002680015459191054, + "grad_norm": 6.852062225341797, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8628045916557312, + "num_tokens": 362542346.0, + "step": 9502 + }, + { + "epoch": 1.2088792774456176, + "ewc_loss": 0.05941351503133774, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002669866953510791, + "grad_norm": 6.9075775146484375, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8806401491165161, + "num_tokens": 362575360.0, + "step": 9503 + }, + { + "epoch": 1.2090064877242082, + "ewc_loss": 0.05948791652917862, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002677307347767055, + "grad_norm": 6.938801288604736, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8630028963088989, + "num_tokens": 362614737.0, + "step": 9504 + }, + { + "epoch": 1.2091336980027987, + "ewc_loss": 0.059377413243055344, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002666256914380938, + "grad_norm": 6.808418273925781, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8586630821228027, + "num_tokens": 362653756.0, + "step": 9505 + }, + { + "epoch": 1.2092609082813892, + "ewc_loss": 0.05951984226703644, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002680499746929854, + "grad_norm": 6.8818135261535645, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.863888680934906, + "num_tokens": 362690745.0, + "step": 9506 + }, + { + "epoch": 1.2093881185599797, + "ewc_loss": 0.05937119573354721, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002665635256562382, + "grad_norm": 6.813683986663818, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8719006776809692, + "num_tokens": 362726328.0, + "step": 9507 + }, + { + "epoch": 1.2095153288385703, + "ewc_loss": 0.05954828858375549, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002683344646357, + "grad_norm": 6.917882919311523, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8663817048072815, + "num_tokens": 362764285.0, + "step": 9508 + }, + { + "epoch": 1.2096425391171606, + "ewc_loss": 0.05931858718395233, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002660374157130718, + "grad_norm": 6.881255149841309, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8447672724723816, + "num_tokens": 362794697.0, + "step": 9509 + }, + { + "epoch": 1.209769749395751, + "ewc_loss": 0.059460077434778214, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002674523275345564, + "grad_norm": 6.861032485961914, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8611344695091248, + "num_tokens": 362831618.0, + "step": 9510 + }, + { + "epoch": 1.2098969596743416, + "ewc_loss": 0.05937642604112625, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026661582523956895, + "grad_norm": 6.869007110595703, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.871200680732727, + "num_tokens": 362866404.0, + "step": 9511 + }, + { + "epoch": 1.2100241699529322, + "ewc_loss": 0.05944414436817169, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026729298406280577, + "grad_norm": 6.833916187286377, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8792957067489624, + "num_tokens": 362902554.0, + "step": 9512 + }, + { + "epoch": 1.2101513802315227, + "ewc_loss": 0.05954600125551224, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026831155992113054, + "grad_norm": 6.852358818054199, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8580845594406128, + "num_tokens": 362944169.0, + "step": 9513 + }, + { + "epoch": 1.2102785905101132, + "ewc_loss": 0.059436313807964325, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002672147238627076, + "grad_norm": 6.858251094818115, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8423272371292114, + "num_tokens": 362983789.0, + "step": 9514 + }, + { + "epoch": 1.2104058007887037, + "ewc_loss": 0.05952075123786926, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002680590841919184, + "grad_norm": 6.882716178894043, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8698610067367554, + "num_tokens": 363021551.0, + "step": 9515 + }, + { + "epoch": 1.2105330110672943, + "ewc_loss": 0.059455759823322296, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002674091374501586, + "grad_norm": 6.802558422088623, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8461474776268005, + "num_tokens": 363064951.0, + "step": 9516 + }, + { + "epoch": 1.2106602213458848, + "ewc_loss": 0.05951845645904541, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.000268036121269688, + "grad_norm": 6.845914840698242, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8624541759490967, + "num_tokens": 363110876.0, + "step": 9517 + }, + { + "epoch": 1.2107874316244753, + "ewc_loss": 0.05945903807878494, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002674419665709138, + "grad_norm": 6.804042339324951, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8608689308166504, + "num_tokens": 363155310.0, + "step": 9518 + }, + { + "epoch": 1.2109146419030659, + "ewc_loss": 0.05949399620294571, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002677915326785296, + "grad_norm": 6.843635082244873, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8749110102653503, + "num_tokens": 363193166.0, + "step": 9519 + }, + { + "epoch": 1.2110418521816562, + "ewc_loss": 0.0594848096370697, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002676996518857777, + "grad_norm": 6.881813049316406, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8436366319656372, + "num_tokens": 363225163.0, + "step": 9520 + }, + { + "epoch": 1.2111690624602467, + "ewc_loss": 0.05950742959976196, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026792584685608745, + "grad_norm": 6.891829490661621, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8614506125450134, + "num_tokens": 363264720.0, + "step": 9521 + }, + { + "epoch": 1.2112962727388372, + "ewc_loss": 0.059543535113334656, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002682869089767337, + "grad_norm": 6.81012487411499, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.847640872001648, + "num_tokens": 363305029.0, + "step": 9522 + }, + { + "epoch": 1.2114234830174277, + "ewc_loss": 0.05963212996721268, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026917288778349757, + "grad_norm": 6.896636486053467, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8708372116088867, + "num_tokens": 363344128.0, + "step": 9523 + }, + { + "epoch": 1.2115506932960183, + "ewc_loss": 0.05947478488087654, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002675994182936847, + "grad_norm": 6.881056785583496, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8624255657196045, + "num_tokens": 363385903.0, + "step": 9524 + }, + { + "epoch": 1.2116779035746088, + "ewc_loss": 0.05958431959152222, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002686947409529239, + "grad_norm": 6.869692325592041, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8643121719360352, + "num_tokens": 363425216.0, + "step": 9525 + }, + { + "epoch": 1.2118051138531993, + "ewc_loss": 0.05946546792984009, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026750622782856226, + "grad_norm": 6.926653861999512, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8733389377593994, + "num_tokens": 363462691.0, + "step": 9526 + }, + { + "epoch": 1.2119323241317899, + "ewc_loss": 0.05944094806909561, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026726105716079473, + "grad_norm": 6.978833198547363, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8479893207550049, + "num_tokens": 363495849.0, + "step": 9527 + }, + { + "epoch": 1.2120595344103804, + "ewc_loss": 0.05908265709877014, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026611951761879027, + "grad_norm": 6.810608386993408, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8617509603500366, + "num_tokens": 363535268.0, + "step": 9528 + }, + { + "epoch": 1.212186744688971, + "ewc_loss": 0.05951932817697525, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002680448233149946, + "grad_norm": 6.994755744934082, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8459157943725586, + "num_tokens": 363573523.0, + "step": 9529 + }, + { + "epoch": 1.2123139549675614, + "ewc_loss": 0.059202276170253754, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002648743393365294, + "grad_norm": 6.787441730499268, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8631564378738403, + "num_tokens": 363607545.0, + "step": 9530 + }, + { + "epoch": 1.212441165246152, + "ewc_loss": 0.05940013378858566, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002692942798603326, + "grad_norm": 6.936535358428955, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8650690317153931, + "num_tokens": 363648456.0, + "step": 9531 + }, + { + "epoch": 1.2125683755247425, + "ewc_loss": 0.05925533175468445, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026540490216575563, + "grad_norm": 6.809154033660889, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8728718161582947, + "num_tokens": 363688726.0, + "step": 9532 + }, + { + "epoch": 1.212695585803333, + "ewc_loss": 0.059319984167814255, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002684928185772151, + "grad_norm": 6.889146327972412, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8563396334648132, + "num_tokens": 363728557.0, + "step": 9533 + }, + { + "epoch": 1.2128227960819233, + "ewc_loss": 0.05916004627943039, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026689344667829573, + "grad_norm": 6.839732646942139, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8630778789520264, + "num_tokens": 363764191.0, + "step": 9534 + }, + { + "epoch": 1.2129500063605139, + "ewc_loss": 0.059239305555820465, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026768603129312396, + "grad_norm": 6.89906644821167, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8515681028366089, + "num_tokens": 363798126.0, + "step": 9535 + }, + { + "epoch": 1.2130772166391044, + "ewc_loss": 0.05941742658615112, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002670258400030434, + "grad_norm": 6.85543155670166, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8464570045471191, + "num_tokens": 363836037.0, + "step": 9536 + }, + { + "epoch": 1.213204426917695, + "ewc_loss": 0.0595901757478714, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002687533269636333, + "grad_norm": 6.84320068359375, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8842963576316833, + "num_tokens": 363872359.0, + "step": 9537 + }, + { + "epoch": 1.2133316371962855, + "ewc_loss": 0.05945985019207001, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026745005743578076, + "grad_norm": 6.825632572174072, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8572962880134583, + "num_tokens": 363914206.0, + "step": 9538 + }, + { + "epoch": 1.213458847474876, + "ewc_loss": 0.059511058032512665, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.000267962139332667, + "grad_norm": 6.8122406005859375, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8645222187042236, + "num_tokens": 363956147.0, + "step": 9539 + }, + { + "epoch": 1.2135860577534665, + "ewc_loss": 0.05950118228793144, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002678633900359273, + "grad_norm": 6.896337032318115, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8793597221374512, + "num_tokens": 363986328.0, + "step": 9540 + }, + { + "epoch": 1.213713268032057, + "ewc_loss": 0.05948847532272339, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002677363227121532, + "grad_norm": 6.839810371398926, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8600121736526489, + "num_tokens": 364024905.0, + "step": 9541 + }, + { + "epoch": 1.2138404783106476, + "ewc_loss": 0.059265121817588806, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002679441822692752, + "grad_norm": 6.842364311218262, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8633655309677124, + "num_tokens": 364068628.0, + "step": 9542 + }, + { + "epoch": 1.213967688589238, + "ewc_loss": 0.059514861553907394, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026800017803907394, + "grad_norm": 6.875979900360107, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8599914908409119, + "num_tokens": 364104261.0, + "step": 9543 + }, + { + "epoch": 1.2140948988678284, + "ewc_loss": 0.059583015739917755, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026868173154070973, + "grad_norm": 6.8553009033203125, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8594709634780884, + "num_tokens": 364147524.0, + "step": 9544 + }, + { + "epoch": 1.214222109146419, + "ewc_loss": 0.05950397253036499, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002678913006093353, + "grad_norm": 6.833535671234131, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8643821477890015, + "num_tokens": 364187715.0, + "step": 9545 + }, + { + "epoch": 1.2143493194250095, + "ewc_loss": 0.05956621468067169, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026851368602365255, + "grad_norm": 7.013169288635254, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8732423782348633, + "num_tokens": 364226081.0, + "step": 9546 + }, + { + "epoch": 1.2144765297036, + "ewc_loss": 0.05947088822722435, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026756044826470315, + "grad_norm": 6.838524341583252, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8627929091453552, + "num_tokens": 364268390.0, + "step": 9547 + }, + { + "epoch": 1.2146037399821905, + "ewc_loss": 0.05964664742350578, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002693180285859853, + "grad_norm": 6.9022064208984375, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8655911684036255, + "num_tokens": 364310176.0, + "step": 9548 + }, + { + "epoch": 1.214730950260781, + "ewc_loss": 0.05950196832418442, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002678712480701506, + "grad_norm": 6.881621837615967, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8740455508232117, + "num_tokens": 364345652.0, + "step": 9549 + }, + { + "epoch": 1.2148581605393716, + "ewc_loss": 0.059510473161935806, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002679562894627452, + "grad_norm": 6.937195301055908, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8584597110748291, + "num_tokens": 364383810.0, + "step": 9550 + }, + { + "epoch": 1.214985370817962, + "ewc_loss": 0.05958999693393707, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026875155162997544, + "grad_norm": 6.8684563636779785, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8671771287918091, + "num_tokens": 364421169.0, + "step": 9551 + }, + { + "epoch": 1.2151125810965526, + "ewc_loss": 0.05945972353219986, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002674488059710711, + "grad_norm": 6.970507621765137, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8574317097663879, + "num_tokens": 364452552.0, + "step": 9552 + }, + { + "epoch": 1.2152397913751432, + "ewc_loss": 0.05942833051085472, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026713486295193434, + "grad_norm": 6.915118217468262, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.862695574760437, + "num_tokens": 364486779.0, + "step": 9553 + }, + { + "epoch": 1.2153670016537337, + "ewc_loss": 0.05923296511173248, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026762261404655874, + "grad_norm": 6.883172035217285, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8735096454620361, + "num_tokens": 364526237.0, + "step": 9554 + }, + { + "epoch": 1.2154942119323242, + "ewc_loss": 0.05916622281074524, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026695517590269446, + "grad_norm": 7.0941338539123535, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8694198727607727, + "num_tokens": 364568446.0, + "step": 9555 + }, + { + "epoch": 1.2156214222109147, + "ewc_loss": 0.059046171605587006, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026575467200018466, + "grad_norm": 6.835309028625488, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8695209622383118, + "num_tokens": 364604448.0, + "step": 9556 + }, + { + "epoch": 1.2157486324895053, + "ewc_loss": 0.05936156585812569, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026890862500295043, + "grad_norm": 6.9794392585754395, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8609205484390259, + "num_tokens": 364642707.0, + "step": 9557 + }, + { + "epoch": 1.2158758427680956, + "ewc_loss": 0.05911678075790405, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026646076003089547, + "grad_norm": 6.870717525482178, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8582824468612671, + "num_tokens": 364678428.0, + "step": 9558 + }, + { + "epoch": 1.216003053046686, + "ewc_loss": 0.059319280087947845, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026848577545024455, + "grad_norm": 6.911115646362305, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8779267072677612, + "num_tokens": 364718915.0, + "step": 9559 + }, + { + "epoch": 1.2161302633252766, + "ewc_loss": 0.05926613137125969, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002679542812984437, + "grad_norm": 6.914510250091553, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8787188529968262, + "num_tokens": 364756030.0, + "step": 9560 + }, + { + "epoch": 1.2162574736038672, + "ewc_loss": 0.05925056338310242, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026779857580550015, + "grad_norm": 7.446624279022217, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8611923456192017, + "num_tokens": 364796158.0, + "step": 9561 + }, + { + "epoch": 1.2163846838824577, + "ewc_loss": 0.05891212448477745, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002644142077770084, + "grad_norm": 6.798027038574219, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8702693581581116, + "num_tokens": 364832722.0, + "step": 9562 + }, + { + "epoch": 1.2165118941610482, + "ewc_loss": 0.05939045548439026, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002691975387278944, + "grad_norm": 7.054983139038086, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8627184629440308, + "num_tokens": 364874120.0, + "step": 9563 + }, + { + "epoch": 1.2166391044396387, + "ewc_loss": 0.05884561687707901, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002637491561472416, + "grad_norm": 6.816395282745361, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8662375211715698, + "num_tokens": 364913166.0, + "step": 9564 + }, + { + "epoch": 1.2167663147182293, + "ewc_loss": 0.0593523234128952, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002688162203412503, + "grad_norm": 7.3045973777771, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.84581458568573, + "num_tokens": 364949586.0, + "step": 9565 + }, + { + "epoch": 1.2168935249968198, + "ewc_loss": 0.05887461453676224, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026403911761008203, + "grad_norm": 6.884438514709473, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8456038236618042, + "num_tokens": 364989070.0, + "step": 9566 + }, + { + "epoch": 1.2170207352754103, + "ewc_loss": 0.05930308252573013, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026832378352992237, + "grad_norm": 7.135499000549316, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8671738505363464, + "num_tokens": 365027050.0, + "step": 9567 + }, + { + "epoch": 1.2171479455540009, + "ewc_loss": 0.058959778398275375, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026489075389690697, + "grad_norm": 6.843085765838623, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8627586364746094, + "num_tokens": 365060050.0, + "step": 9568 + }, + { + "epoch": 1.2172751558325912, + "ewc_loss": 0.05932493507862091, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000268542324192822, + "grad_norm": 7.137057304382324, + "learning_rate": 1e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8326326608657837, + "num_tokens": 365098030.0, + "step": 9569 + }, + { + "epoch": 1.2174023661111817, + "ewc_loss": 0.05890928953886032, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000264385889749974, + "grad_norm": 6.779473781585693, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8648254871368408, + "num_tokens": 365141143.0, + "step": 9570 + }, + { + "epoch": 1.2175295763897722, + "ewc_loss": 0.059453047811985016, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026982344570569694, + "grad_norm": 7.130764484405518, + "learning_rate": 1e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8405600786209106, + "num_tokens": 365174204.0, + "step": 9571 + }, + { + "epoch": 1.2176567866683627, + "ewc_loss": 0.05902734398841858, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002655664284247905, + "grad_norm": 6.917591094970703, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8629338145256042, + "num_tokens": 365207170.0, + "step": 9572 + }, + { + "epoch": 1.2177839969469533, + "ewc_loss": 0.05929027497768402, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002681956975720823, + "grad_norm": 6.928533554077148, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8652184009552002, + "num_tokens": 365248926.0, + "step": 9573 + }, + { + "epoch": 1.2179112072255438, + "ewc_loss": 0.05915938317775726, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026688678190112114, + "grad_norm": 6.9747443199157715, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8657130002975464, + "num_tokens": 365283588.0, + "step": 9574 + }, + { + "epoch": 1.2180384175041343, + "ewc_loss": 0.05903672054409981, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026566017186269164, + "grad_norm": 6.886567115783691, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8600497841835022, + "num_tokens": 365324869.0, + "step": 9575 + }, + { + "epoch": 1.2181656277827249, + "ewc_loss": 0.059520527720451355, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002680568431969732, + "grad_norm": 6.959317207336426, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8599312901496887, + "num_tokens": 365366832.0, + "step": 9576 + }, + { + "epoch": 1.2182928380613154, + "ewc_loss": 0.059363953769207, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026649111532606184, + "grad_norm": 6.928952693939209, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8596451282501221, + "num_tokens": 365401148.0, + "step": 9577 + }, + { + "epoch": 1.218420048339906, + "ewc_loss": 0.05943877249956131, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002672392874956131, + "grad_norm": 6.892591953277588, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8647648692131042, + "num_tokens": 365442303.0, + "step": 9578 + }, + { + "epoch": 1.2185472586184964, + "ewc_loss": 0.05927286669611931, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026802162756212056, + "grad_norm": 7.002078056335449, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.874809980392456, + "num_tokens": 365477877.0, + "step": 9579 + }, + { + "epoch": 1.218674468897087, + "ewc_loss": 0.05910588055849075, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000266351766185835, + "grad_norm": 6.870091915130615, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8515564799308777, + "num_tokens": 365520648.0, + "step": 9580 + }, + { + "epoch": 1.2188016791756775, + "ewc_loss": 0.05925988778471947, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000267891853582114, + "grad_norm": 6.9694037437438965, + "learning_rate": 1e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8451764583587646, + "num_tokens": 365555129.0, + "step": 9581 + }, + { + "epoch": 1.218928889454268, + "ewc_loss": 0.05918598547577858, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026715282001532614, + "grad_norm": 6.931222438812256, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8675190806388855, + "num_tokens": 365594368.0, + "step": 9582 + }, + { + "epoch": 1.2190560997328583, + "ewc_loss": 0.05923999100923538, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002676928706932813, + "grad_norm": 6.927035808563232, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8621393442153931, + "num_tokens": 365634763.0, + "step": 9583 + }, + { + "epoch": 1.2191833100114489, + "ewc_loss": 0.05922367796301842, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002675297437235713, + "grad_norm": 6.8823771476745605, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8781754374504089, + "num_tokens": 365675951.0, + "step": 9584 + }, + { + "epoch": 1.2193105202900394, + "ewc_loss": 0.059371910989284515, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002665706560947001, + "grad_norm": 6.929524898529053, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8709208965301514, + "num_tokens": 365710348.0, + "step": 9585 + }, + { + "epoch": 1.21943773056863, + "ewc_loss": 0.059277016669511795, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026806312962435186, + "grad_norm": 6.905824661254883, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8652574419975281, + "num_tokens": 365745166.0, + "step": 9586 + }, + { + "epoch": 1.2195649408472204, + "ewc_loss": 0.05948982387781143, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026774979778565466, + "grad_norm": 6.991284370422363, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.875545859336853, + "num_tokens": 365780122.0, + "step": 9587 + }, + { + "epoch": 1.219692151125811, + "ewc_loss": 0.059397220611572266, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026682374300435185, + "grad_norm": 6.911606788635254, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8610182404518127, + "num_tokens": 365816326.0, + "step": 9588 + }, + { + "epoch": 1.2198193614044015, + "ewc_loss": 0.059205226600170135, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002673452254384756, + "grad_norm": 6.913680553436279, + "learning_rate": 1e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8364931344985962, + "num_tokens": 365858938.0, + "step": 9589 + }, + { + "epoch": 1.219946571682992, + "ewc_loss": 0.05941092222929001, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002669607929419726, + "grad_norm": 6.918622016906738, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.85212242603302, + "num_tokens": 365893892.0, + "step": 9590 + }, + { + "epoch": 1.2200737819615826, + "ewc_loss": 0.059458792209625244, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002674394636414945, + "grad_norm": 6.906301975250244, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8529975414276123, + "num_tokens": 365938787.0, + "step": 9591 + }, + { + "epoch": 1.220200992240173, + "ewc_loss": 0.05945388972759247, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002673904527910054, + "grad_norm": 6.930055618286133, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8617256879806519, + "num_tokens": 365972664.0, + "step": 9592 + }, + { + "epoch": 1.2203282025187634, + "ewc_loss": 0.059466395527124405, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026751551195047796, + "grad_norm": 6.882217884063721, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8686122298240662, + "num_tokens": 366013367.0, + "step": 9593 + }, + { + "epoch": 1.220455412797354, + "ewc_loss": 0.059606634080410004, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026891790912486613, + "grad_norm": 6.911706447601318, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8689866065979004, + "num_tokens": 366051637.0, + "step": 9594 + }, + { + "epoch": 1.2205826230759445, + "ewc_loss": 0.05951064079999924, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002679579774849117, + "grad_norm": 6.894326686859131, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8589038252830505, + "num_tokens": 366092713.0, + "step": 9595 + }, + { + "epoch": 1.220709833354535, + "ewc_loss": 0.05959538370370865, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026880542282015085, + "grad_norm": 6.914963722229004, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8630771636962891, + "num_tokens": 366131594.0, + "step": 9596 + }, + { + "epoch": 1.2208370436331255, + "ewc_loss": 0.05957069993019104, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002685585350263864, + "grad_norm": 6.958922386169434, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8576372861862183, + "num_tokens": 366170871.0, + "step": 9597 + }, + { + "epoch": 1.220964253911716, + "ewc_loss": 0.05956553667783737, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026850690483115613, + "grad_norm": 6.8165435791015625, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8513921499252319, + "num_tokens": 366212974.0, + "step": 9598 + }, + { + "epoch": 1.2210914641903066, + "ewc_loss": 0.05969769135117531, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026982848066836596, + "grad_norm": 6.982143402099609, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8545601963996887, + "num_tokens": 366255904.0, + "step": 9599 + }, + { + "epoch": 1.221218674468897, + "ewc_loss": 0.05954058840870857, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026825745590031147, + "grad_norm": 6.9470696449279785, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8682841062545776, + "num_tokens": 366288887.0, + "step": 9600 + }, + { + "epoch": 1.2213458847474876, + "ewc_loss": 0.059381403028964996, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000269107025815174, + "grad_norm": 6.862841606140137, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8763301372528076, + "num_tokens": 366328796.0, + "step": 9601 + }, + { + "epoch": 1.2214730950260781, + "ewc_loss": 0.059491947293281555, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002702124183997512, + "grad_norm": 6.925940990447998, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8596220016479492, + "num_tokens": 366373093.0, + "step": 9602 + }, + { + "epoch": 1.2216003053046687, + "ewc_loss": 0.059378720819950104, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002690801920834929, + "grad_norm": 6.8540215492248535, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8715641498565674, + "num_tokens": 366414353.0, + "step": 9603 + }, + { + "epoch": 1.2217275155832592, + "ewc_loss": 0.059563539922237396, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027092837262898684, + "grad_norm": 6.95979118347168, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.876480221748352, + "num_tokens": 366450683.0, + "step": 9604 + }, + { + "epoch": 1.2218547258618497, + "ewc_loss": 0.05941104143857956, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002694033901207149, + "grad_norm": 6.912278175354004, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8678351640701294, + "num_tokens": 366487158.0, + "step": 9605 + }, + { + "epoch": 1.2219819361404403, + "ewc_loss": 0.0595935583114624, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002712285495363176, + "grad_norm": 6.9390482902526855, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8643125891685486, + "num_tokens": 366529984.0, + "step": 9606 + }, + { + "epoch": 1.2221091464190306, + "ewc_loss": 0.059443507343530655, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026972804334945977, + "grad_norm": 6.948051929473877, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8660427331924438, + "num_tokens": 366564100.0, + "step": 9607 + }, + { + "epoch": 1.222236356697621, + "ewc_loss": 0.059479884803295135, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027009184123016894, + "grad_norm": 6.909114837646484, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8703416585922241, + "num_tokens": 366603752.0, + "step": 9608 + }, + { + "epoch": 1.2223635669762116, + "ewc_loss": 0.05949132889509201, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002702062774915248, + "grad_norm": 6.8674845695495605, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8647346496582031, + "num_tokens": 366642170.0, + "step": 9609 + }, + { + "epoch": 1.2224907772548022, + "ewc_loss": 0.05953230708837509, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027061603032052517, + "grad_norm": 6.916083812713623, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8484525084495544, + "num_tokens": 366684038.0, + "step": 9610 + }, + { + "epoch": 1.2226179875333927, + "ewc_loss": 0.059507086873054504, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027036384562961757, + "grad_norm": 6.909337520599365, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8598872423171997, + "num_tokens": 366717832.0, + "step": 9611 + }, + { + "epoch": 1.2227451978119832, + "ewc_loss": 0.059496402740478516, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027025697636418045, + "grad_norm": 6.916594505310059, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.856730580329895, + "num_tokens": 366757999.0, + "step": 9612 + }, + { + "epoch": 1.2228724080905737, + "ewc_loss": 0.05947253108024597, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027001829585060477, + "grad_norm": 6.863956451416016, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8716450929641724, + "num_tokens": 366798533.0, + "step": 9613 + }, + { + "epoch": 1.2229996183691643, + "ewc_loss": 0.059565506875514984, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002709480468183756, + "grad_norm": 6.888699531555176, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8604531288146973, + "num_tokens": 366843066.0, + "step": 9614 + }, + { + "epoch": 1.2231268286477548, + "ewc_loss": 0.05960952118039131, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027138818404637277, + "grad_norm": 6.890874862670898, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.878368079662323, + "num_tokens": 366883190.0, + "step": 9615 + }, + { + "epoch": 1.2232540389263453, + "ewc_loss": 0.05947369337081909, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000270029908278957, + "grad_norm": 6.896331310272217, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.865653932094574, + "num_tokens": 366920642.0, + "step": 9616 + }, + { + "epoch": 1.2233812492049359, + "ewc_loss": 0.05956215411424637, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027091451920568943, + "grad_norm": 6.915981292724609, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8643231987953186, + "num_tokens": 366954468.0, + "step": 9617 + }, + { + "epoch": 1.2235084594835262, + "ewc_loss": 0.05951894819736481, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002704824728425592, + "grad_norm": 6.930724620819092, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8477169275283813, + "num_tokens": 366991170.0, + "step": 9618 + }, + { + "epoch": 1.2236356697621167, + "ewc_loss": 0.05956427752971649, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002709357358980924, + "grad_norm": 6.882405757904053, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8671396970748901, + "num_tokens": 367031833.0, + "step": 9619 + }, + { + "epoch": 1.2237628800407072, + "ewc_loss": 0.0595494881272316, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027078783023171127, + "grad_norm": 6.901440143585205, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8576865792274475, + "num_tokens": 367074709.0, + "step": 9620 + }, + { + "epoch": 1.2238900903192977, + "ewc_loss": 0.05953899025917053, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027068285271525383, + "grad_norm": 6.941715717315674, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8605588674545288, + "num_tokens": 367106820.0, + "step": 9621 + }, + { + "epoch": 1.2240173005978883, + "ewc_loss": 0.0595211461186409, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027050444623455405, + "grad_norm": 6.919813632965088, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.877632737159729, + "num_tokens": 367142298.0, + "step": 9622 + }, + { + "epoch": 1.2241445108764788, + "ewc_loss": 0.05949115380644798, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027020450215786695, + "grad_norm": 6.931562423706055, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.853096604347229, + "num_tokens": 367179957.0, + "step": 9623 + }, + { + "epoch": 1.2242717211550693, + "ewc_loss": 0.05950665473937988, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027035953826271, + "grad_norm": 6.873352527618408, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8648191690444946, + "num_tokens": 367226800.0, + "step": 9624 + }, + { + "epoch": 1.2243989314336599, + "ewc_loss": 0.05954897031188011, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002707826788537204, + "grad_norm": 6.931230545043945, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8672523498535156, + "num_tokens": 367257168.0, + "step": 9625 + }, + { + "epoch": 1.2245261417122504, + "ewc_loss": 0.059406861662864685, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026936159702017903, + "grad_norm": 6.869144439697266, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8637147545814514, + "num_tokens": 367295254.0, + "step": 9626 + }, + { + "epoch": 1.224653351990841, + "ewc_loss": 0.05957932025194168, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027108617359772325, + "grad_norm": 6.886004447937012, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8499943017959595, + "num_tokens": 367335814.0, + "step": 9627 + }, + { + "epoch": 1.2247805622694314, + "ewc_loss": 0.059506215155124664, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000270355143584311, + "grad_norm": 6.927040100097656, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8595936298370361, + "num_tokens": 367377592.0, + "step": 9628 + }, + { + "epoch": 1.224907772548022, + "ewc_loss": 0.05948661267757416, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027015910018235445, + "grad_norm": 6.966846466064453, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8546081781387329, + "num_tokens": 367408069.0, + "step": 9629 + }, + { + "epoch": 1.2250349828266125, + "ewc_loss": 0.05950893461704254, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002703823265619576, + "grad_norm": 6.886795997619629, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8647967576980591, + "num_tokens": 367444787.0, + "step": 9630 + }, + { + "epoch": 1.225162193105203, + "ewc_loss": 0.0595255009829998, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002705479855649173, + "grad_norm": 6.922159194946289, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8476304411888123, + "num_tokens": 367486848.0, + "step": 9631 + }, + { + "epoch": 1.2252894033837933, + "ewc_loss": 0.05948328226804733, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002701257762964815, + "grad_norm": 6.885951995849609, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8732983469963074, + "num_tokens": 367530456.0, + "step": 9632 + }, + { + "epoch": 1.2254166136623839, + "ewc_loss": 0.05950026214122772, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027029559714719653, + "grad_norm": 6.884262561798096, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.86659836769104, + "num_tokens": 367566513.0, + "step": 9633 + }, + { + "epoch": 1.2255438239409744, + "ewc_loss": 0.05944143980741501, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002697073796298355, + "grad_norm": 6.913356304168701, + "learning_rate": 1e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8408730626106262, + "num_tokens": 367607757.0, + "step": 9634 + }, + { + "epoch": 1.225671034219565, + "ewc_loss": 0.05957195907831192, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027101257001049817, + "grad_norm": 6.886897563934326, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8655622005462646, + "num_tokens": 367645525.0, + "step": 9635 + }, + { + "epoch": 1.2257982444981554, + "ewc_loss": 0.0595305971801281, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027059894637204707, + "grad_norm": 6.938319206237793, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8717670440673828, + "num_tokens": 367680840.0, + "step": 9636 + }, + { + "epoch": 1.225925454776746, + "ewc_loss": 0.059450335800647736, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002697963500395417, + "grad_norm": 6.860337734222412, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8720012307167053, + "num_tokens": 367717444.0, + "step": 9637 + }, + { + "epoch": 1.2260526650553365, + "ewc_loss": 0.05964375287294388, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002717305033002049, + "grad_norm": 6.942089080810547, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.84295654296875, + "num_tokens": 367756548.0, + "step": 9638 + }, + { + "epoch": 1.226179875333927, + "ewc_loss": 0.05968198925256729, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002696714655030519, + "grad_norm": 6.852470397949219, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8575207591056824, + "num_tokens": 367798341.0, + "step": 9639 + }, + { + "epoch": 1.2263070856125176, + "ewc_loss": 0.05970092862844467, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027230227715335786, + "grad_norm": 6.984521865844727, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8699246644973755, + "num_tokens": 367835087.0, + "step": 9640 + }, + { + "epoch": 1.226434295891108, + "ewc_loss": 0.05943455547094345, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002696385490708053, + "grad_norm": 6.8714704513549805, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8641597032546997, + "num_tokens": 367873816.0, + "step": 9641 + }, + { + "epoch": 1.2265615061696984, + "ewc_loss": 0.059546925127506256, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027076221886090934, + "grad_norm": 6.960819244384766, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8578916788101196, + "num_tokens": 367913621.0, + "step": 9642 + }, + { + "epoch": 1.226688716448289, + "ewc_loss": 0.059486061334609985, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027015357045456767, + "grad_norm": 6.927387237548828, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8534716963768005, + "num_tokens": 367946881.0, + "step": 9643 + }, + { + "epoch": 1.2268159267268794, + "ewc_loss": 0.05948992446064949, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002701922203414142, + "grad_norm": 6.916688442230225, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8668069839477539, + "num_tokens": 367980806.0, + "step": 9644 + }, + { + "epoch": 1.22694313700547, + "ewc_loss": 0.059479571878910065, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002700886980164796, + "grad_norm": 6.863154411315918, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8696469664573669, + "num_tokens": 368016041.0, + "step": 9645 + }, + { + "epoch": 1.2270703472840605, + "ewc_loss": 0.059471357613801956, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027000653790310025, + "grad_norm": 6.910473823547363, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8608614206314087, + "num_tokens": 368052331.0, + "step": 9646 + }, + { + "epoch": 1.227197557562651, + "ewc_loss": 0.05946025624871254, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026989553589373827, + "grad_norm": 6.871293544769287, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8709405064582825, + "num_tokens": 368091458.0, + "step": 9647 + }, + { + "epoch": 1.2273247678412416, + "ewc_loss": 0.059757694602012634, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027042851434089243, + "grad_norm": 6.915594577789307, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8508870601654053, + "num_tokens": 368132433.0, + "step": 9648 + }, + { + "epoch": 1.227451978119832, + "ewc_loss": 0.059689633548259735, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002697479212656617, + "grad_norm": 6.88469934463501, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8632567524909973, + "num_tokens": 368171030.0, + "step": 9649 + }, + { + "epoch": 1.2275791883984226, + "ewc_loss": 0.05979031324386597, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002707547100726515, + "grad_norm": 6.901203632354736, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.851378321647644, + "num_tokens": 368211555.0, + "step": 9650 + }, + { + "epoch": 1.2277063986770131, + "ewc_loss": 0.0598178431391716, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002710299741011113, + "grad_norm": 6.896794319152832, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8694251775741577, + "num_tokens": 368252076.0, + "step": 9651 + }, + { + "epoch": 1.2278336089556037, + "ewc_loss": 0.05980515852570534, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027090313960798085, + "grad_norm": 6.946516990661621, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8548849821090698, + "num_tokens": 368291265.0, + "step": 9652 + }, + { + "epoch": 1.2279608192341942, + "ewc_loss": 0.0597480833530426, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002703324134927243, + "grad_norm": 6.880034446716309, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8561396598815918, + "num_tokens": 368332178.0, + "step": 9653 + }, + { + "epoch": 1.2280880295127847, + "ewc_loss": 0.059799760580062866, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002708491520024836, + "grad_norm": 6.961116790771484, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8608586192131042, + "num_tokens": 368374103.0, + "step": 9654 + }, + { + "epoch": 1.2282152397913753, + "ewc_loss": 0.059686511754989624, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002697166637517512, + "grad_norm": 6.8674139976501465, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8651685118675232, + "num_tokens": 368411928.0, + "step": 9655 + }, + { + "epoch": 1.2283424500699656, + "ewc_loss": 0.05983126163482666, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002711641718633473, + "grad_norm": 6.975127220153809, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8377097845077515, + "num_tokens": 368449200.0, + "step": 9656 + }, + { + "epoch": 1.228469660348556, + "ewc_loss": 0.05962185561656952, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00026907012215815485, + "grad_norm": 6.931395053863525, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8440635204315186, + "num_tokens": 368489935.0, + "step": 9657 + }, + { + "epoch": 1.2285968706271466, + "ewc_loss": 0.05981391668319702, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002709907421376556, + "grad_norm": 6.981070041656494, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8744253516197205, + "num_tokens": 368522227.0, + "step": 9658 + }, + { + "epoch": 1.2287240809057371, + "ewc_loss": 0.05964449793100357, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002692965208552778, + "grad_norm": 7.17234992980957, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8612355589866638, + "num_tokens": 368555084.0, + "step": 9659 + }, + { + "epoch": 1.2288512911843277, + "ewc_loss": 0.0595308393239975, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002681599580682814, + "grad_norm": 6.875618934631348, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8581399917602539, + "num_tokens": 368591845.0, + "step": 9660 + }, + { + "epoch": 1.2289785014629182, + "ewc_loss": 0.05974803492426872, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027033191872760653, + "grad_norm": 6.948665618896484, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8522441387176514, + "num_tokens": 368637841.0, + "step": 9661 + }, + { + "epoch": 1.2291057117415087, + "ewc_loss": 0.059596844017505646, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002688200038392097, + "grad_norm": 6.899458885192871, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8770705461502075, + "num_tokens": 368677470.0, + "step": 9662 + }, + { + "epoch": 1.2292329220200993, + "ewc_loss": 0.05959983542561531, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002688499225769192, + "grad_norm": 6.898356914520264, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8759064078330994, + "num_tokens": 368718412.0, + "step": 9663 + }, + { + "epoch": 1.2293601322986898, + "ewc_loss": 0.0596541203558445, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002693927672225982, + "grad_norm": 6.89694881439209, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8510349988937378, + "num_tokens": 368759267.0, + "step": 9664 + }, + { + "epoch": 1.2294873425772803, + "ewc_loss": 0.05973568558692932, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002702084311749786, + "grad_norm": 6.937868595123291, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8521893620491028, + "num_tokens": 368797586.0, + "step": 9665 + }, + { + "epoch": 1.2296145528558708, + "ewc_loss": 0.059643082320690155, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002692823763936758, + "grad_norm": 6.866971492767334, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8540307879447937, + "num_tokens": 368833801.0, + "step": 9666 + }, + { + "epoch": 1.2297417631344612, + "ewc_loss": 0.05978713929653168, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027072292868979275, + "grad_norm": 6.952474117279053, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8464979529380798, + "num_tokens": 368878513.0, + "step": 9667 + }, + { + "epoch": 1.2298689734130517, + "ewc_loss": 0.05971600487828255, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002700116019695997, + "grad_norm": 6.941328048706055, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8549609780311584, + "num_tokens": 368917815.0, + "step": 9668 + }, + { + "epoch": 1.2299961836916422, + "ewc_loss": 0.05976410210132599, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027049260097555816, + "grad_norm": 6.942069053649902, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8523582220077515, + "num_tokens": 368954971.0, + "step": 9669 + }, + { + "epoch": 1.2301233939702327, + "ewc_loss": 0.0597442202270031, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027029376360587776, + "grad_norm": 6.9006667137146, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8676021695137024, + "num_tokens": 368996212.0, + "step": 9670 + }, + { + "epoch": 1.2302506042488233, + "ewc_loss": 0.05979382246732712, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027078978018835187, + "grad_norm": 6.931988716125488, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8762837052345276, + "num_tokens": 369032186.0, + "step": 9671 + }, + { + "epoch": 1.2303778145274138, + "ewc_loss": 0.059730660170316696, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027015816885977983, + "grad_norm": 6.9062113761901855, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8670972585678101, + "num_tokens": 369077565.0, + "step": 9672 + }, + { + "epoch": 1.2305050248060043, + "ewc_loss": 0.05952053517103195, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027049833443015814, + "grad_norm": 6.970531463623047, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.86902916431427, + "num_tokens": 369111935.0, + "step": 9673 + }, + { + "epoch": 1.2306322350845948, + "ewc_loss": 0.05946147069334984, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026990767219103873, + "grad_norm": 6.890175819396973, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8593262434005737, + "num_tokens": 369154374.0, + "step": 9674 + }, + { + "epoch": 1.2307594453631854, + "ewc_loss": 0.05949652940034866, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027025825693272054, + "grad_norm": 6.9110307693481445, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8537014722824097, + "num_tokens": 369198557.0, + "step": 9675 + }, + { + "epoch": 1.230886655641776, + "ewc_loss": 0.05950085446238518, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027030150522477925, + "grad_norm": 6.920276641845703, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8448026180267334, + "num_tokens": 369237756.0, + "step": 9676 + }, + { + "epoch": 1.2310138659203664, + "ewc_loss": 0.059497587382793427, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002702688507270068, + "grad_norm": 6.954562664031982, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8484158515930176, + "num_tokens": 369270155.0, + "step": 9677 + }, + { + "epoch": 1.231141076198957, + "ewc_loss": 0.05945226550102234, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026981564587913454, + "grad_norm": 6.9188385009765625, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8558668494224548, + "num_tokens": 369310870.0, + "step": 9678 + }, + { + "epoch": 1.2312682864775475, + "ewc_loss": 0.05951760336756706, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027046899776905775, + "grad_norm": 6.971637725830078, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8451454639434814, + "num_tokens": 369344679.0, + "step": 9679 + }, + { + "epoch": 1.231395496756138, + "ewc_loss": 0.059394367039203644, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002692366251721978, + "grad_norm": 6.920431613922119, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8609644174575806, + "num_tokens": 369380388.0, + "step": 9680 + }, + { + "epoch": 1.2315227070347283, + "ewc_loss": 0.0594201385974884, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026949436869472265, + "grad_norm": 6.985769748687744, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8733997344970703, + "num_tokens": 369422327.0, + "step": 9681 + }, + { + "epoch": 1.2316499173133189, + "ewc_loss": 0.05933668836951256, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026865984546020627, + "grad_norm": 6.863736629486084, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8675632476806641, + "num_tokens": 369457160.0, + "step": 9682 + }, + { + "epoch": 1.2317771275919094, + "ewc_loss": 0.05952189490199089, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027051192591898143, + "grad_norm": 6.9524078369140625, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8623000383377075, + "num_tokens": 369497883.0, + "step": 9683 + }, + { + "epoch": 1.2319043378705, + "ewc_loss": 0.05936882644891739, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00026898123905994, + "grad_norm": 6.8118205070495605, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8915494084358215, + "num_tokens": 369540014.0, + "step": 9684 + }, + { + "epoch": 1.2320315481490904, + "ewc_loss": 0.0595843642950058, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027113661053590477, + "grad_norm": 6.953938007354736, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8617897033691406, + "num_tokens": 369580424.0, + "step": 9685 + }, + { + "epoch": 1.232158758427681, + "ewc_loss": 0.059496887028217316, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002702618658076972, + "grad_norm": 6.846060276031494, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.85804682970047, + "num_tokens": 369623615.0, + "step": 9686 + }, + { + "epoch": 1.2322859687062715, + "ewc_loss": 0.05962171405553818, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002715101290959865, + "grad_norm": 7.004554271697998, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8495681881904602, + "num_tokens": 369658408.0, + "step": 9687 + }, + { + "epoch": 1.232413178984862, + "ewc_loss": 0.05947268381714821, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002700198092497885, + "grad_norm": 6.844377517700195, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8574879765510559, + "num_tokens": 369696311.0, + "step": 9688 + }, + { + "epoch": 1.2325403892634526, + "ewc_loss": 0.059691235423088074, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002722053322941065, + "grad_norm": 7.0016279220581055, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8487753868103027, + "num_tokens": 369737473.0, + "step": 9689 + }, + { + "epoch": 1.232667599542043, + "ewc_loss": 0.0595589242875576, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027088221395388246, + "grad_norm": 6.893367290496826, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8588693141937256, + "num_tokens": 369770151.0, + "step": 9690 + }, + { + "epoch": 1.2327948098206334, + "ewc_loss": 0.05972587317228317, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027255169698037207, + "grad_norm": 6.904282569885254, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.879572331905365, + "num_tokens": 369813503.0, + "step": 9691 + }, + { + "epoch": 1.232922020099224, + "ewc_loss": 0.0596008375287056, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027130136732012033, + "grad_norm": 6.960646629333496, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8639798164367676, + "num_tokens": 369848768.0, + "step": 9692 + }, + { + "epoch": 1.2330492303778144, + "ewc_loss": 0.05960064381361008, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027129941736347973, + "grad_norm": 6.9334797859191895, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8651890158653259, + "num_tokens": 369892510.0, + "step": 9693 + }, + { + "epoch": 1.233176440656405, + "ewc_loss": 0.0595778152346611, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027107109781354666, + "grad_norm": 6.9076008796691895, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8695000410079956, + "num_tokens": 369929638.0, + "step": 9694 + }, + { + "epoch": 1.2333036509349955, + "ewc_loss": 0.05956357717514038, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002709287218749523, + "grad_norm": 6.879560470581055, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8822512626647949, + "num_tokens": 369971201.0, + "step": 9695 + }, + { + "epoch": 1.233430861213586, + "ewc_loss": 0.059643279761075974, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002717257593758404, + "grad_norm": 6.957451343536377, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8565025925636292, + "num_tokens": 370011713.0, + "step": 9696 + }, + { + "epoch": 1.2335580714921766, + "ewc_loss": 0.05951765924692154, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027046955074183643, + "grad_norm": 6.917131423950195, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8407330513000488, + "num_tokens": 370042647.0, + "step": 9697 + }, + { + "epoch": 1.233685281770767, + "ewc_loss": 0.05965755879878998, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002718685718718916, + "grad_norm": 6.965902328491211, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8605244159698486, + "num_tokens": 370083976.0, + "step": 9698 + }, + { + "epoch": 1.2338124920493576, + "ewc_loss": 0.05960402637720108, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027133323601447046, + "grad_norm": 6.92887544631958, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.858639657497406, + "num_tokens": 370126435.0, + "step": 9699 + }, + { + "epoch": 1.2339397023279481, + "ewc_loss": 0.05958889424800873, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002711819251999259, + "grad_norm": 6.933838367462158, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.870168924331665, + "num_tokens": 370165038.0, + "step": 9700 + }, + { + "epoch": 1.2340669126065387, + "ewc_loss": 0.0596420019865036, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002717129827942699, + "grad_norm": 6.907609462738037, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.853278636932373, + "num_tokens": 370203438.0, + "step": 9701 + }, + { + "epoch": 1.2341941228851292, + "ewc_loss": 0.059663571417331696, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027192870038561523, + "grad_norm": 7.010097980499268, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8767626881599426, + "num_tokens": 370235144.0, + "step": 9702 + }, + { + "epoch": 1.2343213331637197, + "ewc_loss": 0.05955230072140694, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027081597363576293, + "grad_norm": 6.918586730957031, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8570510149002075, + "num_tokens": 370278936.0, + "step": 9703 + }, + { + "epoch": 1.2344485434423103, + "ewc_loss": 0.05990738794207573, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002719254407566041, + "grad_norm": 6.951883792877197, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8654592633247375, + "num_tokens": 370312595.0, + "step": 9704 + }, + { + "epoch": 1.2345757537209006, + "ewc_loss": 0.059780798852443695, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.000270659540547058, + "grad_norm": 6.912596702575684, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8589686751365662, + "num_tokens": 370351697.0, + "step": 9705 + }, + { + "epoch": 1.234702963999491, + "ewc_loss": 0.0599033422768116, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002718849864322692, + "grad_norm": 6.948879718780518, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8759284019470215, + "num_tokens": 370391264.0, + "step": 9706 + }, + { + "epoch": 1.2348301742780816, + "ewc_loss": 0.05965877324342728, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002718806790653616, + "grad_norm": 6.962624549865723, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8700438141822815, + "num_tokens": 370425248.0, + "step": 9707 + }, + { + "epoch": 1.2349573845566721, + "ewc_loss": 0.059620581567287445, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002714988077059388, + "grad_norm": 6.937154769897461, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8405164480209351, + "num_tokens": 370462670.0, + "step": 9708 + }, + { + "epoch": 1.2350845948352627, + "ewc_loss": 0.05963999778032303, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002716929593589157, + "grad_norm": 6.932589054107666, + "learning_rate": 1e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8424088954925537, + "num_tokens": 370506126.0, + "step": 9709 + }, + { + "epoch": 1.2352118051138532, + "ewc_loss": 0.05961829423904419, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002714759320951998, + "grad_norm": 6.94198751449585, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.871663510799408, + "num_tokens": 370546895.0, + "step": 9710 + }, + { + "epoch": 1.2353390153924437, + "ewc_loss": 0.059607554227113724, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000271368509856984, + "grad_norm": 6.907156944274902, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.866378664970398, + "num_tokens": 370590645.0, + "step": 9711 + }, + { + "epoch": 1.2354662256710343, + "ewc_loss": 0.05962040275335312, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027149697416462004, + "grad_norm": 6.950802803039551, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8649078607559204, + "num_tokens": 370628614.0, + "step": 9712 + }, + { + "epoch": 1.2355934359496248, + "ewc_loss": 0.059503231197595596, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002703252830542624, + "grad_norm": 6.937503337860107, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8647432327270508, + "num_tokens": 370666082.0, + "step": 9713 + }, + { + "epoch": 1.2357206462282153, + "ewc_loss": 0.05991034954786301, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002719550393521786, + "grad_norm": 6.942174911499023, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8473060131072998, + "num_tokens": 370705780.0, + "step": 9714 + }, + { + "epoch": 1.2358478565068058, + "ewc_loss": 0.059659555554389954, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002718885079957545, + "grad_norm": 6.980067729949951, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8667119145393372, + "num_tokens": 370742550.0, + "step": 9715 + }, + { + "epoch": 1.2359750667853961, + "ewc_loss": 0.0595790259540081, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002710832341108471, + "grad_norm": 13.347982406616211, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8663996458053589, + "num_tokens": 370784406.0, + "step": 9716 + }, + { + "epoch": 1.2361022770639867, + "ewc_loss": 0.06922513246536255, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00036266143433749676, + "grad_norm": 8.080697059631348, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8582262396812439, + "num_tokens": 370820911.0, + "step": 9717 + }, + { + "epoch": 1.2362294873425772, + "ewc_loss": 0.058561742305755615, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002560275897849351, + "grad_norm": 6.64911413192749, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8721796274185181, + "num_tokens": 370857427.0, + "step": 9718 + }, + { + "epoch": 1.2363566976211677, + "ewc_loss": 0.06173869967460632, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00029023856041021645, + "grad_norm": 7.540261268615723, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8523659706115723, + "num_tokens": 370897989.0, + "step": 9719 + }, + { + "epoch": 1.2364839078997583, + "ewc_loss": 0.06055900454521179, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027600020985119045, + "grad_norm": 7.179096221923828, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8501952886581421, + "num_tokens": 370932672.0, + "step": 9720 + }, + { + "epoch": 1.2366111181783488, + "ewc_loss": 0.060156021267175674, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002768531849142164, + "grad_norm": 7.144538879394531, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8529217839241028, + "num_tokens": 370971027.0, + "step": 9721 + }, + { + "epoch": 1.2367383284569393, + "ewc_loss": 0.0597870759665966, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002731637214310467, + "grad_norm": 6.872379779815674, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8727614879608154, + "num_tokens": 371014742.0, + "step": 9722 + }, + { + "epoch": 1.2368655387355298, + "ewc_loss": 0.05999665707349777, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002752595173660666, + "grad_norm": 7.064551830291748, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8456858992576599, + "num_tokens": 371052065.0, + "step": 9723 + }, + { + "epoch": 1.2369927490141204, + "ewc_loss": 0.05994948744773865, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002747878315858543, + "grad_norm": 6.989212512969971, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8397470712661743, + "num_tokens": 371093897.0, + "step": 9724 + }, + { + "epoch": 1.237119959292711, + "ewc_loss": 0.059897590428590775, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002742688811849803, + "grad_norm": 6.986440658569336, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.86029452085495, + "num_tokens": 371132390.0, + "step": 9725 + }, + { + "epoch": 1.2372471695713014, + "ewc_loss": 0.05988422408699989, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002741352072916925, + "grad_norm": 7.042824745178223, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8725438117980957, + "num_tokens": 371170468.0, + "step": 9726 + }, + { + "epoch": 1.237374379849892, + "ewc_loss": 0.059823013842105865, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027352312463335693, + "grad_norm": 6.942821979522705, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8586918115615845, + "num_tokens": 371209323.0, + "step": 9727 + }, + { + "epoch": 1.2375015901284825, + "ewc_loss": 0.059951506555080414, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027480802964419127, + "grad_norm": 7.000071048736572, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8625749349594116, + "num_tokens": 371246526.0, + "step": 9728 + }, + { + "epoch": 1.237628800407073, + "ewc_loss": 0.059799809008836746, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027329105068929493, + "grad_norm": 6.944291114807129, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8650989532470703, + "num_tokens": 371282850.0, + "step": 9729 + }, + { + "epoch": 1.2377560106856633, + "ewc_loss": 0.05993519350886345, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027464490267448127, + "grad_norm": 7.02304744720459, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8702109456062317, + "num_tokens": 371312614.0, + "step": 9730 + }, + { + "epoch": 1.2378832209642538, + "ewc_loss": 0.059769220650196075, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002729851985350251, + "grad_norm": 6.9489617347717285, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8632767200469971, + "num_tokens": 371350170.0, + "step": 9731 + }, + { + "epoch": 1.2380104312428444, + "ewc_loss": 0.05987174063920975, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00027401038096286356, + "grad_norm": 6.959819316864014, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8697341084480286, + "num_tokens": 371389295.0, + "step": 9732 + }, + { + "epoch": 1.238137641521435, + "ewc_loss": 0.05976059287786484, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002728989056777209, + "grad_norm": 6.958188056945801, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8622164726257324, + "num_tokens": 371430579.0, + "step": 9733 + }, + { + "epoch": 1.2382648518000254, + "ewc_loss": 0.060080427676439285, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027365583810023963, + "grad_norm": 7.012689590454102, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8525983095169067, + "num_tokens": 371465087.0, + "step": 9734 + }, + { + "epoch": 1.238392062078616, + "ewc_loss": 0.059964556246995926, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002724971272982657, + "grad_norm": 6.876040458679199, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8700641393661499, + "num_tokens": 371503549.0, + "step": 9735 + }, + { + "epoch": 1.2385192723572065, + "ewc_loss": 0.0601680725812912, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002745322708506137, + "grad_norm": 7.008859157562256, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8809245824813843, + "num_tokens": 371542953.0, + "step": 9736 + }, + { + "epoch": 1.238646482635797, + "ewc_loss": 0.06014632433652878, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000271873374003917, + "grad_norm": 6.9233880043029785, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8661847114562988, + "num_tokens": 371582541.0, + "step": 9737 + }, + { + "epoch": 1.2387736929143875, + "ewc_loss": 0.060102708637714386, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027387862792238593, + "grad_norm": 7.011919021606445, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8722412586212158, + "num_tokens": 371617221.0, + "step": 9738 + }, + { + "epoch": 1.238900903192978, + "ewc_loss": 0.06023585423827171, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002727686951402575, + "grad_norm": 6.900932788848877, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8661782145500183, + "num_tokens": 371660773.0, + "step": 9739 + }, + { + "epoch": 1.2390281134715684, + "ewc_loss": 0.060441117733716965, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027482133009471, + "grad_norm": 6.988378047943115, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8577667474746704, + "num_tokens": 371694763.0, + "step": 9740 + }, + { + "epoch": 1.239155323750159, + "ewc_loss": 0.06017626076936722, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002721727651078254, + "grad_norm": 6.945681571960449, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8616664409637451, + "num_tokens": 371734793.0, + "step": 9741 + }, + { + "epoch": 1.2392825340287494, + "ewc_loss": 0.060335710644721985, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000273767247563228, + "grad_norm": 6.975491523742676, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8582871556282043, + "num_tokens": 371775691.0, + "step": 9742 + }, + { + "epoch": 1.23940974430734, + "ewc_loss": 0.06024037301540375, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002728138933889568, + "grad_norm": 6.897345066070557, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8538583517074585, + "num_tokens": 371817319.0, + "step": 9743 + }, + { + "epoch": 1.2395369545859305, + "ewc_loss": 0.06030238792300224, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027343403780832887, + "grad_norm": 7.028826713562012, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8616386651992798, + "num_tokens": 371847431.0, + "step": 9744 + }, + { + "epoch": 1.239664164864521, + "ewc_loss": 0.05971890687942505, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002724820515140891, + "grad_norm": 6.941501140594482, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8752214908599854, + "num_tokens": 371881788.0, + "step": 9745 + }, + { + "epoch": 1.2397913751431116, + "ewc_loss": 0.060365259647369385, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002740627678576857, + "grad_norm": 7.046550273895264, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8572217226028442, + "num_tokens": 371917388.0, + "step": 9746 + }, + { + "epoch": 1.239918585421702, + "ewc_loss": 0.06017618626356125, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000272172037512064, + "grad_norm": 6.871484756469727, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8711221218109131, + "num_tokens": 371957561.0, + "step": 9747 + }, + { + "epoch": 1.2400457957002926, + "ewc_loss": 0.0603765994310379, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027417612727731466, + "grad_norm": 7.12341833114624, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.854986846446991, + "num_tokens": 371987322.0, + "step": 9748 + }, + { + "epoch": 1.2401730059788831, + "ewc_loss": 0.06022774800658226, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002726876409724355, + "grad_norm": 6.913199424743652, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8645711541175842, + "num_tokens": 372027263.0, + "step": 9749 + }, + { + "epoch": 1.2403002162574737, + "ewc_loss": 0.059852372854948044, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.000273816694971174, + "grad_norm": 7.010265350341797, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8664467334747314, + "num_tokens": 372061441.0, + "step": 9750 + }, + { + "epoch": 1.2404274265360642, + "ewc_loss": 0.06027348339557648, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027314500766806304, + "grad_norm": 6.907407283782959, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8678709268569946, + "num_tokens": 372099085.0, + "step": 9751 + }, + { + "epoch": 1.2405546368146547, + "ewc_loss": 0.060392625629901886, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002743364020716399, + "grad_norm": 6.976043701171875, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8635308742523193, + "num_tokens": 372134106.0, + "step": 9752 + }, + { + "epoch": 1.2406818470932452, + "ewc_loss": 0.060298316180706024, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002733933215495199, + "grad_norm": 6.9631571769714355, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8700293302536011, + "num_tokens": 372164635.0, + "step": 9753 + }, + { + "epoch": 1.2408090573718356, + "ewc_loss": 0.060358159244060516, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002739917254075408, + "grad_norm": 6.939184188842773, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8679900765419006, + "num_tokens": 372204924.0, + "step": 9754 + }, + { + "epoch": 1.240936267650426, + "ewc_loss": 0.06035096198320389, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027391978073865175, + "grad_norm": 7.037729263305664, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.86335289478302, + "num_tokens": 372240939.0, + "step": 9755 + }, + { + "epoch": 1.2410634779290166, + "ewc_loss": 0.06021507829427719, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002725609519984573, + "grad_norm": 6.947879314422607, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8565864562988281, + "num_tokens": 372272355.0, + "step": 9756 + }, + { + "epoch": 1.2411906882076071, + "ewc_loss": 0.06037403643131256, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027415051590651274, + "grad_norm": 6.944201469421387, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8600994348526001, + "num_tokens": 372309566.0, + "step": 9757 + }, + { + "epoch": 1.2413178984861977, + "ewc_loss": 0.060301586985588074, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027342603425495327, + "grad_norm": 6.9288201332092285, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8565162420272827, + "num_tokens": 372347768.0, + "step": 9758 + }, + { + "epoch": 1.2414451087647882, + "ewc_loss": 0.060365431010723114, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027406448498368263, + "grad_norm": 6.968024253845215, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8703356385231018, + "num_tokens": 372381020.0, + "step": 9759 + }, + { + "epoch": 1.2415723190433787, + "ewc_loss": 0.06022613123059273, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027267145924270153, + "grad_norm": 6.908143997192383, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8510986566543579, + "num_tokens": 372422434.0, + "step": 9760 + }, + { + "epoch": 1.2416995293219693, + "ewc_loss": 0.06037333607673645, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002741435309872031, + "grad_norm": 6.976510047912598, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8538811206817627, + "num_tokens": 372459342.0, + "step": 9761 + }, + { + "epoch": 1.2418267396005598, + "ewc_loss": 0.0603194460272789, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002736046153586358, + "grad_norm": 6.940080642700195, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8655776977539062, + "num_tokens": 372496637.0, + "step": 9762 + }, + { + "epoch": 1.2419539498791503, + "ewc_loss": 0.06039143353700638, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027432446950115263, + "grad_norm": 6.961601257324219, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8751480579376221, + "num_tokens": 372532068.0, + "step": 9763 + }, + { + "epoch": 1.2420811601577408, + "ewc_loss": 0.06031983345746994, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027360848616808653, + "grad_norm": 6.946344375610352, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.870602011680603, + "num_tokens": 372565460.0, + "step": 9764 + }, + { + "epoch": 1.2422083704363311, + "ewc_loss": 0.06029514968395233, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002733616274781525, + "grad_norm": 6.940921783447266, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8666717410087585, + "num_tokens": 372605998.0, + "step": 9765 + }, + { + "epoch": 1.2423355807149217, + "ewc_loss": 0.06029254198074341, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002733355504460633, + "grad_norm": 6.9453301429748535, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8526500463485718, + "num_tokens": 372642038.0, + "step": 9766 + }, + { + "epoch": 1.2424627909935122, + "ewc_loss": 0.060303978621959686, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027344992849975824, + "grad_norm": 6.990627288818359, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8572533130645752, + "num_tokens": 372678192.0, + "step": 9767 + }, + { + "epoch": 1.2425900012721027, + "ewc_loss": 0.060257479548454285, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027298496570438147, + "grad_norm": 6.951754093170166, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8593151569366455, + "num_tokens": 372712867.0, + "step": 9768 + }, + { + "epoch": 1.2427172115506933, + "ewc_loss": 0.060151420533657074, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002719243348110467, + "grad_norm": 7.0186614990234375, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8445403575897217, + "num_tokens": 372745790.0, + "step": 9769 + }, + { + "epoch": 1.2428444218292838, + "ewc_loss": 0.06014007329940796, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027181088807992637, + "grad_norm": 6.885887145996094, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8727184534072876, + "num_tokens": 372777634.0, + "step": 9770 + }, + { + "epoch": 1.2429716321078743, + "ewc_loss": 0.060333676636219025, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027374690398573875, + "grad_norm": 6.943044185638428, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8473260402679443, + "num_tokens": 372820527.0, + "step": 9771 + }, + { + "epoch": 1.2430988423864648, + "ewc_loss": 0.060120806097984314, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027161819161847234, + "grad_norm": 6.892580509185791, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8724994659423828, + "num_tokens": 372858479.0, + "step": 9772 + }, + { + "epoch": 1.2432260526650554, + "ewc_loss": 0.060304976999759674, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027345994021743536, + "grad_norm": 6.930918216705322, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8685995936393738, + "num_tokens": 372895219.0, + "step": 9773 + }, + { + "epoch": 1.243353262943646, + "ewc_loss": 0.06008801609277725, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002712903078645468, + "grad_norm": 6.885443210601807, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8747707605361938, + "num_tokens": 372934491.0, + "step": 9774 + }, + { + "epoch": 1.2434804732222364, + "ewc_loss": 0.060232315212488174, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002727333048824221, + "grad_norm": 6.938607215881348, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8538957834243774, + "num_tokens": 372972892.0, + "step": 9775 + }, + { + "epoch": 1.243607683500827, + "ewc_loss": 0.06018931418657303, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027230329578742385, + "grad_norm": 6.9076762199401855, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8658472895622253, + "num_tokens": 373011564.0, + "step": 9776 + }, + { + "epoch": 1.2437348937794175, + "ewc_loss": 0.06029573082923889, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027336744824424386, + "grad_norm": 6.974045753479004, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8651030659675598, + "num_tokens": 373049444.0, + "step": 9777 + }, + { + "epoch": 1.243862104058008, + "ewc_loss": 0.060271311551332474, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027312326710671186, + "grad_norm": 6.914921283721924, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8622590899467468, + "num_tokens": 373085466.0, + "step": 9778 + }, + { + "epoch": 1.2439893143365983, + "ewc_loss": 0.060235366225242615, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027276380569674075, + "grad_norm": 6.910146236419678, + "learning_rate": 1e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8393197059631348, + "num_tokens": 373124712.0, + "step": 9779 + }, + { + "epoch": 1.2441165246151888, + "ewc_loss": 0.0603015273809433, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002734254230745137, + "grad_norm": 6.912191390991211, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8674501180648804, + "num_tokens": 373160008.0, + "step": 9780 + }, + { + "epoch": 1.2442437348937794, + "ewc_loss": 0.06026418134570122, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027305196272209287, + "grad_norm": 6.975758075714111, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8629329800605774, + "num_tokens": 373192110.0, + "step": 9781 + }, + { + "epoch": 1.24437094517237, + "ewc_loss": 0.06025516986846924, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002729618572629988, + "grad_norm": 6.9347825050354, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8643284440040588, + "num_tokens": 373234432.0, + "step": 9782 + }, + { + "epoch": 1.2444981554509604, + "ewc_loss": 0.060255296528339386, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027296310872770846, + "grad_norm": 6.957374572753906, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8523669242858887, + "num_tokens": 373270725.0, + "step": 9783 + }, + { + "epoch": 1.244625365729551, + "ewc_loss": 0.060119614005088806, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027160628815181553, + "grad_norm": 6.949322700500488, + "learning_rate": 1e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8412421941757202, + "num_tokens": 373307531.0, + "step": 9784 + }, + { + "epoch": 1.2447525760081415, + "ewc_loss": 0.06026948615908623, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002731050190050155, + "grad_norm": 6.930178165435791, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8659476041793823, + "num_tokens": 373351657.0, + "step": 9785 + }, + { + "epoch": 1.244879786286732, + "ewc_loss": 0.06021607667207718, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027257093461230397, + "grad_norm": 6.982725620269775, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8672513961791992, + "num_tokens": 373388811.0, + "step": 9786 + }, + { + "epoch": 1.2450069965653225, + "ewc_loss": 0.06010425090789795, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027145264903083444, + "grad_norm": 6.862205505371094, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.872505247592926, + "num_tokens": 373423378.0, + "step": 9787 + }, + { + "epoch": 1.245134206843913, + "ewc_loss": 0.060299307107925415, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027340324595570564, + "grad_norm": 6.947340488433838, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.864210307598114, + "num_tokens": 373460982.0, + "step": 9788 + }, + { + "epoch": 1.2452614171225034, + "ewc_loss": 0.06016148626804352, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027202500496059656, + "grad_norm": 6.871183395385742, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8697174191474915, + "num_tokens": 373501474.0, + "step": 9789 + }, + { + "epoch": 1.245388627401094, + "ewc_loss": 0.06033693253993988, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002737795002758503, + "grad_norm": 7.0199761390686035, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8661919236183167, + "num_tokens": 373537830.0, + "step": 9790 + }, + { + "epoch": 1.2455158376796844, + "ewc_loss": 0.060104094445705414, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027145107742398977, + "grad_norm": 6.845349311828613, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8562744855880737, + "num_tokens": 373573975.0, + "step": 9791 + }, + { + "epoch": 1.245643047958275, + "ewc_loss": 0.06039535254240036, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027436367236077785, + "grad_norm": 6.9807610511779785, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8411838412284851, + "num_tokens": 373612157.0, + "step": 9792 + }, + { + "epoch": 1.2457702582368655, + "ewc_loss": 0.06012127175927162, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027162287733517587, + "grad_norm": 6.848099231719971, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8810982704162598, + "num_tokens": 373645065.0, + "step": 9793 + }, + { + "epoch": 1.245897468515456, + "ewc_loss": 0.060507796704769135, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002754881279543042, + "grad_norm": 6.936164855957031, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8570603728294373, + "num_tokens": 373687658.0, + "step": 9794 + }, + { + "epoch": 1.2460246787940465, + "ewc_loss": 0.06030191481113434, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002734292938839644, + "grad_norm": 6.840678691864014, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8644330501556396, + "num_tokens": 373732262.0, + "step": 9795 + }, + { + "epoch": 1.246151889072637, + "ewc_loss": 0.06039934232831001, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027440357371233404, + "grad_norm": 6.975680828094482, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8433827757835388, + "num_tokens": 373772410.0, + "step": 9796 + }, + { + "epoch": 1.2462790993512276, + "ewc_loss": 0.0602702796459198, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002731129352468997, + "grad_norm": 6.915490627288818, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8630778193473816, + "num_tokens": 373808773.0, + "step": 9797 + }, + { + "epoch": 1.2464063096298181, + "ewc_loss": 0.060432251542806625, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002747326798271388, + "grad_norm": 7.037178039550781, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8582228422164917, + "num_tokens": 373848945.0, + "step": 9798 + }, + { + "epoch": 1.2465335199084087, + "ewc_loss": 0.06026148051023483, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000273024954367429, + "grad_norm": 6.886279106140137, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8574401140213013, + "num_tokens": 373892716.0, + "step": 9799 + }, + { + "epoch": 1.2466607301869992, + "ewc_loss": 0.06034862622618675, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000273896410362795, + "grad_norm": 6.98344612121582, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8784263134002686, + "num_tokens": 373929443.0, + "step": 9800 + }, + { + "epoch": 1.2467879404655897, + "ewc_loss": 0.06031203269958496, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002735304879024625, + "grad_norm": 6.961208343505859, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8633406162261963, + "num_tokens": 373968882.0, + "step": 9801 + }, + { + "epoch": 1.2469151507441802, + "ewc_loss": 0.06025915592908859, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002730017004068941, + "grad_norm": 6.854970455169678, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8764976859092712, + "num_tokens": 374009265.0, + "step": 9802 + }, + { + "epoch": 1.2470423610227706, + "ewc_loss": 0.06036553531885147, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002740655036177486, + "grad_norm": 6.984906196594238, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8470231890678406, + "num_tokens": 374047152.0, + "step": 9803 + }, + { + "epoch": 1.247169571301361, + "ewc_loss": 0.06019657850265503, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002723759680520743, + "grad_norm": 6.90948486328125, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8583865165710449, + "num_tokens": 374083127.0, + "step": 9804 + }, + { + "epoch": 1.2472967815799516, + "ewc_loss": 0.06039562076330185, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027436637901701033, + "grad_norm": 6.931153774261475, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8715440034866333, + "num_tokens": 374127086.0, + "step": 9805 + }, + { + "epoch": 1.2474239918585421, + "ewc_loss": 0.060280926525592804, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027321939705871046, + "grad_norm": 6.93925666809082, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8564358949661255, + "num_tokens": 374163879.0, + "step": 9806 + }, + { + "epoch": 1.2475512021371327, + "ewc_loss": 0.0603080615401268, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002734907902777195, + "grad_norm": 6.943211078643799, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8678662776947021, + "num_tokens": 374206639.0, + "step": 9807 + }, + { + "epoch": 1.2476784124157232, + "ewc_loss": 0.0603504553437233, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002739146875683218, + "grad_norm": 7.044224739074707, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8593518733978271, + "num_tokens": 374244106.0, + "step": 9808 + }, + { + "epoch": 1.2478056226943137, + "ewc_loss": 0.06017583981156349, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027216854505240917, + "grad_norm": 7.048552989959717, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8590496778488159, + "num_tokens": 374285776.0, + "step": 9809 + }, + { + "epoch": 1.2479328329729042, + "ewc_loss": 0.060125112533569336, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002716612652875483, + "grad_norm": 6.882330417633057, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8716713190078735, + "num_tokens": 374323036.0, + "step": 9810 + }, + { + "epoch": 1.2480600432514948, + "ewc_loss": 0.06030510738492012, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027346122078597546, + "grad_norm": 7.178520679473877, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8683569431304932, + "num_tokens": 374360204.0, + "step": 9811 + }, + { + "epoch": 1.2481872535300853, + "ewc_loss": 0.060025278478860855, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002706629456952214, + "grad_norm": 6.9516215324401855, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8727627992630005, + "num_tokens": 374394961.0, + "step": 9812 + }, + { + "epoch": 1.2483144638086758, + "ewc_loss": 0.06037207692861557, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027413092902861536, + "grad_norm": 7.113368034362793, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8653689622879028, + "num_tokens": 374436142.0, + "step": 9813 + }, + { + "epoch": 1.2484416740872661, + "ewc_loss": 0.05997801572084427, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027019029948860407, + "grad_norm": 6.8871989250183105, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8583178520202637, + "num_tokens": 374474263.0, + "step": 9814 + }, + { + "epoch": 1.2485688843658567, + "ewc_loss": 0.06038723513484001, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000274282501777634, + "grad_norm": 7.321150779724121, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8573973178863525, + "num_tokens": 374510250.0, + "step": 9815 + }, + { + "epoch": 1.2486960946444472, + "ewc_loss": 0.05997314304113388, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027014160878024995, + "grad_norm": 6.787835121154785, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8815116286277771, + "num_tokens": 374549579.0, + "step": 9816 + }, + { + "epoch": 1.2488233049230377, + "ewc_loss": 0.060805149376392365, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027846163720823824, + "grad_norm": 7.505645275115967, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.860733151435852, + "num_tokens": 374588631.0, + "step": 9817 + }, + { + "epoch": 1.2489505152016283, + "ewc_loss": 0.060025714337825775, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002706672821659595, + "grad_norm": 6.775483131408691, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8577314019203186, + "num_tokens": 374627092.0, + "step": 9818 + }, + { + "epoch": 1.2490777254802188, + "ewc_loss": 0.060956694185733795, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027997707366012037, + "grad_norm": 7.24666690826416, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.868859052658081, + "num_tokens": 374657797.0, + "step": 9819 + }, + { + "epoch": 1.2492049357588093, + "ewc_loss": 0.060216668993234634, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002725768426898867, + "grad_norm": 6.958086013793945, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8664528131484985, + "num_tokens": 374694011.0, + "step": 9820 + }, + { + "epoch": 1.2493321460373998, + "ewc_loss": 0.060676928609609604, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002771794388536364, + "grad_norm": 7.019900798797607, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8653190732002258, + "num_tokens": 374735127.0, + "step": 9821 + }, + { + "epoch": 1.2494593563159904, + "ewc_loss": 0.060398466885089874, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000274394842563197, + "grad_norm": 7.020251750946045, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8603302240371704, + "num_tokens": 374774170.0, + "step": 9822 + }, + { + "epoch": 1.249586566594581, + "ewc_loss": 0.060409292578697205, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002745030797086656, + "grad_norm": 7.224771499633789, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8636309504508972, + "num_tokens": 374809162.0, + "step": 9823 + }, + { + "epoch": 1.2497137768731714, + "ewc_loss": 0.060173191130161285, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000272142089670524, + "grad_norm": 6.919459342956543, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8548350930213928, + "num_tokens": 374847974.0, + "step": 9824 + }, + { + "epoch": 1.249840987151762, + "ewc_loss": 0.06037907674908638, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002742009237408638, + "grad_norm": 6.955817699432373, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8777183294296265, + "num_tokens": 374894911.0, + "step": 9825 + }, + { + "epoch": 1.2499681974303525, + "ewc_loss": 0.06031711399555206, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027358130319043994, + "grad_norm": 7.06190824508667, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8576771020889282, + "num_tokens": 374933534.0, + "step": 9826 + }, + { + "epoch": 1.250095407708943, + "ewc_loss": 0.060251835733652115, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002729285042732954, + "grad_norm": 6.961458683013916, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8489468693733215, + "num_tokens": 374968183.0, + "step": 9827 + }, + { + "epoch": 1.2502226179875333, + "ewc_loss": 0.06035670265555382, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027397717349231243, + "grad_norm": 7.014811992645264, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8563128709793091, + "num_tokens": 375005973.0, + "step": 9828 + }, + { + "epoch": 1.2503498282661238, + "ewc_loss": 0.06023961305618286, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002728062972892076, + "grad_norm": 6.932994365692139, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8564543724060059, + "num_tokens": 375043446.0, + "step": 9829 + }, + { + "epoch": 1.2504770385447144, + "ewc_loss": 0.06043129414319992, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027472307556308806, + "grad_norm": 6.999107360839844, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8661231398582458, + "num_tokens": 375081518.0, + "step": 9830 + }, + { + "epoch": 1.250604248823305, + "ewc_loss": 0.060258351266384125, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027299366774968803, + "grad_norm": 6.945721626281738, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8554189205169678, + "num_tokens": 375117767.0, + "step": 9831 + }, + { + "epoch": 1.2507314591018954, + "ewc_loss": 0.060485225170850754, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002752623986452818, + "grad_norm": 7.061228275299072, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8388662338256836, + "num_tokens": 375152061.0, + "step": 9832 + }, + { + "epoch": 1.250858669380486, + "ewc_loss": 0.060280315577983856, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000273213314358145, + "grad_norm": 6.904423713684082, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8559534549713135, + "num_tokens": 375195677.0, + "step": 9833 + }, + { + "epoch": 1.2509858796590765, + "ewc_loss": 0.06048149615526199, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002752251166384667, + "grad_norm": 7.065981864929199, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8552185297012329, + "num_tokens": 375230319.0, + "step": 9834 + }, + { + "epoch": 1.251113089937667, + "ewc_loss": 0.06021991744637489, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002726093225646764, + "grad_norm": 6.870461463928223, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8669755458831787, + "num_tokens": 375268970.0, + "step": 9835 + }, + { + "epoch": 1.2512403002162575, + "ewc_loss": 0.06063389033079147, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027674908051267266, + "grad_norm": 7.0490593910217285, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8697890043258667, + "num_tokens": 375303661.0, + "step": 9836 + }, + { + "epoch": 1.2513675104948478, + "ewc_loss": 0.060250990092754364, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027292006416246295, + "grad_norm": 6.845395088195801, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8702605366706848, + "num_tokens": 375343278.0, + "step": 9837 + }, + { + "epoch": 1.2514947207734384, + "ewc_loss": 0.060612648725509644, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002765366225503385, + "grad_norm": 7.010735511779785, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8698394298553467, + "num_tokens": 375378574.0, + "step": 9838 + }, + { + "epoch": 1.251621931052029, + "ewc_loss": 0.0603676438331604, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027408660389482975, + "grad_norm": 6.912089824676514, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8572819232940674, + "num_tokens": 375412781.0, + "step": 9839 + }, + { + "epoch": 1.2517491413306194, + "ewc_loss": 0.060507632791996, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002754864690359682, + "grad_norm": 7.017970085144043, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.874280571937561, + "num_tokens": 375450431.0, + "step": 9840 + }, + { + "epoch": 1.25187635160921, + "ewc_loss": 0.06036526709794998, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002740628260653466, + "grad_norm": 6.916243076324463, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.856010377407074, + "num_tokens": 375490692.0, + "step": 9841 + }, + { + "epoch": 1.2520035618878005, + "ewc_loss": 0.060478389263153076, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002751940337475389, + "grad_norm": 6.954492568969727, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8670878410339355, + "num_tokens": 375535278.0, + "step": 9842 + }, + { + "epoch": 1.252130772166391, + "ewc_loss": 0.06040100380778313, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027442019199952483, + "grad_norm": 6.935114860534668, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8554828763008118, + "num_tokens": 375575859.0, + "step": 9843 + }, + { + "epoch": 1.2522579824449815, + "ewc_loss": 0.06044123321771622, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002748224651440978, + "grad_norm": 7.0192718505859375, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8539758920669556, + "num_tokens": 375611084.0, + "step": 9844 + }, + { + "epoch": 1.252385192723572, + "ewc_loss": 0.06033293902873993, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002737395407166332, + "grad_norm": 6.914767265319824, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.872064471244812, + "num_tokens": 375644926.0, + "step": 9845 + }, + { + "epoch": 1.2525124030021626, + "ewc_loss": 0.06048477068543434, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027525785844773054, + "grad_norm": 6.982720851898193, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8704729080200195, + "num_tokens": 375686970.0, + "step": 9846 + }, + { + "epoch": 1.2526396132807531, + "ewc_loss": 0.060327813029289246, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002736882888711989, + "grad_norm": 6.923001766204834, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8731713891029358, + "num_tokens": 375727372.0, + "step": 9847 + }, + { + "epoch": 1.2527668235593437, + "ewc_loss": 0.0604843869805336, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002752540458459407, + "grad_norm": 7.020939826965332, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8505614995956421, + "num_tokens": 375763568.0, + "step": 9848 + }, + { + "epoch": 1.2528940338379342, + "ewc_loss": 0.06029587984085083, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002733689616434276, + "grad_norm": 7.000602722167969, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8463283181190491, + "num_tokens": 375797954.0, + "step": 9849 + }, + { + "epoch": 1.2530212441165247, + "ewc_loss": 0.06030956655740738, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027350583695806563, + "grad_norm": 6.985381603240967, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8604066371917725, + "num_tokens": 375836362.0, + "step": 9850 + }, + { + "epoch": 1.2531484543951152, + "ewc_loss": 0.060248877853155136, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027289893478155136, + "grad_norm": 6.949981689453125, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8570599555969238, + "num_tokens": 375879403.0, + "step": 9851 + }, + { + "epoch": 1.2532756646737058, + "ewc_loss": 0.06026831269264221, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027309329016134143, + "grad_norm": 6.946375846862793, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8648023009300232, + "num_tokens": 375916728.0, + "step": 9852 + }, + { + "epoch": 1.253402874952296, + "ewc_loss": 0.06021792069077492, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002725893573369831, + "grad_norm": 6.973072528839111, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8578224182128906, + "num_tokens": 375951743.0, + "step": 9853 + }, + { + "epoch": 1.2535300852308866, + "ewc_loss": 0.060324110090732574, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027365126879885793, + "grad_norm": 6.947316646575928, + "learning_rate": 1e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8368850946426392, + "num_tokens": 375988872.0, + "step": 9854 + }, + { + "epoch": 1.2536572955094771, + "ewc_loss": 0.06037181615829468, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027412830968387425, + "grad_norm": 6.972366809844971, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8509619832038879, + "num_tokens": 376030388.0, + "step": 9855 + }, + { + "epoch": 1.2537845057880677, + "ewc_loss": 0.06034553050994873, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027386544388718903, + "grad_norm": 6.982280254364014, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8719838857650757, + "num_tokens": 376062975.0, + "step": 9856 + }, + { + "epoch": 1.2539117160666582, + "ewc_loss": 0.06024027615785599, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027281290385872126, + "grad_norm": 6.9377312660217285, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8546345233917236, + "num_tokens": 376101873.0, + "step": 9857 + }, + { + "epoch": 1.2540389263452487, + "ewc_loss": 0.060322538018226624, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027363552362658083, + "grad_norm": 6.986055850982666, + "learning_rate": 1e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8409563302993774, + "num_tokens": 376138774.0, + "step": 9858 + }, + { + "epoch": 1.2541661366238392, + "ewc_loss": 0.06021985039114952, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002726086531765759, + "grad_norm": 6.967702388763428, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8537996411323547, + "num_tokens": 376172357.0, + "step": 9859 + }, + { + "epoch": 1.2542933469024298, + "ewc_loss": 0.06026965379714966, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002731066779233515, + "grad_norm": 6.910750865936279, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8518081903457642, + "num_tokens": 376212977.0, + "step": 9860 + }, + { + "epoch": 1.2544205571810203, + "ewc_loss": 0.06030106917023659, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027342085377313197, + "grad_norm": 6.937230110168457, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.852093517780304, + "num_tokens": 376256989.0, + "step": 9861 + }, + { + "epoch": 1.2545477674596106, + "ewc_loss": 0.060313526540994644, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002735454181674868, + "grad_norm": 6.942828178405762, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8663373589515686, + "num_tokens": 376300130.0, + "step": 9862 + }, + { + "epoch": 1.2546749777382011, + "ewc_loss": 0.06032484024763107, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002736585447564721, + "grad_norm": 6.969621658325195, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8649154901504517, + "num_tokens": 376337475.0, + "step": 9863 + }, + { + "epoch": 1.2548021880167917, + "ewc_loss": 0.060310907661914825, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002735192247200757, + "grad_norm": 6.965590476989746, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8528899550437927, + "num_tokens": 376376485.0, + "step": 9864 + }, + { + "epoch": 1.2549293982953822, + "ewc_loss": 0.060304414480924606, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027345429407432675, + "grad_norm": 6.9297051429748535, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8682846426963806, + "num_tokens": 376413393.0, + "step": 9865 + }, + { + "epoch": 1.2550566085739727, + "ewc_loss": 0.06036148592829704, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002740250201895833, + "grad_norm": 6.981474876403809, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8519965410232544, + "num_tokens": 376448192.0, + "step": 9866 + }, + { + "epoch": 1.2551838188525632, + "ewc_loss": 0.06025521457195282, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027296229382045567, + "grad_norm": 6.9106125831604, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8638572692871094, + "num_tokens": 376489382.0, + "step": 9867 + }, + { + "epoch": 1.2553110291311538, + "ewc_loss": 0.06035936623811722, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002740038326010108, + "grad_norm": 6.94949197769165, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8663898706436157, + "num_tokens": 376527221.0, + "step": 9868 + }, + { + "epoch": 1.2554382394097443, + "ewc_loss": 0.060304172337055206, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027345187845639884, + "grad_norm": 6.895479679107666, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8576887845993042, + "num_tokens": 376567045.0, + "step": 9869 + }, + { + "epoch": 1.2555654496883348, + "ewc_loss": 0.06043436378240585, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002747537801042199, + "grad_norm": 6.935095310211182, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8695814609527588, + "num_tokens": 376607039.0, + "step": 9870 + }, + { + "epoch": 1.2556926599669254, + "ewc_loss": 0.06037439405918121, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002741540956776589, + "grad_norm": 6.940351963043213, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8638285398483276, + "num_tokens": 376643220.0, + "step": 9871 + }, + { + "epoch": 1.255819870245516, + "ewc_loss": 0.06046439707279205, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027505410253070295, + "grad_norm": 6.950328826904297, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8815127015113831, + "num_tokens": 376681417.0, + "step": 9872 + }, + { + "epoch": 1.2559470805241064, + "ewc_loss": 0.06038504093885422, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002742605865933001, + "grad_norm": 6.905656814575195, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8807545900344849, + "num_tokens": 376722836.0, + "step": 9873 + }, + { + "epoch": 1.256074290802697, + "ewc_loss": 0.06046106666326523, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027502080774866045, + "grad_norm": 6.954453468322754, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8550462126731873, + "num_tokens": 376763194.0, + "step": 9874 + }, + { + "epoch": 1.2562015010812875, + "ewc_loss": 0.060389962047338486, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000274309772066772, + "grad_norm": 6.946404457092285, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8576719760894775, + "num_tokens": 376803701.0, + "step": 9875 + }, + { + "epoch": 1.256328711359878, + "ewc_loss": 0.060413870960474014, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027454886003397405, + "grad_norm": 6.938031196594238, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8710312843322754, + "num_tokens": 376835296.0, + "step": 9876 + }, + { + "epoch": 1.2564559216384683, + "ewc_loss": 0.06044524163007736, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002748625702224672, + "grad_norm": 6.931928634643555, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8645869493484497, + "num_tokens": 376876317.0, + "step": 9877 + }, + { + "epoch": 1.2565831319170588, + "ewc_loss": 0.060482610017061234, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027523626340553164, + "grad_norm": 6.923537731170654, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8607789278030396, + "num_tokens": 376916403.0, + "step": 9878 + }, + { + "epoch": 1.2567103421956494, + "ewc_loss": 0.06053759157657623, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027578609297052026, + "grad_norm": 6.956397533416748, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8715392351150513, + "num_tokens": 376956079.0, + "step": 9879 + }, + { + "epoch": 1.25683755247424, + "ewc_loss": 0.060458600521087646, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027499618590809405, + "grad_norm": 6.987574100494385, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8531404733657837, + "num_tokens": 376991618.0, + "step": 9880 + }, + { + "epoch": 1.2569647627528304, + "ewc_loss": 0.06050392985343933, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002754494780674577, + "grad_norm": 6.967175006866455, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8681583404541016, + "num_tokens": 377033964.0, + "step": 9881 + }, + { + "epoch": 1.257091973031421, + "ewc_loss": 0.060439251363277435, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027480264543555677, + "grad_norm": 6.9617109298706055, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8709306120872498, + "num_tokens": 377070091.0, + "step": 9882 + }, + { + "epoch": 1.2572191833100115, + "ewc_loss": 0.060411207377910614, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027452223002910614, + "grad_norm": 6.9640679359436035, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8708058595657349, + "num_tokens": 377111185.0, + "step": 9883 + }, + { + "epoch": 1.257346393588602, + "ewc_loss": 0.06041470542550087, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027455721283331513, + "grad_norm": 6.974029541015625, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8669859766960144, + "num_tokens": 377147637.0, + "step": 9884 + }, + { + "epoch": 1.2574736038671925, + "ewc_loss": 0.06049793213605881, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027538949507288635, + "grad_norm": 7.053473949432373, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8615347146987915, + "num_tokens": 377181918.0, + "step": 9885 + }, + { + "epoch": 1.2576008141457828, + "ewc_loss": 0.0603424534201622, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002738347102422267, + "grad_norm": 6.97873592376709, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8557156324386597, + "num_tokens": 377222900.0, + "step": 9886 + }, + { + "epoch": 1.2577280244243734, + "ewc_loss": 0.0604509562253952, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027491970104165375, + "grad_norm": 6.97620153427124, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8761411905288696, + "num_tokens": 377263107.0, + "step": 9887 + }, + { + "epoch": 1.257855234702964, + "ewc_loss": 0.060396455228328705, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027437470271252096, + "grad_norm": 6.951107978820801, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8771072626113892, + "num_tokens": 377301384.0, + "step": 9888 + }, + { + "epoch": 1.2579824449815544, + "ewc_loss": 0.060497645288705826, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027538661379367113, + "grad_norm": 7.034023284912109, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8554053902626038, + "num_tokens": 377339865.0, + "step": 9889 + }, + { + "epoch": 1.258109655260145, + "ewc_loss": 0.06032990664243698, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027370924362912774, + "grad_norm": 6.95200252532959, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8706893920898438, + "num_tokens": 377381582.0, + "step": 9890 + }, + { + "epoch": 1.2582368655387355, + "ewc_loss": 0.0605621337890625, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027603149646893144, + "grad_norm": 7.005268573760986, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8402004837989807, + "num_tokens": 377425548.0, + "step": 9891 + }, + { + "epoch": 1.258364075817326, + "ewc_loss": 0.060438770800828934, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027479787240736187, + "grad_norm": 7.019932746887207, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8609168529510498, + "num_tokens": 377462298.0, + "step": 9892 + }, + { + "epoch": 1.2584912860959165, + "ewc_loss": 0.06040722131729126, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027448238688521087, + "grad_norm": 7.049299716949463, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8563053011894226, + "num_tokens": 377503873.0, + "step": 9893 + }, + { + "epoch": 1.258618496374507, + "ewc_loss": 0.06043706834316254, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002747808466665447, + "grad_norm": 7.036590576171875, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8778719902038574, + "num_tokens": 377540497.0, + "step": 9894 + }, + { + "epoch": 1.2587457066530976, + "ewc_loss": 0.06032445281744003, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002736547030508518, + "grad_norm": 6.954894065856934, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8630369305610657, + "num_tokens": 377582242.0, + "step": 9895 + }, + { + "epoch": 1.2588729169316881, + "ewc_loss": 0.060533955693244934, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027574971318244934, + "grad_norm": 7.02743673324585, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8774321675300598, + "num_tokens": 377614972.0, + "step": 9896 + }, + { + "epoch": 1.2590001272102787, + "ewc_loss": 0.06034823879599571, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027389253955334425, + "grad_norm": 7.015458583831787, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.859861433506012, + "num_tokens": 377645392.0, + "step": 9897 + }, + { + "epoch": 1.2591273374888692, + "ewc_loss": 0.060531359165906906, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027572375256568193, + "grad_norm": 7.029635429382324, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.870875358581543, + "num_tokens": 377680317.0, + "step": 9898 + }, + { + "epoch": 1.2592545477674597, + "ewc_loss": 0.06043919920921326, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000274802150670439, + "grad_norm": 7.027945518493652, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8626922369003296, + "num_tokens": 377714879.0, + "step": 9899 + }, + { + "epoch": 1.2593817580460502, + "ewc_loss": 0.06048278510570526, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027523800963535905, + "grad_norm": 6.992544174194336, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8476921319961548, + "num_tokens": 377751404.0, + "step": 9900 + }, + { + "epoch": 1.2595089683246408, + "ewc_loss": 0.06050795316696167, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002754896995611489, + "grad_norm": 7.012179851531982, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8669349551200867, + "num_tokens": 377788354.0, + "step": 9901 + }, + { + "epoch": 1.259636178603231, + "ewc_loss": 0.06044371426105499, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002748472907114774, + "grad_norm": 6.9820146560668945, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8513535261154175, + "num_tokens": 377827259.0, + "step": 9902 + }, + { + "epoch": 1.2597633888818216, + "ewc_loss": 0.06048225238919258, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027523268363438547, + "grad_norm": 6.963700771331787, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8546948432922363, + "num_tokens": 377872477.0, + "step": 9903 + }, + { + "epoch": 1.2598905991604121, + "ewc_loss": 0.06063047796487808, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027671491261571646, + "grad_norm": 7.090023517608643, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8543524742126465, + "num_tokens": 377911159.0, + "step": 9904 + }, + { + "epoch": 1.2600178094390027, + "ewc_loss": 0.06043326109647751, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002747427497524768, + "grad_norm": 6.948844909667969, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8648257255554199, + "num_tokens": 377951919.0, + "step": 9905 + }, + { + "epoch": 1.2601450197175932, + "ewc_loss": 0.06065069139003754, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002769170969258994, + "grad_norm": 7.036263942718506, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8450959920883179, + "num_tokens": 377997567.0, + "step": 9906 + }, + { + "epoch": 1.2602722299961837, + "ewc_loss": 0.06037919223308563, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002742020587902516, + "grad_norm": 6.997844696044922, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8612262010574341, + "num_tokens": 378033811.0, + "step": 9907 + }, + { + "epoch": 1.2603994402747742, + "ewc_loss": 0.06055685877799988, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002759787312243134, + "grad_norm": 7.075894355773926, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8500343561172485, + "num_tokens": 378070136.0, + "step": 9908 + }, + { + "epoch": 1.2605266505533648, + "ewc_loss": 0.060390833765268326, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000274318503215909, + "grad_norm": 7.011674880981445, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8596752285957336, + "num_tokens": 378106902.0, + "step": 9909 + }, + { + "epoch": 1.2606538608319553, + "ewc_loss": 0.06047028303146362, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002751129795797169, + "grad_norm": 6.967190742492676, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8658608794212341, + "num_tokens": 378144638.0, + "step": 9910 + }, + { + "epoch": 1.2607810711105456, + "ewc_loss": 0.06055496260523796, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027595978463068604, + "grad_norm": 7.041764736175537, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.871400773525238, + "num_tokens": 378179059.0, + "step": 9911 + }, + { + "epoch": 1.2609082813891361, + "ewc_loss": 0.06243178993463516, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00027519682771526277, + "grad_norm": 53.71527099609375, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8484333753585815, + "num_tokens": 378210346.0, + "step": 9912 + }, + { + "epoch": 1.2610354916677267, + "ewc_loss": 0.09397044777870178, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0006101146573200822, + "grad_norm": 11.168924331665039, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8677332401275635, + "num_tokens": 378242687.0, + "step": 9913 + }, + { + "epoch": 1.2611627019463172, + "ewc_loss": 0.06231755018234253, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029114427161403, + "grad_norm": 6.3067216873168945, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8674274682998657, + "num_tokens": 378279421.0, + "step": 9914 + }, + { + "epoch": 1.2612899122249077, + "ewc_loss": 0.07775823771953583, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00044799252646043897, + "grad_norm": 9.97800350189209, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8571910858154297, + "num_tokens": 378318133.0, + "step": 9915 + }, + { + "epoch": 1.2614171225034982, + "ewc_loss": 0.08324073255062103, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0005003761034458876, + "grad_norm": 9.895485877990723, + "learning_rate": 1e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.844264030456543, + "num_tokens": 378362987.0, + "step": 9916 + }, + { + "epoch": 1.2615443327820888, + "ewc_loss": 0.06766749918460846, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0003470851224847138, + "grad_norm": 7.4672393798828125, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.879463791847229, + "num_tokens": 378396197.0, + "step": 9917 + }, + { + "epoch": 1.2616715430606793, + "ewc_loss": 0.06942294538021088, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0003646395925898105, + "grad_norm": 8.593156814575195, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8697983026504517, + "num_tokens": 378433691.0, + "step": 9918 + }, + { + "epoch": 1.2617987533392698, + "ewc_loss": 0.07148049771785736, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0003827736945822835, + "grad_norm": 8.264819145202637, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8659384250640869, + "num_tokens": 378470277.0, + "step": 9919 + }, + { + "epoch": 1.2619259636178604, + "ewc_loss": 0.06552140414714813, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00032562421984039247, + "grad_norm": 7.574156284332275, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8559930324554443, + "num_tokens": 378510500.0, + "step": 9920 + }, + { + "epoch": 1.2620531738964509, + "ewc_loss": 0.0661630630493164, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000332040770445019, + "grad_norm": 7.9054999351501465, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.859826922416687, + "num_tokens": 378552893.0, + "step": 9921 + }, + { + "epoch": 1.2621803841750414, + "ewc_loss": 0.06544804573059082, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0003248905995860696, + "grad_norm": 7.535079002380371, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8687056303024292, + "num_tokens": 378590917.0, + "step": 9922 + }, + { + "epoch": 1.262307594453632, + "ewc_loss": 0.0639614462852478, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0003100246249232441, + "grad_norm": 7.5162882804870605, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8706412315368652, + "num_tokens": 378637764.0, + "step": 9923 + }, + { + "epoch": 1.2624348047322225, + "ewc_loss": 0.063670814037323, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0003071183164138347, + "grad_norm": 7.6127400398254395, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8589945435523987, + "num_tokens": 378675497.0, + "step": 9924 + }, + { + "epoch": 1.262562015010813, + "ewc_loss": 0.06303137540817261, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00030072388472035527, + "grad_norm": 7.374785900115967, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8538141250610352, + "num_tokens": 378710470.0, + "step": 9925 + }, + { + "epoch": 1.2626892252894033, + "ewc_loss": 0.062330685555934906, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002937170211225748, + "grad_norm": 7.235380172729492, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8589017987251282, + "num_tokens": 378758116.0, + "step": 9926 + }, + { + "epoch": 1.2628164355679938, + "ewc_loss": 0.062299855053424835, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029340869514271617, + "grad_norm": 7.258188724517822, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8591799736022949, + "num_tokens": 378798166.0, + "step": 9927 + }, + { + "epoch": 1.2629436458465844, + "ewc_loss": 0.06179840862751007, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002883942215703428, + "grad_norm": 7.168655872344971, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8773798942565918, + "num_tokens": 378839899.0, + "step": 9928 + }, + { + "epoch": 1.263070856125175, + "ewc_loss": 0.06126333028078079, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002879262901842594, + "grad_norm": 7.136497497558594, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8628588318824768, + "num_tokens": 378886022.0, + "step": 9929 + }, + { + "epoch": 1.2631980664037654, + "ewc_loss": 0.06098929047584534, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00028518587350845337, + "grad_norm": 7.101545810699463, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8518558740615845, + "num_tokens": 378925097.0, + "step": 9930 + }, + { + "epoch": 1.263325276682356, + "ewc_loss": 0.060927119106054306, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002845641574822366, + "grad_norm": 7.118167877197266, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.874202311038971, + "num_tokens": 378965242.0, + "step": 9931 + }, + { + "epoch": 1.2634524869609465, + "ewc_loss": 0.060763053596019745, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002829234872478992, + "grad_norm": 7.083106517791748, + "learning_rate": 1e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8418886661529541, + "num_tokens": 379005932.0, + "step": 9932 + }, + { + "epoch": 1.263579697239537, + "ewc_loss": 0.060980893671512604, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00028266047593206167, + "grad_norm": 7.065114974975586, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8597034215927124, + "num_tokens": 379045817.0, + "step": 9933 + }, + { + "epoch": 1.2637069075181275, + "ewc_loss": 0.060930825769901276, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00028215983184054494, + "grad_norm": 7.117641448974609, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8553075194358826, + "num_tokens": 379086481.0, + "step": 9934 + }, + { + "epoch": 1.2638341177967178, + "ewc_loss": 0.06074870750308037, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00028033863054588437, + "grad_norm": 7.005096912384033, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8454495668411255, + "num_tokens": 379129782.0, + "step": 9935 + }, + { + "epoch": 1.2639613280753084, + "ewc_loss": 0.06077704578638077, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.0002806220145430416, + "grad_norm": 7.029997825622559, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.859752357006073, + "num_tokens": 379171525.0, + "step": 9936 + }, + { + "epoch": 1.264088538353899, + "ewc_loss": 0.06077449768781662, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00028059654869139194, + "grad_norm": 7.029049873352051, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8555071353912354, + "num_tokens": 379213887.0, + "step": 9937 + }, + { + "epoch": 1.2642157486324894, + "ewc_loss": 0.06047901511192322, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.0002800831280183047, + "grad_norm": 7.009600639343262, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8626512289047241, + "num_tokens": 379252634.0, + "step": 9938 + }, + { + "epoch": 1.26434295891108, + "ewc_loss": 0.06055183708667755, + "ewc_loss_diag": 3.24249267578125e-05, + "ewc_loss_parallel": 0.00028081133496016264, + "grad_norm": 7.105344295501709, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8608258962631226, + "num_tokens": 379287402.0, + "step": 9939 + }, + { + "epoch": 1.2644701691896705, + "ewc_loss": 0.06067489832639694, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00027960052830167115, + "grad_norm": 7.014480113983154, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8678361177444458, + "num_tokens": 379325592.0, + "step": 9940 + }, + { + "epoch": 1.264597379468261, + "ewc_loss": 0.06074095889925957, + "ewc_loss_diag": 3.266334533691406e-05, + "ewc_loss_parallel": 0.00028026115614920855, + "grad_norm": 7.047577857971191, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8497514724731445, + "num_tokens": 379359010.0, + "step": 9941 + }, + { + "epoch": 1.2647245897468515, + "ewc_loss": 0.06085766851902008, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027898681582883, + "grad_norm": 7.010895729064941, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8547125458717346, + "num_tokens": 379399783.0, + "step": 9942 + }, + { + "epoch": 1.264851800025442, + "ewc_loss": 0.06091630458831787, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002795731998048723, + "grad_norm": 7.019647121429443, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8641250729560852, + "num_tokens": 379442207.0, + "step": 9943 + }, + { + "epoch": 1.2649790103040326, + "ewc_loss": 0.06090302765369415, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002794404572341591, + "grad_norm": 7.003276824951172, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.861452579498291, + "num_tokens": 379479561.0, + "step": 9944 + }, + { + "epoch": 1.2651062205826231, + "ewc_loss": 0.06085279956459999, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027893815422430634, + "grad_norm": 7.027480125427246, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8679109811782837, + "num_tokens": 379516413.0, + "step": 9945 + }, + { + "epoch": 1.2652334308612136, + "ewc_loss": 0.060919396579265594, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002796041371766478, + "grad_norm": 7.003288745880127, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8532620668411255, + "num_tokens": 379557993.0, + "step": 9946 + }, + { + "epoch": 1.2653606411398042, + "ewc_loss": 0.061017222702503204, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805823751259595, + "grad_norm": 7.100871562957764, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8436204791069031, + "num_tokens": 379594028.0, + "step": 9947 + }, + { + "epoch": 1.2654878514183947, + "ewc_loss": 0.060873400419950485, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002791441511362791, + "grad_norm": 7.015817165374756, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8554352521896362, + "num_tokens": 379631235.0, + "step": 9948 + }, + { + "epoch": 1.2656150616969852, + "ewc_loss": 0.060925088822841644, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002796610351651907, + "grad_norm": 7.014878749847412, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8497751355171204, + "num_tokens": 379668714.0, + "step": 9949 + }, + { + "epoch": 1.2657422719755758, + "ewc_loss": 0.060870587825775146, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027911600773222744, + "grad_norm": 6.9843292236328125, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8679454326629639, + "num_tokens": 379710233.0, + "step": 9950 + }, + { + "epoch": 1.265869482254166, + "ewc_loss": 0.06101423501968384, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805525145959109, + "grad_norm": 7.095774173736572, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8578716516494751, + "num_tokens": 379746208.0, + "step": 9951 + }, + { + "epoch": 1.2659966925327566, + "ewc_loss": 0.060843899846076965, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027884915471076965, + "grad_norm": 6.930736541748047, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8706573843955994, + "num_tokens": 379796420.0, + "step": 9952 + }, + { + "epoch": 1.2661239028113471, + "ewc_loss": 0.06109648942947388, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028137507615610957, + "grad_norm": 7.108235836029053, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8632408380508423, + "num_tokens": 379830027.0, + "step": 9953 + }, + { + "epoch": 1.2662511130899377, + "ewc_loss": 0.060748279094696045, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027789294836111367, + "grad_norm": 6.947627067565918, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8600925207138062, + "num_tokens": 379871751.0, + "step": 9954 + }, + { + "epoch": 1.2663783233685282, + "ewc_loss": 0.061104901134967804, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002814591571222991, + "grad_norm": 7.085498809814453, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8555856943130493, + "num_tokens": 379908255.0, + "step": 9955 + }, + { + "epoch": 1.2665055336471187, + "ewc_loss": 0.060792550444602966, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027833564672619104, + "grad_norm": 7.014524459838867, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8643171191215515, + "num_tokens": 379944502.0, + "step": 9956 + }, + { + "epoch": 1.2666327439257092, + "ewc_loss": 0.06110499054193497, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028146005934104323, + "grad_norm": 7.0546345710754395, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8580347895622253, + "num_tokens": 379981929.0, + "step": 9957 + }, + { + "epoch": 1.2667599542042998, + "ewc_loss": 0.060880884528160095, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002792190061882138, + "grad_norm": 7.013806343078613, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8584626913070679, + "num_tokens": 380018876.0, + "step": 9958 + }, + { + "epoch": 1.2668871644828903, + "ewc_loss": 0.06086185574531555, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027902869624085724, + "grad_norm": 7.018935680389404, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8569928407669067, + "num_tokens": 380054153.0, + "step": 9959 + }, + { + "epoch": 1.2670143747614806, + "ewc_loss": 0.06087066978216171, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002791168517433107, + "grad_norm": 7.077037811279297, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8531737327575684, + "num_tokens": 380086550.0, + "step": 9960 + }, + { + "epoch": 1.2671415850400711, + "ewc_loss": 0.06084195524454117, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027882971335202456, + "grad_norm": 7.083138942718506, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8572635054588318, + "num_tokens": 380120308.0, + "step": 9961 + }, + { + "epoch": 1.2672687953186617, + "ewc_loss": 0.060795024037361145, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027836038498207927, + "grad_norm": 7.021449565887451, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8446704149246216, + "num_tokens": 380160204.0, + "step": 9962 + }, + { + "epoch": 1.2673960055972522, + "ewc_loss": 0.06082374230027199, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002786475815810263, + "grad_norm": 7.043301105499268, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8644068837165833, + "num_tokens": 380196687.0, + "step": 9963 + }, + { + "epoch": 1.2675232158758427, + "ewc_loss": 0.06081429868936539, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027855316875502467, + "grad_norm": 6.9607672691345215, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8765178918838501, + "num_tokens": 380237451.0, + "step": 9964 + }, + { + "epoch": 1.2676504261544332, + "ewc_loss": 0.060874685645103455, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000279157015029341, + "grad_norm": 7.0348052978515625, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8575544953346252, + "num_tokens": 380276631.0, + "step": 9965 + }, + { + "epoch": 1.2677776364330238, + "ewc_loss": 0.0607730969786644, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027814111672341824, + "grad_norm": 6.993794918060303, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8664626479148865, + "num_tokens": 380319432.0, + "step": 9966 + }, + { + "epoch": 1.2679048467116143, + "ewc_loss": 0.06094995141029358, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027990966918878257, + "grad_norm": 7.07893705368042, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8689606189727783, + "num_tokens": 380356666.0, + "step": 9967 + }, + { + "epoch": 1.2680320569902048, + "ewc_loss": 0.06073010712862015, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002777112240437418, + "grad_norm": 6.992617130279541, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8703205585479736, + "num_tokens": 380390347.0, + "step": 9968 + }, + { + "epoch": 1.2681592672687954, + "ewc_loss": 0.06094495207071304, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002798596688080579, + "grad_norm": 7.1016058921813965, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.864898681640625, + "num_tokens": 380424083.0, + "step": 9969 + }, + { + "epoch": 1.2682864775473859, + "ewc_loss": 0.06070180982351303, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000277428247500211, + "grad_norm": 6.987208843231201, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8569205403327942, + "num_tokens": 380471179.0, + "step": 9970 + }, + { + "epoch": 1.2684136878259764, + "ewc_loss": 0.06092705577611923, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002796807384584099, + "grad_norm": 7.083220481872559, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8634335994720459, + "num_tokens": 380510291.0, + "step": 9971 + }, + { + "epoch": 1.268540898104567, + "ewc_loss": 0.060706302523612976, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002774731838144362, + "grad_norm": 6.942174434661865, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8660038709640503, + "num_tokens": 380556297.0, + "step": 9972 + }, + { + "epoch": 1.2686681083831575, + "ewc_loss": 0.06079714000225067, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002783815434668213, + "grad_norm": 7.064791202545166, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8726450800895691, + "num_tokens": 380592113.0, + "step": 9973 + }, + { + "epoch": 1.268795318661748, + "ewc_loss": 0.06074695289134979, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002778796770144254, + "grad_norm": 7.073429584503174, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8737676739692688, + "num_tokens": 380623942.0, + "step": 9974 + }, + { + "epoch": 1.2689225289403383, + "ewc_loss": 0.06083018332719803, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027871198835782707, + "grad_norm": 7.02933406829834, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8547122478485107, + "num_tokens": 380662300.0, + "step": 9975 + }, + { + "epoch": 1.2690497392189288, + "ewc_loss": 0.060725919902324677, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027766937273554504, + "grad_norm": 6.951488971710205, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8637653589248657, + "num_tokens": 380701452.0, + "step": 9976 + }, + { + "epoch": 1.2691769494975194, + "ewc_loss": 0.060864176601171494, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002790519210975617, + "grad_norm": 7.013657569885254, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8727649450302124, + "num_tokens": 380741248.0, + "step": 9977 + }, + { + "epoch": 1.2693041597761099, + "ewc_loss": 0.06074988842010498, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027790904277935624, + "grad_norm": 6.973903656005859, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8556374907493591, + "num_tokens": 380781455.0, + "step": 9978 + }, + { + "epoch": 1.2694313700547004, + "ewc_loss": 0.060897793620824814, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027938809944316745, + "grad_norm": 7.0704474449157715, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.852206826210022, + "num_tokens": 380818139.0, + "step": 9979 + }, + { + "epoch": 1.269558580333291, + "ewc_loss": 0.06070834398269653, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002774936147034168, + "grad_norm": 6.942167282104492, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8673481345176697, + "num_tokens": 380860530.0, + "step": 9980 + }, + { + "epoch": 1.2696857906118815, + "ewc_loss": 0.060929074883461, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002797009074129164, + "grad_norm": 7.04155969619751, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8746157288551331, + "num_tokens": 380900825.0, + "step": 9981 + }, + { + "epoch": 1.269813000890472, + "ewc_loss": 0.06073945760726929, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002778047346509993, + "grad_norm": 6.957754611968994, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8588070869445801, + "num_tokens": 380939270.0, + "step": 9982 + }, + { + "epoch": 1.2699402111690625, + "ewc_loss": 0.060947198420763016, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002798821369651705, + "grad_norm": 7.03945255279541, + "learning_rate": 1e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8399345874786377, + "num_tokens": 380981708.0, + "step": 9983 + }, + { + "epoch": 1.2700674214476528, + "ewc_loss": 0.06082741916179657, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027868433971889317, + "grad_norm": 7.042614936828613, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8661044836044312, + "num_tokens": 381015186.0, + "step": 9984 + }, + { + "epoch": 1.2701946317262434, + "ewc_loss": 0.060789115726947784, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002783013042062521, + "grad_norm": 8.24011516571045, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8714967370033264, + "num_tokens": 381055503.0, + "step": 9985 + }, + { + "epoch": 1.270321842004834, + "ewc_loss": 0.06077801436185837, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027819027309305966, + "grad_norm": 6.809776306152344, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8743764162063599, + "num_tokens": 381091935.0, + "step": 9986 + }, + { + "epoch": 1.2704490522834244, + "ewc_loss": 0.06155383586883545, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002859485102817416, + "grad_norm": 7.324394702911377, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8666926622390747, + "num_tokens": 381127951.0, + "step": 9987 + }, + { + "epoch": 1.270576262562015, + "ewc_loss": 0.060508400201797485, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027549415244720876, + "grad_norm": 6.904947757720947, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.855768084526062, + "num_tokens": 381161918.0, + "step": 9988 + }, + { + "epoch": 1.2707034728406055, + "ewc_loss": 0.0615273118019104, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028568325797095895, + "grad_norm": 7.197873592376709, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8494654297828674, + "num_tokens": 381198747.0, + "step": 9989 + }, + { + "epoch": 1.270830683119196, + "ewc_loss": 0.060722716152668, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002776373003143817, + "grad_norm": 6.933033466339111, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8631668090820312, + "num_tokens": 381242714.0, + "step": 9990 + }, + { + "epoch": 1.2709578933977865, + "ewc_loss": 0.0612528994679451, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002829391451086849, + "grad_norm": 7.143002510070801, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8725136518478394, + "num_tokens": 381274667.0, + "step": 9991 + }, + { + "epoch": 1.271085103676377, + "ewc_loss": 0.0608404204249382, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027881437563337386, + "grad_norm": 8.20553970336914, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8719971179962158, + "num_tokens": 381318110.0, + "step": 9992 + }, + { + "epoch": 1.2712123139549676, + "ewc_loss": 0.06074317544698715, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000277841929346323, + "grad_norm": 6.845746994018555, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8625236749649048, + "num_tokens": 381356694.0, + "step": 9993 + }, + { + "epoch": 1.2713395242335581, + "ewc_loss": 0.06148838996887207, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028529405244626105, + "grad_norm": 7.27392578125, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8506494164466858, + "num_tokens": 381395218.0, + "step": 9994 + }, + { + "epoch": 1.2714667345121486, + "ewc_loss": 0.06048941612243652, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002753043081611395, + "grad_norm": 6.9307122230529785, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8755713105201721, + "num_tokens": 381429195.0, + "step": 9995 + }, + { + "epoch": 1.2715939447907392, + "ewc_loss": 0.061376169323921204, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028417183784767985, + "grad_norm": 7.181366443634033, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8552992343902588, + "num_tokens": 381468276.0, + "step": 9996 + }, + { + "epoch": 1.2717211550693297, + "ewc_loss": 0.0607326366007328, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002777365152724087, + "grad_norm": 6.9792561531066895, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8696476221084595, + "num_tokens": 381503400.0, + "step": 9997 + }, + { + "epoch": 1.2718483653479202, + "ewc_loss": 0.06110829859972, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002814931212924421, + "grad_norm": 7.201095104217529, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8670644760131836, + "num_tokens": 381542126.0, + "step": 9998 + }, + { + "epoch": 1.2719755756265108, + "ewc_loss": 0.06075778976082802, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002779880596790463, + "grad_norm": 7.000864505767822, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8698230981826782, + "num_tokens": 381579169.0, + "step": 9999 + }, + { + "epoch": 1.272102785905101, + "ewc_loss": 0.06103913113474846, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002808014687616378, + "grad_norm": 7.1052141189575195, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8477418422698975, + "num_tokens": 381615501.0, + "step": 10000 + }, + { + "epoch": 1.2722299961836916, + "ewc_loss": 0.060771599411964417, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002781261573545635, + "grad_norm": 7.041722297668457, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.856019914150238, + "num_tokens": 381649737.0, + "step": 10001 + }, + { + "epoch": 1.2723572064622821, + "ewc_loss": 0.060923073440790176, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027964089531451464, + "grad_norm": 7.0347771644592285, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8654709458351135, + "num_tokens": 381693776.0, + "step": 10002 + }, + { + "epoch": 1.2724844167408726, + "ewc_loss": 0.060838207602500916, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002787922276183963, + "grad_norm": 7.005715847015381, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8711435794830322, + "num_tokens": 381733324.0, + "step": 10003 + }, + { + "epoch": 1.2726116270194632, + "ewc_loss": 0.060814231634140015, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027855249936692417, + "grad_norm": 7.084270477294922, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8570429682731628, + "num_tokens": 381765893.0, + "step": 10004 + }, + { + "epoch": 1.2727388372980537, + "ewc_loss": 0.06083028018474579, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027871294878423214, + "grad_norm": 7.000696659088135, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8591877222061157, + "num_tokens": 381802782.0, + "step": 10005 + }, + { + "epoch": 1.2728660475766442, + "ewc_loss": 0.060996830463409424, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028037847368977964, + "grad_norm": 7.018605709075928, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.864861011505127, + "num_tokens": 381843448.0, + "step": 10006 + }, + { + "epoch": 1.2729932578552348, + "ewc_loss": 0.060786642134189606, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002782765659503639, + "grad_norm": 7.049346446990967, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8511178493499756, + "num_tokens": 381878850.0, + "step": 10007 + }, + { + "epoch": 1.2731204681338253, + "ewc_loss": 0.06087324023246765, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027914257952943444, + "grad_norm": 7.054633617401123, + "learning_rate": 1e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8407236337661743, + "num_tokens": 381915508.0, + "step": 10008 + }, + { + "epoch": 1.2732476784124156, + "ewc_loss": 0.06086903065443039, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027910046628676355, + "grad_norm": 7.004725456237793, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8524113893508911, + "num_tokens": 381952699.0, + "step": 10009 + }, + { + "epoch": 1.2733748886910061, + "ewc_loss": 0.060882702469825745, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002792371960822493, + "grad_norm": 7.074310302734375, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8557178974151611, + "num_tokens": 381989947.0, + "step": 10010 + }, + { + "epoch": 1.2735020989695967, + "ewc_loss": 0.06084097549319267, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027881990536116064, + "grad_norm": 6.9970927238464355, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8612864017486572, + "num_tokens": 382027701.0, + "step": 10011 + }, + { + "epoch": 1.2736293092481872, + "ewc_loss": 0.06076989322900772, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027810907340608537, + "grad_norm": 7.020190238952637, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8602802753448486, + "num_tokens": 382066415.0, + "step": 10012 + }, + { + "epoch": 1.2737565195267777, + "ewc_loss": 0.060828547924757004, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002786956320051104, + "grad_norm": 7.101390361785889, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8568880558013916, + "num_tokens": 382108402.0, + "step": 10013 + }, + { + "epoch": 1.2738837298053682, + "ewc_loss": 0.06076037138700485, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027801384567283094, + "grad_norm": 7.028989315032959, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8629278540611267, + "num_tokens": 382146865.0, + "step": 10014 + }, + { + "epoch": 1.2740109400839588, + "ewc_loss": 0.06076471135020256, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002780572685878724, + "grad_norm": 7.048519134521484, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8583174347877502, + "num_tokens": 382186746.0, + "step": 10015 + }, + { + "epoch": 1.2741381503625493, + "ewc_loss": 0.06071474403142929, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002775576140265912, + "grad_norm": 7.088818073272705, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8754241466522217, + "num_tokens": 382220693.0, + "step": 10016 + }, + { + "epoch": 1.2742653606411398, + "ewc_loss": 0.06065783649682999, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027698854682967067, + "grad_norm": 7.0265607833862305, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.855634868144989, + "num_tokens": 382259549.0, + "step": 10017 + }, + { + "epoch": 1.2743925709197303, + "ewc_loss": 0.06074943765997887, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027790453168563545, + "grad_norm": 6.9989471435546875, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8724638223648071, + "num_tokens": 382299164.0, + "step": 10018 + }, + { + "epoch": 1.2745197811983209, + "ewc_loss": 0.060609351843595505, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002765036770142615, + "grad_norm": 7.055466651916504, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8759026527404785, + "num_tokens": 382333596.0, + "step": 10019 + }, + { + "epoch": 1.2746469914769114, + "ewc_loss": 0.060704804956912994, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027745822444558144, + "grad_norm": 7.081629753112793, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8635554313659668, + "num_tokens": 382372321.0, + "step": 10020 + }, + { + "epoch": 1.274774201755502, + "ewc_loss": 0.060564398765563965, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002760541392490268, + "grad_norm": 6.988222599029541, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8653459548950195, + "num_tokens": 382411125.0, + "step": 10021 + }, + { + "epoch": 1.2749014120340925, + "ewc_loss": 0.06079176068305969, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002783277886919677, + "grad_norm": 7.103603363037109, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.850657045841217, + "num_tokens": 382450935.0, + "step": 10022 + }, + { + "epoch": 1.275028622312683, + "ewc_loss": 0.06053733825683594, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002757835609372705, + "grad_norm": 7.003775596618652, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8593974709510803, + "num_tokens": 382488232.0, + "step": 10023 + }, + { + "epoch": 1.2751558325912733, + "ewc_loss": 0.06078903377056122, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027830051840282977, + "grad_norm": 7.0413126945495605, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8583632707595825, + "num_tokens": 382522280.0, + "step": 10024 + }, + { + "epoch": 1.2752830428698638, + "ewc_loss": 0.06062769144773483, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027668706024996936, + "grad_norm": 7.052338600158691, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8717241883277893, + "num_tokens": 382562788.0, + "step": 10025 + }, + { + "epoch": 1.2754102531484544, + "ewc_loss": 0.060652054846286774, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027693071751855314, + "grad_norm": 6.996394157409668, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8538807034492493, + "num_tokens": 382598534.0, + "step": 10026 + }, + { + "epoch": 1.2755374634270449, + "ewc_loss": 0.06074470281600952, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027785717975348234, + "grad_norm": 7.044816017150879, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8557178974151611, + "num_tokens": 382641803.0, + "step": 10027 + }, + { + "epoch": 1.2756646737056354, + "ewc_loss": 0.0606638640165329, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027704882086254656, + "grad_norm": 7.069950103759766, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.872376561164856, + "num_tokens": 382677871.0, + "step": 10028 + }, + { + "epoch": 1.275791883984226, + "ewc_loss": 0.060633447021245956, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002767446276266128, + "grad_norm": 6.946351051330566, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8712930083274841, + "num_tokens": 382714989.0, + "step": 10029 + }, + { + "epoch": 1.2759190942628165, + "ewc_loss": 0.06078733876347542, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002782835508696735, + "grad_norm": 7.044081211090088, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8648421764373779, + "num_tokens": 382751919.0, + "step": 10030 + }, + { + "epoch": 1.276046304541407, + "ewc_loss": 0.06060589849948883, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002764691598713398, + "grad_norm": 6.938309669494629, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8729485273361206, + "num_tokens": 382793550.0, + "step": 10031 + }, + { + "epoch": 1.2761735148199975, + "ewc_loss": 0.060871537774801254, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002791255246847868, + "grad_norm": 7.012599468231201, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8711981773376465, + "num_tokens": 382833142.0, + "step": 10032 + }, + { + "epoch": 1.2763007250985878, + "ewc_loss": 0.060742270201444626, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002778328489512205, + "grad_norm": 7.066705703735352, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8701646327972412, + "num_tokens": 382869934.0, + "step": 10033 + }, + { + "epoch": 1.2764279353771784, + "ewc_loss": 0.060695797204971313, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027736814809031785, + "grad_norm": 7.01751184463501, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8483086824417114, + "num_tokens": 382909533.0, + "step": 10034 + }, + { + "epoch": 1.2765551456557689, + "ewc_loss": 0.06081559136509895, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000278566061751917, + "grad_norm": 7.0268988609313965, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.868678629398346, + "num_tokens": 382947462.0, + "step": 10035 + }, + { + "epoch": 1.2766823559343594, + "ewc_loss": 0.06068084388971329, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027721861260943115, + "grad_norm": 6.969161510467529, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8632290363311768, + "num_tokens": 382989973.0, + "step": 10036 + }, + { + "epoch": 1.27680956621295, + "ewc_loss": 0.060898974537849426, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002793998864945024, + "grad_norm": 6.997964382171631, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8613390326499939, + "num_tokens": 383033052.0, + "step": 10037 + }, + { + "epoch": 1.2769367764915405, + "ewc_loss": 0.06074743717908859, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002778845082502812, + "grad_norm": 7.014571189880371, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8605763912200928, + "num_tokens": 383075981.0, + "step": 10038 + }, + { + "epoch": 1.277063986770131, + "ewc_loss": 0.06090812385082245, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002794913889374584, + "grad_norm": 7.015127182006836, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8786053657531738, + "num_tokens": 383111984.0, + "step": 10039 + }, + { + "epoch": 1.2771911970487215, + "ewc_loss": 0.06079703941941261, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002783805539365858, + "grad_norm": 6.950994491577148, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8640369772911072, + "num_tokens": 383155323.0, + "step": 10040 + }, + { + "epoch": 1.277318407327312, + "ewc_loss": 0.06094511225819588, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000279861269518733, + "grad_norm": 7.0950846672058105, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8598687052726746, + "num_tokens": 383190069.0, + "step": 10041 + }, + { + "epoch": 1.2774456176059026, + "ewc_loss": 0.060815345495939255, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027856361703015864, + "grad_norm": 7.0178937911987305, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8539818525314331, + "num_tokens": 383224999.0, + "step": 10042 + }, + { + "epoch": 1.2775728278844931, + "ewc_loss": 0.06092333048582077, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027964345645159483, + "grad_norm": 7.019643306732178, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8556322455406189, + "num_tokens": 383267764.0, + "step": 10043 + }, + { + "epoch": 1.2777000381630836, + "ewc_loss": 0.06089453399181366, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027935547404922545, + "grad_norm": 7.06712007522583, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8724188804626465, + "num_tokens": 383306148.0, + "step": 10044 + }, + { + "epoch": 1.2778272484416742, + "ewc_loss": 0.06080878525972366, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002784979878924787, + "grad_norm": 7.019266128540039, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8662440776824951, + "num_tokens": 383343408.0, + "step": 10045 + }, + { + "epoch": 1.2779544587202647, + "ewc_loss": 0.060817085206508636, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027858102112077177, + "grad_norm": 6.969326972961426, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8858051896095276, + "num_tokens": 383383626.0, + "step": 10046 + }, + { + "epoch": 1.2780816689988552, + "ewc_loss": 0.060860708355903625, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002790172293316573, + "grad_norm": 7.046567440032959, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8462868928909302, + "num_tokens": 383423218.0, + "step": 10047 + }, + { + "epoch": 1.2782088792774458, + "ewc_loss": 0.060714490711688995, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027755508199334145, + "grad_norm": 6.982909679412842, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8703221082687378, + "num_tokens": 383461126.0, + "step": 10048 + }, + { + "epoch": 1.278336089556036, + "ewc_loss": 0.06089743599295616, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027938451967202127, + "grad_norm": 7.042511463165283, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8547350764274597, + "num_tokens": 383501808.0, + "step": 10049 + }, + { + "epoch": 1.2784632998346266, + "ewc_loss": 0.06076692044734955, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002780793874990195, + "grad_norm": 6.959230422973633, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.882847249507904, + "num_tokens": 383542395.0, + "step": 10050 + }, + { + "epoch": 1.2785905101132171, + "ewc_loss": 0.06099916249513626, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002804017858579755, + "grad_norm": 7.048671245574951, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8587634563446045, + "num_tokens": 383579968.0, + "step": 10051 + }, + { + "epoch": 1.2787177203918076, + "ewc_loss": 0.060789816081523895, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027830831822939217, + "grad_norm": 6.94907283782959, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.863319993019104, + "num_tokens": 383617861.0, + "step": 10052 + }, + { + "epoch": 1.2788449306703982, + "ewc_loss": 0.061053596436977386, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028094614390283823, + "grad_norm": 7.060968399047852, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8568781614303589, + "num_tokens": 383652534.0, + "step": 10053 + }, + { + "epoch": 1.2789721409489887, + "ewc_loss": 0.060832004994153976, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000278730207355693, + "grad_norm": 6.956608772277832, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8575905561447144, + "num_tokens": 383693120.0, + "step": 10054 + }, + { + "epoch": 1.2790993512275792, + "ewc_loss": 0.061018578708171844, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805959666147828, + "grad_norm": 7.0214643478393555, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8532786965370178, + "num_tokens": 383734958.0, + "step": 10055 + }, + { + "epoch": 1.2792265615061698, + "ewc_loss": 0.06094386428594589, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027984881307929754, + "grad_norm": 7.02501916885376, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8533455729484558, + "num_tokens": 383768873.0, + "step": 10056 + }, + { + "epoch": 1.2793537717847603, + "ewc_loss": 0.0610562339425087, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002809724828694016, + "grad_norm": 7.043238639831543, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8642141819000244, + "num_tokens": 383804063.0, + "step": 10057 + }, + { + "epoch": 1.2794809820633506, + "ewc_loss": 0.061005719006061554, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028046732768416405, + "grad_norm": 7.0081658363342285, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8758752942085266, + "num_tokens": 383843098.0, + "step": 10058 + }, + { + "epoch": 1.2796081923419411, + "ewc_loss": 0.06105146184563637, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000280924781691283, + "grad_norm": 7.070313930511475, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8714678287506104, + "num_tokens": 383879443.0, + "step": 10059 + }, + { + "epoch": 1.2797354026205316, + "ewc_loss": 0.06093364953994751, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027974662953056395, + "grad_norm": 6.967514514923096, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8612865209579468, + "num_tokens": 383918071.0, + "step": 10060 + }, + { + "epoch": 1.2798626128991222, + "ewc_loss": 0.061095550656318665, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002813656465150416, + "grad_norm": 7.083090305328369, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.854240357875824, + "num_tokens": 383953052.0, + "step": 10061 + }, + { + "epoch": 1.2799898231777127, + "ewc_loss": 0.06092490255832672, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002796592016238719, + "grad_norm": 6.948538780212402, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8601214289665222, + "num_tokens": 383992835.0, + "step": 10062 + }, + { + "epoch": 1.2801170334563032, + "ewc_loss": 0.061139870434999466, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002818088687490672, + "grad_norm": 7.136900901794434, + "learning_rate": 1e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8492122292518616, + "num_tokens": 384031234.0, + "step": 10063 + }, + { + "epoch": 1.2802442437348938, + "ewc_loss": 0.060907114297151566, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002794812899082899, + "grad_norm": 7.003647327423096, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.852413535118103, + "num_tokens": 384063611.0, + "step": 10064 + }, + { + "epoch": 1.2803714540134843, + "ewc_loss": 0.0611182376742363, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028159251087345183, + "grad_norm": 7.137358665466309, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8707430958747864, + "num_tokens": 384106757.0, + "step": 10065 + }, + { + "epoch": 1.2804986642920748, + "ewc_loss": 0.06080006808042526, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027841085102409124, + "grad_norm": 6.975607395172119, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8653057813644409, + "num_tokens": 384143094.0, + "step": 10066 + }, + { + "epoch": 1.2806258745706653, + "ewc_loss": 0.061097003519535065, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028138019843026996, + "grad_norm": 7.083096504211426, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8620907068252563, + "num_tokens": 384181498.0, + "step": 10067 + }, + { + "epoch": 1.2807530848492559, + "ewc_loss": 0.06084678694605827, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027887802571058273, + "grad_norm": 7.100348472595215, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.86562579870224, + "num_tokens": 384209784.0, + "step": 10068 + }, + { + "epoch": 1.2808802951278464, + "ewc_loss": 0.06089114397764206, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002793215971905738, + "grad_norm": 7.039261817932129, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.866899847984314, + "num_tokens": 384248000.0, + "step": 10069 + }, + { + "epoch": 1.281007505406437, + "ewc_loss": 0.06095672398805618, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027997742290608585, + "grad_norm": 6.983355522155762, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8535187840461731, + "num_tokens": 384290897.0, + "step": 10070 + }, + { + "epoch": 1.2811347156850275, + "ewc_loss": 0.06108451634645462, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002812553138937801, + "grad_norm": 7.200315952301025, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8545993566513062, + "num_tokens": 384329464.0, + "step": 10071 + }, + { + "epoch": 1.281261925963618, + "ewc_loss": 0.06077081337571144, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027811829932034016, + "grad_norm": 6.9550347328186035, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.865186333656311, + "num_tokens": 384368915.0, + "step": 10072 + }, + { + "epoch": 1.2813891362422083, + "ewc_loss": 0.061103180050849915, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028144195675849915, + "grad_norm": 7.080085754394531, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8721853494644165, + "num_tokens": 384407445.0, + "step": 10073 + }, + { + "epoch": 1.2815163465207988, + "ewc_loss": 0.060803115367889404, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027844132273457944, + "grad_norm": 7.04965353012085, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8576443195343018, + "num_tokens": 384444384.0, + "step": 10074 + }, + { + "epoch": 1.2816435567993893, + "ewc_loss": 0.061001427471637726, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002804244577419013, + "grad_norm": 7.112825870513916, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8580999970436096, + "num_tokens": 384479670.0, + "step": 10075 + }, + { + "epoch": 1.2817707670779799, + "ewc_loss": 0.060771964490413666, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000278129824437201, + "grad_norm": 6.954698085784912, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8505594730377197, + "num_tokens": 384522436.0, + "step": 10076 + }, + { + "epoch": 1.2818979773565704, + "ewc_loss": 0.060971297323703766, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028012311668135226, + "grad_norm": 7.105534076690674, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8547253012657166, + "num_tokens": 384559754.0, + "step": 10077 + }, + { + "epoch": 1.282025187635161, + "ewc_loss": 0.060723356902599335, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027764373226091266, + "grad_norm": 6.989113807678223, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8582030534744263, + "num_tokens": 384595153.0, + "step": 10078 + }, + { + "epoch": 1.2821523979137515, + "ewc_loss": 0.06096236780285835, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000280033826129511, + "grad_norm": 7.085995197296143, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8609491586685181, + "num_tokens": 384628345.0, + "step": 10079 + }, + { + "epoch": 1.282279608192342, + "ewc_loss": 0.06077929586172104, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002782031078822911, + "grad_norm": 7.032454013824463, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8572498559951782, + "num_tokens": 384666725.0, + "step": 10080 + }, + { + "epoch": 1.2824068184709325, + "ewc_loss": 0.06075393781065941, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027794952620752156, + "grad_norm": 7.071550369262695, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8525483012199402, + "num_tokens": 384706217.0, + "step": 10081 + }, + { + "epoch": 1.2825340287495228, + "ewc_loss": 0.06080106645822525, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027842080453410745, + "grad_norm": 7.0422258377075195, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8582791090011597, + "num_tokens": 384741835.0, + "step": 10082 + }, + { + "epoch": 1.2826612390281134, + "ewc_loss": 0.06076963245868683, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027810645406134427, + "grad_norm": 7.009498596191406, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8624781370162964, + "num_tokens": 384777273.0, + "step": 10083 + }, + { + "epoch": 1.2827884493067039, + "ewc_loss": 0.06075152009725571, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027792537002824247, + "grad_norm": 6.998848915100098, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8746753334999084, + "num_tokens": 384814950.0, + "step": 10084 + }, + { + "epoch": 1.2829156595852944, + "ewc_loss": 0.0608755424618721, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027916557155549526, + "grad_norm": 7.108249664306641, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8542444705963135, + "num_tokens": 384848088.0, + "step": 10085 + }, + { + "epoch": 1.283042869863885, + "ewc_loss": 0.060787178575992584, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002782819210551679, + "grad_norm": 7.050565242767334, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8534222841262817, + "num_tokens": 384887361.0, + "step": 10086 + }, + { + "epoch": 1.2831700801424755, + "ewc_loss": 0.06075892597436905, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002779994101729244, + "grad_norm": 6.9683709144592285, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8677663803100586, + "num_tokens": 384930459.0, + "step": 10087 + }, + { + "epoch": 1.283297290421066, + "ewc_loss": 0.06085583195090294, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027896848041564226, + "grad_norm": 7.009285926818848, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8560646772384644, + "num_tokens": 384969506.0, + "step": 10088 + }, + { + "epoch": 1.2834245006996565, + "ewc_loss": 0.06087851524353027, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027919531567022204, + "grad_norm": 7.028337478637695, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8683582544326782, + "num_tokens": 385004368.0, + "step": 10089 + }, + { + "epoch": 1.283551710978247, + "ewc_loss": 0.06080131232738495, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027842327835969627, + "grad_norm": 6.985256671905518, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8632051944732666, + "num_tokens": 385044553.0, + "step": 10090 + }, + { + "epoch": 1.2836789212568376, + "ewc_loss": 0.06092941761016846, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002797043416649103, + "grad_norm": 7.020333766937256, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8746833801269531, + "num_tokens": 385076873.0, + "step": 10091 + }, + { + "epoch": 1.283806131535428, + "ewc_loss": 0.06084586679935455, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002788688289001584, + "grad_norm": 6.985439777374268, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8646371364593506, + "num_tokens": 385113969.0, + "step": 10092 + }, + { + "epoch": 1.2839333418140186, + "ewc_loss": 0.06109738349914551, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002789426071103662, + "grad_norm": 7.282678604125977, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8726963400840759, + "num_tokens": 385149460.0, + "step": 10093 + }, + { + "epoch": 1.2840605520926092, + "ewc_loss": 0.06068509817123413, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002772611624095589, + "grad_norm": 6.943161487579346, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8450047373771667, + "num_tokens": 385187240.0, + "step": 10094 + }, + { + "epoch": 1.2841877623711997, + "ewc_loss": 0.061025410890579224, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028066427330486476, + "grad_norm": 7.066690921783447, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8547562956809998, + "num_tokens": 385225532.0, + "step": 10095 + }, + { + "epoch": 1.2843149726497902, + "ewc_loss": 0.0606348030269146, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002767581900116056, + "grad_norm": 6.9763946533203125, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8641179203987122, + "num_tokens": 385267404.0, + "step": 10096 + }, + { + "epoch": 1.2844421829283807, + "ewc_loss": 0.06098226457834244, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002802328090183437, + "grad_norm": 7.014108657836914, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8503447771072388, + "num_tokens": 385314008.0, + "step": 10097 + }, + { + "epoch": 1.284569393206971, + "ewc_loss": 0.06083459034562111, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027875605155713856, + "grad_norm": 7.065777778625488, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8487322926521301, + "num_tokens": 385352204.0, + "step": 10098 + }, + { + "epoch": 1.2846966034855616, + "ewc_loss": 0.06074223667383194, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002778325288090855, + "grad_norm": 6.946354866027832, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8536784052848816, + "num_tokens": 385393307.0, + "step": 10099 + }, + { + "epoch": 1.2848238137641521, + "ewc_loss": 0.06099560856819153, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028036622097715735, + "grad_norm": 7.081806182861328, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.862195611000061, + "num_tokens": 385426124.0, + "step": 10100 + }, + { + "epoch": 1.2849510240427426, + "ewc_loss": 0.06073937937617302, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000277803948847577, + "grad_norm": 6.9368181228637695, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8665168285369873, + "num_tokens": 385467471.0, + "step": 10101 + }, + { + "epoch": 1.2850782343213332, + "ewc_loss": 0.060957517474889755, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002799853391479701, + "grad_norm": 7.050205230712891, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8533169031143188, + "num_tokens": 385508780.0, + "step": 10102 + }, + { + "epoch": 1.2852054445999237, + "ewc_loss": 0.06078258901834488, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002782360534183681, + "grad_norm": 6.930047512054443, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8675033450126648, + "num_tokens": 385552646.0, + "step": 10103 + }, + { + "epoch": 1.2853326548785142, + "ewc_loss": 0.0610470212996006, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000280880369246006, + "grad_norm": 7.111057281494141, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8491514325141907, + "num_tokens": 385586676.0, + "step": 10104 + }, + { + "epoch": 1.2854598651571048, + "ewc_loss": 0.060808680951595306, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002784969692584127, + "grad_norm": 6.941314697265625, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8491802215576172, + "num_tokens": 385623201.0, + "step": 10105 + }, + { + "epoch": 1.2855870754356953, + "ewc_loss": 0.061043061316013336, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028084078803658485, + "grad_norm": 7.048879146575928, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8737659454345703, + "num_tokens": 385662716.0, + "step": 10106 + }, + { + "epoch": 1.2857142857142856, + "ewc_loss": 0.06149189919233322, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00028044634382240474, + "grad_norm": 9.422411918640137, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8511343598365784, + "num_tokens": 385706986.0, + "step": 10107 + }, + { + "epoch": 1.2858414959928761, + "ewc_loss": 0.06236176937818527, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002940278500318527, + "grad_norm": 7.061070919036865, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8613670468330383, + "num_tokens": 385744240.0, + "step": 10108 + }, + { + "epoch": 1.2859687062714666, + "ewc_loss": 0.06178923323750496, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028830248629674315, + "grad_norm": 7.385656833648682, + "learning_rate": 1e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8460025191307068, + "num_tokens": 385778583.0, + "step": 10109 + }, + { + "epoch": 1.2860959165500572, + "ewc_loss": 0.06086607649922371, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027907092589884996, + "grad_norm": 6.987195014953613, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8618441820144653, + "num_tokens": 385819910.0, + "step": 10110 + }, + { + "epoch": 1.2862231268286477, + "ewc_loss": 0.06190820783376694, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028949225088581443, + "grad_norm": 7.230812072753906, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8595227003097534, + "num_tokens": 385860068.0, + "step": 10111 + }, + { + "epoch": 1.2863503371072382, + "ewc_loss": 0.061352476477622986, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000281493499642238, + "grad_norm": 7.98766565322876, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8520172834396362, + "num_tokens": 385890460.0, + "step": 10112 + }, + { + "epoch": 1.2864775473858288, + "ewc_loss": 0.06094799190759659, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027989005320705473, + "grad_norm": 6.942763328552246, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.857771635055542, + "num_tokens": 385926174.0, + "step": 10113 + }, + { + "epoch": 1.2866047576644193, + "ewc_loss": 0.06166549772024155, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002870651369448751, + "grad_norm": 7.351161956787109, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8716857433319092, + "num_tokens": 385962928.0, + "step": 10114 + }, + { + "epoch": 1.2867319679430098, + "ewc_loss": 0.06063040345907211, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002767142141237855, + "grad_norm": 6.856634616851807, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8703680038452148, + "num_tokens": 386002509.0, + "step": 10115 + }, + { + "epoch": 1.2868591782216003, + "ewc_loss": 0.06168206408619881, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028723079594783485, + "grad_norm": 7.219868183135986, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8613762855529785, + "num_tokens": 386043304.0, + "step": 10116 + }, + { + "epoch": 1.2869863885001909, + "ewc_loss": 0.06075640767812729, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027797420625574887, + "grad_norm": 6.943914890289307, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8688102960586548, + "num_tokens": 386081607.0, + "step": 10117 + }, + { + "epoch": 1.2871135987787814, + "ewc_loss": 0.06147380918264389, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002851482422556728, + "grad_norm": 7.1858811378479, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.879611611366272, + "num_tokens": 386117762.0, + "step": 10118 + }, + { + "epoch": 1.287240809057372, + "ewc_loss": 0.061009373515844345, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805038820952177, + "grad_norm": 7.017431735992432, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8543285131454468, + "num_tokens": 386155908.0, + "step": 10119 + }, + { + "epoch": 1.2873680193359625, + "ewc_loss": 0.06121613085269928, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002825714473146945, + "grad_norm": 7.10969877243042, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8595221042633057, + "num_tokens": 386199385.0, + "step": 10120 + }, + { + "epoch": 1.287495229614553, + "ewc_loss": 0.06097285449504852, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002801386872306466, + "grad_norm": 7.0794572830200195, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8566572666168213, + "num_tokens": 386231892.0, + "step": 10121 + }, + { + "epoch": 1.2876224398931433, + "ewc_loss": 0.06100302189588547, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002804403775371611, + "grad_norm": 7.052893161773682, + "learning_rate": 1e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.844485878944397, + "num_tokens": 386269657.0, + "step": 10122 + }, + { + "epoch": 1.2877496501717338, + "ewc_loss": 0.06096835434436798, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002800936927087605, + "grad_norm": 7.038149356842041, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8664106130599976, + "num_tokens": 386306936.0, + "step": 10123 + }, + { + "epoch": 1.2878768604503243, + "ewc_loss": 0.060955822467803955, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002799683716148138, + "grad_norm": 6.9962005615234375, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8614509105682373, + "num_tokens": 386342260.0, + "step": 10124 + }, + { + "epoch": 1.2880040707289149, + "ewc_loss": 0.06095492094755173, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002799593785312027, + "grad_norm": 7.040497303009033, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.85782790184021, + "num_tokens": 386381784.0, + "step": 10125 + }, + { + "epoch": 1.2881312810075054, + "ewc_loss": 0.060908637940883636, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002794965112116188, + "grad_norm": 6.98022985458374, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8600701093673706, + "num_tokens": 386422350.0, + "step": 10126 + }, + { + "epoch": 1.288258491286096, + "ewc_loss": 0.06099045276641846, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028031470719724894, + "grad_norm": 7.039935111999512, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8577245473861694, + "num_tokens": 386467951.0, + "step": 10127 + }, + { + "epoch": 1.2883857015646865, + "ewc_loss": 0.06083574518561363, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002787676057778299, + "grad_norm": 6.960978031158447, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8697304725646973, + "num_tokens": 386511896.0, + "step": 10128 + }, + { + "epoch": 1.288512911843277, + "ewc_loss": 0.06108979880809784, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028130816644988954, + "grad_norm": 7.085574150085449, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8495205640792847, + "num_tokens": 386550704.0, + "step": 10129 + }, + { + "epoch": 1.2886401221218675, + "ewc_loss": 0.06085371598601341, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002789473219309002, + "grad_norm": 6.986970901489258, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.845282793045044, + "num_tokens": 386589295.0, + "step": 10130 + }, + { + "epoch": 1.2887673324004578, + "ewc_loss": 0.06105111539363861, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028092131833545864, + "grad_norm": 7.154961109161377, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8615672588348389, + "num_tokens": 386623462.0, + "step": 10131 + }, + { + "epoch": 1.2888945426790483, + "ewc_loss": 0.06080811098217964, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002784912649076432, + "grad_norm": 6.944865703582764, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8644942045211792, + "num_tokens": 386663036.0, + "step": 10132 + }, + { + "epoch": 1.2890217529576389, + "ewc_loss": 0.06109708920121193, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002813810424413532, + "grad_norm": 7.088624477386475, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8621311187744141, + "num_tokens": 386702530.0, + "step": 10133 + }, + { + "epoch": 1.2891489632362294, + "ewc_loss": 0.060806214809417725, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027847231831401587, + "grad_norm": 6.967629432678223, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8760632276535034, + "num_tokens": 386739758.0, + "step": 10134 + }, + { + "epoch": 1.28927617351482, + "ewc_loss": 0.061064958572387695, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002810597652569413, + "grad_norm": 7.078217506408691, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8713043332099915, + "num_tokens": 386778287.0, + "step": 10135 + }, + { + "epoch": 1.2894033837934105, + "ewc_loss": 0.06085183471441269, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027892852085642517, + "grad_norm": 7.041167736053467, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8720402717590332, + "num_tokens": 386812348.0, + "step": 10136 + }, + { + "epoch": 1.289530594072001, + "ewc_loss": 0.0610496923327446, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002809070865623653, + "grad_norm": 7.0055694580078125, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8689931631088257, + "num_tokens": 386855540.0, + "step": 10137 + }, + { + "epoch": 1.2896578043505915, + "ewc_loss": 0.06094890832901001, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027989925001747906, + "grad_norm": 7.03616189956665, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8568764925003052, + "num_tokens": 386893775.0, + "step": 10138 + }, + { + "epoch": 1.289785014629182, + "ewc_loss": 0.06096706539392471, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028008079971186817, + "grad_norm": 7.210879325866699, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8616139888763428, + "num_tokens": 386929553.0, + "step": 10139 + }, + { + "epoch": 1.2899122249077726, + "ewc_loss": 0.060888394713401794, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027929412317462265, + "grad_norm": 7.021261215209961, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8557046055793762, + "num_tokens": 386968033.0, + "step": 10140 + }, + { + "epoch": 1.290039435186363, + "ewc_loss": 0.06096179038286209, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028002806357108057, + "grad_norm": 7.101641654968262, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8606447577476501, + "num_tokens": 387004967.0, + "step": 10141 + }, + { + "epoch": 1.2901666454649536, + "ewc_loss": 0.06079889461398125, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002783990930765867, + "grad_norm": 6.968071937561035, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.84992516040802, + "num_tokens": 387038086.0, + "step": 10142 + }, + { + "epoch": 1.2902938557435442, + "ewc_loss": 0.06100407615303993, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028045091312378645, + "grad_norm": 7.134272575378418, + "learning_rate": 1e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8368358612060547, + "num_tokens": 387077112.0, + "step": 10143 + }, + { + "epoch": 1.2904210660221347, + "ewc_loss": 0.06075058877468109, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027791602769866586, + "grad_norm": 6.938828468322754, + "learning_rate": 1e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8442696332931519, + "num_tokens": 387118800.0, + "step": 10144 + }, + { + "epoch": 1.2905482763007252, + "ewc_loss": 0.06106305494904518, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028104070224799216, + "grad_norm": 7.135390281677246, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8558179140090942, + "num_tokens": 387160321.0, + "step": 10145 + }, + { + "epoch": 1.2906754865793157, + "ewc_loss": 0.060837604105472565, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027878620312549174, + "grad_norm": 6.9722371101379395, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8585460186004639, + "num_tokens": 387199387.0, + "step": 10146 + }, + { + "epoch": 1.290802696857906, + "ewc_loss": 0.061129484325647354, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817049971781671, + "grad_norm": 7.149659633636475, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8668568134307861, + "num_tokens": 387238859.0, + "step": 10147 + }, + { + "epoch": 1.2909299071364966, + "ewc_loss": 0.06081542372703552, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027856440283358097, + "grad_norm": 6.970537185668945, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8674549460411072, + "num_tokens": 387272149.0, + "step": 10148 + }, + { + "epoch": 1.291057117415087, + "ewc_loss": 0.061389610171318054, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028186486451886594, + "grad_norm": 7.111813068389893, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8595931529998779, + "num_tokens": 387311617.0, + "step": 10149 + }, + { + "epoch": 1.2911843276936776, + "ewc_loss": 0.061085961759090424, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00027882837457582355, + "grad_norm": 6.978272914886475, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8481533527374268, + "num_tokens": 387344281.0, + "step": 10150 + }, + { + "epoch": 1.2913115379722682, + "ewc_loss": 0.06124447286128998, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028041345649398863, + "grad_norm": 7.060569763183594, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.849557638168335, + "num_tokens": 387385959.0, + "step": 10151 + }, + { + "epoch": 1.2914387482508587, + "ewc_loss": 0.06108362227678299, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00027880497509613633, + "grad_norm": 6.937504291534424, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8567423820495605, + "num_tokens": 387426197.0, + "step": 10152 + }, + { + "epoch": 1.2915659585294492, + "ewc_loss": 0.06106923520565033, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002811024896800518, + "grad_norm": 7.037480354309082, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8670172691345215, + "num_tokens": 387462250.0, + "step": 10153 + }, + { + "epoch": 1.2916931688080397, + "ewc_loss": 0.060887910425662994, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002792892337311059, + "grad_norm": 7.005616188049316, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8603396415710449, + "num_tokens": 387499903.0, + "step": 10154 + }, + { + "epoch": 1.2918203790866303, + "ewc_loss": 0.0609721913933754, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002801320515573025, + "grad_norm": 7.034905910491943, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8695303201675415, + "num_tokens": 387535361.0, + "step": 10155 + }, + { + "epoch": 1.2919475893652206, + "ewc_loss": 0.06097477301955223, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028015789575874805, + "grad_norm": 7.037896633148193, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8585712313652039, + "num_tokens": 387574300.0, + "step": 10156 + }, + { + "epoch": 1.2920747996438111, + "ewc_loss": 0.06092998385429382, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002797099878080189, + "grad_norm": 7.041444301605225, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8619346022605896, + "num_tokens": 387606860.0, + "step": 10157 + }, + { + "epoch": 1.2922020099224016, + "ewc_loss": 0.060894422233104706, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027935439720749855, + "grad_norm": 7.022332668304443, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8505812883377075, + "num_tokens": 387646519.0, + "step": 10158 + }, + { + "epoch": 1.2923292202009922, + "ewc_loss": 0.06101804971694946, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805906697176397, + "grad_norm": 7.080690860748291, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8725378513336182, + "num_tokens": 387681100.0, + "step": 10159 + }, + { + "epoch": 1.2924564304795827, + "ewc_loss": 0.06083197146654129, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027872988721355796, + "grad_norm": 6.9536967277526855, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8826875686645508, + "num_tokens": 387715962.0, + "step": 10160 + }, + { + "epoch": 1.2925836407581732, + "ewc_loss": 0.06105932593345642, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002810034202411771, + "grad_norm": 7.055084228515625, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8595430850982666, + "num_tokens": 387755856.0, + "step": 10161 + }, + { + "epoch": 1.2927108510367638, + "ewc_loss": 0.06084303930401802, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027884053997695446, + "grad_norm": 6.998526573181152, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8691567182540894, + "num_tokens": 387793256.0, + "step": 10162 + }, + { + "epoch": 1.2928380613153543, + "ewc_loss": 0.06103215366601944, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000280731706880033, + "grad_norm": 7.038349628448486, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8557159304618835, + "num_tokens": 387834696.0, + "step": 10163 + }, + { + "epoch": 1.2929652715939448, + "ewc_loss": 0.06090614199638367, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002794715983327478, + "grad_norm": 7.089192867279053, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8493311405181885, + "num_tokens": 387874973.0, + "step": 10164 + }, + { + "epoch": 1.2930924818725353, + "ewc_loss": 0.060918182134628296, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002795920008793473, + "grad_norm": 7.102184772491455, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8496359586715698, + "num_tokens": 387904887.0, + "step": 10165 + }, + { + "epoch": 1.2932196921511259, + "ewc_loss": 0.06090493127703667, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027945946203544736, + "grad_norm": 6.980656147003174, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.867515504360199, + "num_tokens": 387944676.0, + "step": 10166 + }, + { + "epoch": 1.2933469024297164, + "ewc_loss": 0.06100878864526749, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028049806132912636, + "grad_norm": 7.056859970092773, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8611337542533875, + "num_tokens": 387982234.0, + "step": 10167 + }, + { + "epoch": 1.293474112708307, + "ewc_loss": 0.060929328203201294, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027970343944616616, + "grad_norm": 6.9935221672058105, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.861883282661438, + "num_tokens": 388018001.0, + "step": 10168 + }, + { + "epoch": 1.2936013229868975, + "ewc_loss": 0.06130751222372055, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002810438454616815, + "grad_norm": 7.025839805603027, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8562669157981873, + "num_tokens": 388057476.0, + "step": 10169 + }, + { + "epoch": 1.293728533265488, + "ewc_loss": 0.06119435280561447, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002799122594296932, + "grad_norm": 6.9807820320129395, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8798816204071045, + "num_tokens": 388095351.0, + "step": 10170 + }, + { + "epoch": 1.2938557435440783, + "ewc_loss": 0.06130650267004967, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028103377553634346, + "grad_norm": 7.050433158874512, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8621657490730286, + "num_tokens": 388135366.0, + "step": 10171 + }, + { + "epoch": 1.2939829538226688, + "ewc_loss": 0.061238158494234085, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000280350330285728, + "grad_norm": 7.034360408782959, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8497415781021118, + "num_tokens": 388175003.0, + "step": 10172 + }, + { + "epoch": 1.2941101641012593, + "ewc_loss": 0.06123485788702965, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028031732654199004, + "grad_norm": 7.080709457397461, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8433059453964233, + "num_tokens": 388207083.0, + "step": 10173 + }, + { + "epoch": 1.2942373743798499, + "ewc_loss": 0.06118054315447807, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002797741908580065, + "grad_norm": 6.976303577423096, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8763867616653442, + "num_tokens": 388245038.0, + "step": 10174 + }, + { + "epoch": 1.2943645846584404, + "ewc_loss": 0.061343517154455185, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002814039180520922, + "grad_norm": 7.028363227844238, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8578862547874451, + "num_tokens": 388285035.0, + "step": 10175 + }, + { + "epoch": 1.294491794937031, + "ewc_loss": 0.06118108332157135, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000279779575066641, + "grad_norm": 6.947462558746338, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8677159547805786, + "num_tokens": 388323971.0, + "step": 10176 + }, + { + "epoch": 1.2946190052156215, + "ewc_loss": 0.061177246272563934, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028218262013979256, + "grad_norm": 7.087222576141357, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8570917844772339, + "num_tokens": 388362256.0, + "step": 10177 + }, + { + "epoch": 1.294746215494212, + "ewc_loss": 0.06086786463856697, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002790887956507504, + "grad_norm": 7.007680416107178, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8822073936462402, + "num_tokens": 388401120.0, + "step": 10178 + }, + { + "epoch": 1.2948734257728025, + "ewc_loss": 0.06111576408147812, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002815677726175636, + "grad_norm": 7.00567102432251, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8643749952316284, + "num_tokens": 388446833.0, + "step": 10179 + }, + { + "epoch": 1.2950006360513928, + "ewc_loss": 0.06121199578046799, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002800887159537524, + "grad_norm": 7.051092147827148, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.873680830001831, + "num_tokens": 388482266.0, + "step": 10180 + }, + { + "epoch": 1.2951278463299833, + "ewc_loss": 0.060958489775657654, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027999503072351217, + "grad_norm": 7.097527027130127, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.862910807132721, + "num_tokens": 388518024.0, + "step": 10181 + }, + { + "epoch": 1.2952550566085739, + "ewc_loss": 0.06093921512365341, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002798023051582277, + "grad_norm": 6.939829349517822, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8636505603790283, + "num_tokens": 388564246.0, + "step": 10182 + }, + { + "epoch": 1.2953822668871644, + "ewc_loss": 0.06100371479988098, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002804473042488098, + "grad_norm": 7.0291242599487305, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8649725914001465, + "num_tokens": 388603556.0, + "step": 10183 + }, + { + "epoch": 1.295509477165755, + "ewc_loss": 0.06101357191801071, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028054587892256677, + "grad_norm": 6.993535041809082, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8703869581222534, + "num_tokens": 388643615.0, + "step": 10184 + }, + { + "epoch": 1.2956366874443455, + "ewc_loss": 0.06107427924871445, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002811529557220638, + "grad_norm": 7.06974983215332, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8654844760894775, + "num_tokens": 388682917.0, + "step": 10185 + }, + { + "epoch": 1.295763897722936, + "ewc_loss": 0.06093586981296539, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027976883575320244, + "grad_norm": 7.037128925323486, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8640009164810181, + "num_tokens": 388720269.0, + "step": 10186 + }, + { + "epoch": 1.2958911080015265, + "ewc_loss": 0.061006128787994385, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002804714604280889, + "grad_norm": 7.061922073364258, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8543670773506165, + "num_tokens": 388759245.0, + "step": 10187 + }, + { + "epoch": 1.296018318280117, + "ewc_loss": 0.06094964221119881, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027990658418275416, + "grad_norm": 7.007715225219727, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8573172092437744, + "num_tokens": 388795673.0, + "step": 10188 + }, + { + "epoch": 1.2961455285587076, + "ewc_loss": 0.060967572033405304, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028008586377836764, + "grad_norm": 7.109530448913574, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8664462566375732, + "num_tokens": 388832926.0, + "step": 10189 + }, + { + "epoch": 1.296272738837298, + "ewc_loss": 0.060803577303886414, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027844595024362206, + "grad_norm": 6.977863788604736, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8656005859375, + "num_tokens": 388869767.0, + "step": 10190 + }, + { + "epoch": 1.2963999491158886, + "ewc_loss": 0.061029501259326935, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002807051641866565, + "grad_norm": 7.082518100738525, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8646534085273743, + "num_tokens": 388913347.0, + "step": 10191 + }, + { + "epoch": 1.2965271593944792, + "ewc_loss": 0.06081920117139816, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002786021796055138, + "grad_norm": 7.004099369049072, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.873741865158081, + "num_tokens": 388948211.0, + "step": 10192 + }, + { + "epoch": 1.2966543696730697, + "ewc_loss": 0.06096388399600983, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002800490183290094, + "grad_norm": 7.067826747894287, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8676879405975342, + "num_tokens": 388989198.0, + "step": 10193 + }, + { + "epoch": 1.2967815799516602, + "ewc_loss": 0.06085328757762909, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002789430145639926, + "grad_norm": 6.98871374130249, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8538590669631958, + "num_tokens": 389029076.0, + "step": 10194 + }, + { + "epoch": 1.2969087902302507, + "ewc_loss": 0.060987211763858795, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002802822564262897, + "grad_norm": 7.1635260581970215, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8629566431045532, + "num_tokens": 389066751.0, + "step": 10195 + }, + { + "epoch": 1.297036000508841, + "ewc_loss": 0.06090576946735382, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002794678439386189, + "grad_norm": 7.037903785705566, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.85966557264328, + "num_tokens": 389104207.0, + "step": 10196 + }, + { + "epoch": 1.2971632107874316, + "ewc_loss": 0.06098775938153267, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000280287757050246, + "grad_norm": 7.073019504547119, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8648084998130798, + "num_tokens": 389141941.0, + "step": 10197 + }, + { + "epoch": 1.297290421066022, + "ewc_loss": 0.06095768138766289, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027998696896247566, + "grad_norm": 7.021746635437012, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8736094832420349, + "num_tokens": 389185326.0, + "step": 10198 + }, + { + "epoch": 1.2974176313446126, + "ewc_loss": 0.06105370447039604, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028094719164073467, + "grad_norm": 7.129336357116699, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8714293837547302, + "num_tokens": 389221512.0, + "step": 10199 + }, + { + "epoch": 1.2975448416232032, + "ewc_loss": 0.06082913279533386, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00027870151097886264, + "grad_norm": 6.989997386932373, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8705954551696777, + "num_tokens": 389259388.0, + "step": 10200 + }, + { + "epoch": 1.2976720519017937, + "ewc_loss": 0.06098482757806778, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028025844949297607, + "grad_norm": 7.1086249351501465, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8502957224845886, + "num_tokens": 389298773.0, + "step": 10201 + }, + { + "epoch": 1.2977992621803842, + "ewc_loss": 0.06094629317522049, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002798731147777289, + "grad_norm": 7.009312629699707, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8633977174758911, + "num_tokens": 389339680.0, + "step": 10202 + }, + { + "epoch": 1.2979264724589747, + "ewc_loss": 0.06094977259635925, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002799078938551247, + "grad_norm": 7.088314056396484, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8656110167503357, + "num_tokens": 389380755.0, + "step": 10203 + }, + { + "epoch": 1.2980536827375653, + "ewc_loss": 0.0610157772898674, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805679105222225, + "grad_norm": 7.1399455070495605, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8659849166870117, + "num_tokens": 389412031.0, + "step": 10204 + }, + { + "epoch": 1.2981808930161556, + "ewc_loss": 0.06086251512169838, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002790353028103709, + "grad_norm": 7.048425197601318, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8732858896255493, + "num_tokens": 389448299.0, + "step": 10205 + }, + { + "epoch": 1.298308103294746, + "ewc_loss": 0.0609935000538826, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002803451498039067, + "grad_norm": 7.126652717590332, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8568158745765686, + "num_tokens": 389483962.0, + "step": 10206 + }, + { + "epoch": 1.2984353135733366, + "ewc_loss": 0.060837116092443466, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000278781313681975, + "grad_norm": 7.034040451049805, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8550440073013306, + "num_tokens": 389521989.0, + "step": 10207 + }, + { + "epoch": 1.2985625238519272, + "ewc_loss": 0.061041951179504395, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002808296412695199, + "grad_norm": 7.043292045593262, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8571180105209351, + "num_tokens": 389558228.0, + "step": 10208 + }, + { + "epoch": 1.2986897341305177, + "ewc_loss": 0.060910627245903015, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002795164182316512, + "grad_norm": 7.030765533447266, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8604464530944824, + "num_tokens": 389597164.0, + "step": 10209 + }, + { + "epoch": 1.2988169444091082, + "ewc_loss": 0.06108120456337929, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028122219373472035, + "grad_norm": 7.032467365264893, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.862758219242096, + "num_tokens": 389636435.0, + "step": 10210 + }, + { + "epoch": 1.2989441546876987, + "ewc_loss": 0.06108761578798294, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000281286338577047, + "grad_norm": 7.086391925811768, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8545548915863037, + "num_tokens": 389670498.0, + "step": 10211 + }, + { + "epoch": 1.2990713649662893, + "ewc_loss": 0.06101308763027191, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805410185828805, + "grad_norm": 7.008022308349609, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8642678260803223, + "num_tokens": 389707433.0, + "step": 10212 + }, + { + "epoch": 1.2991985752448798, + "ewc_loss": 0.06113525480031967, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817627100739628, + "grad_norm": 7.070869445800781, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.863472580909729, + "num_tokens": 389747300.0, + "step": 10213 + }, + { + "epoch": 1.2993257855234703, + "ewc_loss": 0.06096938997507095, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028010408277623355, + "grad_norm": 6.9846320152282715, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8560909032821655, + "num_tokens": 389788381.0, + "step": 10214 + }, + { + "epoch": 1.2994529958020609, + "ewc_loss": 0.06124947592616081, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002829049190040678, + "grad_norm": 7.131308078765869, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8654327392578125, + "num_tokens": 389821792.0, + "step": 10215 + }, + { + "epoch": 1.2995802060806514, + "ewc_loss": 0.06100834533572197, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002804936084430665, + "grad_norm": 7.028100490570068, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8734393119812012, + "num_tokens": 389859370.0, + "step": 10216 + }, + { + "epoch": 1.299707416359242, + "ewc_loss": 0.06113347411155701, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817448985297233, + "grad_norm": 7.065958023071289, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8530962467193604, + "num_tokens": 389897020.0, + "step": 10217 + }, + { + "epoch": 1.2998346266378324, + "ewc_loss": 0.060887642204761505, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002792865561787039, + "grad_norm": 7.063163757324219, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8366415500640869, + "num_tokens": 389927107.0, + "step": 10218 + }, + { + "epoch": 1.299961836916423, + "ewc_loss": 0.06105583906173706, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028096852474845946, + "grad_norm": 7.039880752563477, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8599910736083984, + "num_tokens": 389966323.0, + "step": 10219 + }, + { + "epoch": 1.3000890471950133, + "ewc_loss": 0.061055704951286316, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002809672150760889, + "grad_norm": 7.056172847747803, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8605283498764038, + "num_tokens": 390001416.0, + "step": 10220 + }, + { + "epoch": 1.3002162574736038, + "ewc_loss": 0.06105972081422806, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002810073783621192, + "grad_norm": 6.993842124938965, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8644150495529175, + "num_tokens": 390042326.0, + "step": 10221 + }, + { + "epoch": 1.3003434677521943, + "ewc_loss": 0.06118381768465042, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028224833658896387, + "grad_norm": 7.055275917053223, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8525705933570862, + "num_tokens": 390084443.0, + "step": 10222 + }, + { + "epoch": 1.3004706780307849, + "ewc_loss": 0.06101716682314873, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028058182215318084, + "grad_norm": 6.9732794761657715, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8559325933456421, + "num_tokens": 390126309.0, + "step": 10223 + }, + { + "epoch": 1.3005978883093754, + "ewc_loss": 0.06110987812280655, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028150895377621055, + "grad_norm": 7.127456188201904, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8660882711410522, + "num_tokens": 390161463.0, + "step": 10224 + }, + { + "epoch": 1.300725098587966, + "ewc_loss": 0.061083342880010605, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028124358505010605, + "grad_norm": 7.050694942474365, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8716728687286377, + "num_tokens": 390203221.0, + "step": 10225 + }, + { + "epoch": 1.3008523088665565, + "ewc_loss": 0.06113528087735176, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817629720084369, + "grad_norm": 7.078016757965088, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8786507248878479, + "num_tokens": 390243269.0, + "step": 10226 + }, + { + "epoch": 1.300979519145147, + "ewc_loss": 0.061017319560050964, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002805833355523646, + "grad_norm": 6.97257661819458, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8636226654052734, + "num_tokens": 390282488.0, + "step": 10227 + }, + { + "epoch": 1.3011067294237375, + "ewc_loss": 0.06120879948139191, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000282498134765774, + "grad_norm": 7.088077545166016, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8582322597503662, + "num_tokens": 390320268.0, + "step": 10228 + }, + { + "epoch": 1.3012339397023278, + "ewc_loss": 0.06111443415284157, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028155450127087533, + "grad_norm": 7.0996294021606445, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8516173362731934, + "num_tokens": 390354733.0, + "step": 10229 + }, + { + "epoch": 1.3013611499809183, + "ewc_loss": 0.061096206307411194, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028137219487689435, + "grad_norm": 7.069839000701904, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.861121654510498, + "num_tokens": 390388161.0, + "step": 10230 + }, + { + "epoch": 1.3014883602595089, + "ewc_loss": 0.06109441816806793, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002813543542288244, + "grad_norm": 7.054042816162109, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8722003698348999, + "num_tokens": 390424609.0, + "step": 10231 + }, + { + "epoch": 1.3016155705380994, + "ewc_loss": 0.06105656176805496, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002809757716022432, + "grad_norm": 7.031961917877197, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8796233534812927, + "num_tokens": 390460809.0, + "step": 10232 + }, + { + "epoch": 1.30174278081669, + "ewc_loss": 0.061115462332963943, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028156477492302656, + "grad_norm": 7.062915802001953, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8534522652626038, + "num_tokens": 390500198.0, + "step": 10233 + }, + { + "epoch": 1.3018699910952805, + "ewc_loss": 0.06107844412326813, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028119460330344737, + "grad_norm": 6.974501609802246, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8560612201690674, + "num_tokens": 390545588.0, + "step": 10234 + }, + { + "epoch": 1.301997201373871, + "ewc_loss": 0.0611468181014061, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002818783395923674, + "grad_norm": 7.131188869476318, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8526829481124878, + "num_tokens": 390581681.0, + "step": 10235 + }, + { + "epoch": 1.3021244116524615, + "ewc_loss": 0.0610346719622612, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028075685258954763, + "grad_norm": 6.9860615730285645, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8802430629730225, + "num_tokens": 390619122.0, + "step": 10236 + }, + { + "epoch": 1.302251621931052, + "ewc_loss": 0.06115739792585373, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002819841611199081, + "grad_norm": 7.026066303253174, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8490375280380249, + "num_tokens": 390667974.0, + "step": 10237 + }, + { + "epoch": 1.3023788322096426, + "ewc_loss": 0.06118255853652954, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028223576373420656, + "grad_norm": 7.062942028045654, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8646188974380493, + "num_tokens": 390705188.0, + "step": 10238 + }, + { + "epoch": 1.302506042488233, + "ewc_loss": 0.061102986335754395, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028144000680185854, + "grad_norm": 7.022591590881348, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8717454075813293, + "num_tokens": 390746892.0, + "step": 10239 + }, + { + "epoch": 1.3026332527668236, + "ewc_loss": 0.06120128929615021, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002824230759870261, + "grad_norm": 7.033041000366211, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.877023458480835, + "num_tokens": 390787727.0, + "step": 10240 + }, + { + "epoch": 1.3027604630454142, + "ewc_loss": 0.06113667041063309, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817768545355648, + "grad_norm": 7.269050598144531, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.854378342628479, + "num_tokens": 390823702.0, + "step": 10241 + }, + { + "epoch": 1.3028876733240047, + "ewc_loss": 0.06097950413823128, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028020518948324025, + "grad_norm": 7.005397796630859, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8628076910972595, + "num_tokens": 390860256.0, + "step": 10242 + }, + { + "epoch": 1.3030148836025952, + "ewc_loss": 0.06126970425248146, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002831071906257421, + "grad_norm": 7.054783344268799, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8749104142189026, + "num_tokens": 390901676.0, + "step": 10243 + }, + { + "epoch": 1.3031420938811857, + "ewc_loss": 0.06107896566390991, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002811998128890991, + "grad_norm": 7.06789493560791, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8438463807106018, + "num_tokens": 390936911.0, + "step": 10244 + }, + { + "epoch": 1.303269304159776, + "ewc_loss": 0.06112559512257576, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002816661144606769, + "grad_norm": 7.08021354675293, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8500094413757324, + "num_tokens": 390976185.0, + "step": 10245 + }, + { + "epoch": 1.3033965144383666, + "ewc_loss": 0.06113123148679733, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817224885802716, + "grad_norm": 7.08634614944458, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.861748993396759, + "num_tokens": 391014661.0, + "step": 10246 + }, + { + "epoch": 1.303523724716957, + "ewc_loss": 0.06110852211713791, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002814953913912177, + "grad_norm": 7.035409450531006, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8541023135185242, + "num_tokens": 391056205.0, + "step": 10247 + }, + { + "epoch": 1.3036509349955476, + "ewc_loss": 0.06118680164217949, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000282278168015182, + "grad_norm": 7.125216484069824, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8637242317199707, + "num_tokens": 391092187.0, + "step": 10248 + }, + { + "epoch": 1.3037781452741382, + "ewc_loss": 0.06104803830385208, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028089055558666587, + "grad_norm": 7.013625144958496, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8766646385192871, + "num_tokens": 391130890.0, + "step": 10249 + }, + { + "epoch": 1.3039053555527287, + "ewc_loss": 0.06128443777561188, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002832545433193445, + "grad_norm": 7.122147083282471, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8712805509567261, + "num_tokens": 391172783.0, + "step": 10250 + }, + { + "epoch": 1.3040325658313192, + "ewc_loss": 0.06094657629728317, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002798759378492832, + "grad_norm": 6.992846965789795, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.844307541847229, + "num_tokens": 391215031.0, + "step": 10251 + }, + { + "epoch": 1.3041597761099097, + "ewc_loss": 0.06124737113714218, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002828838478308171, + "grad_norm": 7.124536514282227, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8654431700706482, + "num_tokens": 391251400.0, + "step": 10252 + }, + { + "epoch": 1.3042869863885003, + "ewc_loss": 0.06095926836133003, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002800028305500746, + "grad_norm": 7.037981986999512, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8533307313919067, + "num_tokens": 391285895.0, + "step": 10253 + }, + { + "epoch": 1.3044141966670906, + "ewc_loss": 0.06117340922355652, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028214423218742013, + "grad_norm": 7.078462600708008, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8637828230857849, + "num_tokens": 391328361.0, + "step": 10254 + }, + { + "epoch": 1.304541406945681, + "ewc_loss": 0.06102266162633896, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028063677018508315, + "grad_norm": 7.100204944610596, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8609820008277893, + "num_tokens": 391365019.0, + "step": 10255 + }, + { + "epoch": 1.3046686172242716, + "ewc_loss": 0.061124224215745926, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002816524065565318, + "grad_norm": 7.075163841247559, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8469951152801514, + "num_tokens": 391400407.0, + "step": 10256 + }, + { + "epoch": 1.3047958275028622, + "ewc_loss": 0.061144594103097916, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028185610426589847, + "grad_norm": 7.030650615692139, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8456870913505554, + "num_tokens": 391440660.0, + "step": 10257 + }, + { + "epoch": 1.3049230377814527, + "ewc_loss": 0.06120142340660095, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002824244147632271, + "grad_norm": 7.104106903076172, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8639789819717407, + "num_tokens": 391481096.0, + "step": 10258 + }, + { + "epoch": 1.3050502480600432, + "ewc_loss": 0.0610661655664444, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002810718142427504, + "grad_norm": 7.084772109985352, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8628169894218445, + "num_tokens": 391516356.0, + "step": 10259 + }, + { + "epoch": 1.3051774583386337, + "ewc_loss": 0.06115768849849701, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002819870423991233, + "grad_norm": 7.111092567443848, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.850679874420166, + "num_tokens": 391554691.0, + "step": 10260 + }, + { + "epoch": 1.3053046686172243, + "ewc_loss": 0.06106088310480118, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000281018961686641, + "grad_norm": 7.025048732757568, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8495249152183533, + "num_tokens": 391592473.0, + "step": 10261 + }, + { + "epoch": 1.3054318788958148, + "ewc_loss": 0.0611339695751667, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028174984618090093, + "grad_norm": 7.056468486785889, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8681856393814087, + "num_tokens": 391632091.0, + "step": 10262 + }, + { + "epoch": 1.3055590891744053, + "ewc_loss": 0.06110652536153793, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002814754261635244, + "grad_norm": 7.079861164093018, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8555246591567993, + "num_tokens": 391676121.0, + "step": 10263 + }, + { + "epoch": 1.3056862994529959, + "ewc_loss": 0.06099826842546463, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028039285098202527, + "grad_norm": 7.090209007263184, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8595610857009888, + "num_tokens": 391709664.0, + "step": 10264 + }, + { + "epoch": 1.3058135097315864, + "ewc_loss": 0.06104036420583725, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000280813779681921, + "grad_norm": 7.025205135345459, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8499876260757446, + "num_tokens": 391755828.0, + "step": 10265 + }, + { + "epoch": 1.305940720010177, + "ewc_loss": 0.06116233766078949, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002820335212163627, + "grad_norm": 7.0428786277771, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8779361248016357, + "num_tokens": 391800232.0, + "step": 10266 + }, + { + "epoch": 1.3060679302887674, + "ewc_loss": 0.06111624464392662, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002815726038534194, + "grad_norm": 7.087141513824463, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8640380501747131, + "num_tokens": 391837612.0, + "step": 10267 + }, + { + "epoch": 1.306195140567358, + "ewc_loss": 0.06122547760605812, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028266492881812155, + "grad_norm": 7.15462064743042, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8733119964599609, + "num_tokens": 391871193.0, + "step": 10268 + }, + { + "epoch": 1.3063223508459483, + "ewc_loss": 0.0610530823469162, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028094096342101693, + "grad_norm": 7.075892925262451, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8538757562637329, + "num_tokens": 391908203.0, + "step": 10269 + }, + { + "epoch": 1.3064495611245388, + "ewc_loss": 0.0612693689763546, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028310384368523955, + "grad_norm": 7.192918300628662, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8672935962677002, + "num_tokens": 391938744.0, + "step": 10270 + }, + { + "epoch": 1.3065767714031293, + "ewc_loss": 0.061000242829322815, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028041258337907493, + "grad_norm": 7.05785608291626, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8645589351654053, + "num_tokens": 391976020.0, + "step": 10271 + }, + { + "epoch": 1.3067039816817199, + "ewc_loss": 0.061316318809986115, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002835733466781676, + "grad_norm": 7.099256992340088, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8535492420196533, + "num_tokens": 392018006.0, + "step": 10272 + }, + { + "epoch": 1.3068311919603104, + "ewc_loss": 0.06098096817731857, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028021982870996, + "grad_norm": 7.054655075073242, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8587889075279236, + "num_tokens": 392055358.0, + "step": 10273 + }, + { + "epoch": 1.306958402238901, + "ewc_loss": 0.06122208386659622, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000282630993751809, + "grad_norm": 7.077868938446045, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8593544960021973, + "num_tokens": 392097727.0, + "step": 10274 + }, + { + "epoch": 1.3070856125174914, + "ewc_loss": 0.061044372618198395, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028085385565645993, + "grad_norm": 6.991863250732422, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8675530552864075, + "num_tokens": 392144641.0, + "step": 10275 + }, + { + "epoch": 1.307212822796082, + "ewc_loss": 0.06123882532119751, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002827984280884266, + "grad_norm": 7.190011501312256, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.870095431804657, + "num_tokens": 392178810.0, + "step": 10276 + }, + { + "epoch": 1.3073400330746725, + "ewc_loss": 0.061130739748477936, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817175700329244, + "grad_norm": 7.083367347717285, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8566586971282959, + "num_tokens": 392213571.0, + "step": 10277 + }, + { + "epoch": 1.3074672433532628, + "ewc_loss": 0.06115530803799629, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002819632354658097, + "grad_norm": 7.089667320251465, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8586280941963196, + "num_tokens": 392254628.0, + "step": 10278 + }, + { + "epoch": 1.3075944536318533, + "ewc_loss": 0.061135433614254, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817645145114511, + "grad_norm": 7.08692741394043, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8609237670898438, + "num_tokens": 392297243.0, + "step": 10279 + }, + { + "epoch": 1.3077216639104439, + "ewc_loss": 0.06112436205148697, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028165377443656325, + "grad_norm": 7.158219814300537, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8642535209655762, + "num_tokens": 392329116.0, + "step": 10280 + }, + { + "epoch": 1.3078488741890344, + "ewc_loss": 0.061102867126464844, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002814388135448098, + "grad_norm": 7.055780410766602, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8583532571792603, + "num_tokens": 392370542.0, + "step": 10281 + }, + { + "epoch": 1.307976084467625, + "ewc_loss": 0.061157748103141785, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028198762447573245, + "grad_norm": 7.118035793304443, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8666346073150635, + "num_tokens": 392411155.0, + "step": 10282 + }, + { + "epoch": 1.3081032947462155, + "ewc_loss": 0.06107611209154129, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002811712911352515, + "grad_norm": 7.071124076843262, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.862474799156189, + "num_tokens": 392450440.0, + "step": 10283 + }, + { + "epoch": 1.308230505024806, + "ewc_loss": 0.06115619093179703, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002819720539264381, + "grad_norm": 7.07232666015625, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.872631311416626, + "num_tokens": 392488545.0, + "step": 10284 + }, + { + "epoch": 1.3083577153033965, + "ewc_loss": 0.061133429408073425, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000281744432868436, + "grad_norm": 7.087335586547852, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8603833913803101, + "num_tokens": 392527138.0, + "step": 10285 + }, + { + "epoch": 1.308484925581987, + "ewc_loss": 0.061142198741436005, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002818321227096021, + "grad_norm": 7.066822052001953, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8617163896560669, + "num_tokens": 392568181.0, + "step": 10286 + }, + { + "epoch": 1.3086121358605776, + "ewc_loss": 0.06122785061597824, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002826886484399438, + "grad_norm": 7.127039432525635, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8754760026931763, + "num_tokens": 392610900.0, + "step": 10287 + }, + { + "epoch": 1.308739346139168, + "ewc_loss": 0.061148885637521744, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002818990033119917, + "grad_norm": 7.065290451049805, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8534227013587952, + "num_tokens": 392646624.0, + "step": 10288 + }, + { + "epoch": 1.3088665564177586, + "ewc_loss": 0.06129837408661842, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028339389245957136, + "grad_norm": 7.095332622528076, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8617556095123291, + "num_tokens": 392683103.0, + "step": 10289 + }, + { + "epoch": 1.3089937666963491, + "ewc_loss": 0.061292294412851334, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028333309455774724, + "grad_norm": 7.105605125427246, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8625872731208801, + "num_tokens": 392723169.0, + "step": 10290 + }, + { + "epoch": 1.3091209769749397, + "ewc_loss": 0.06119462847709656, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028235645731911063, + "grad_norm": 7.026421070098877, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8774392008781433, + "num_tokens": 392762323.0, + "step": 10291 + }, + { + "epoch": 1.3092481872535302, + "ewc_loss": 0.06136985495686531, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002841087116394192, + "grad_norm": 7.124779224395752, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8798588514328003, + "num_tokens": 392797555.0, + "step": 10292 + }, + { + "epoch": 1.3093753975321207, + "ewc_loss": 0.061212435364723206, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028253451455384493, + "grad_norm": 7.048105239868164, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8727811574935913, + "num_tokens": 392840001.0, + "step": 10293 + }, + { + "epoch": 1.309502607810711, + "ewc_loss": 0.0613919273018837, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028432943508960307, + "grad_norm": 7.091684341430664, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8610849976539612, + "num_tokens": 392880871.0, + "step": 10294 + }, + { + "epoch": 1.3096298180893016, + "ewc_loss": 0.06127243489027023, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028313451912254095, + "grad_norm": 7.148187160491943, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8735204339027405, + "num_tokens": 392917554.0, + "step": 10295 + }, + { + "epoch": 1.309757028367892, + "ewc_loss": 0.06123705208301544, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000282780674751848, + "grad_norm": 7.066738605499268, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8810485005378723, + "num_tokens": 392958601.0, + "step": 10296 + }, + { + "epoch": 1.3098842386464826, + "ewc_loss": 0.06136777997016907, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028408796060830355, + "grad_norm": 7.130680084228516, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8638032078742981, + "num_tokens": 393000108.0, + "step": 10297 + }, + { + "epoch": 1.3100114489250732, + "ewc_loss": 0.0612826943397522, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002832370810210705, + "grad_norm": 7.150935649871826, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8830875158309937, + "num_tokens": 393032321.0, + "step": 10298 + }, + { + "epoch": 1.3101386592036637, + "ewc_loss": 0.061311986297369, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002835300110746175, + "grad_norm": 7.13753080368042, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8616930842399597, + "num_tokens": 393068347.0, + "step": 10299 + }, + { + "epoch": 1.3102658694822542, + "ewc_loss": 0.06129112467169762, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028332139481790364, + "grad_norm": 7.127173900604248, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8793516159057617, + "num_tokens": 393102509.0, + "step": 10300 + }, + { + "epoch": 1.3103930797608447, + "ewc_loss": 0.06123828887939453, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002827930438797921, + "grad_norm": 7.16933012008667, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8719230890274048, + "num_tokens": 393140974.0, + "step": 10301 + }, + { + "epoch": 1.3105202900394353, + "ewc_loss": 0.06114650145173073, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002818751672748476, + "grad_norm": 7.289252281188965, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8722505569458008, + "num_tokens": 393177214.0, + "step": 10302 + }, + { + "epoch": 1.3106475003180256, + "ewc_loss": 0.06105678901076317, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002809780417010188, + "grad_norm": 7.044689655303955, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8659904599189758, + "num_tokens": 393215801.0, + "step": 10303 + }, + { + "epoch": 1.310774710596616, + "ewc_loss": 0.06125246733427048, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028293480863794684, + "grad_norm": 7.104597091674805, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8733521103858948, + "num_tokens": 393256444.0, + "step": 10304 + }, + { + "epoch": 1.3109019208752066, + "ewc_loss": 0.061141565442085266, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000281825807178393, + "grad_norm": 7.073764801025391, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8597033023834229, + "num_tokens": 393300238.0, + "step": 10305 + }, + { + "epoch": 1.3110291311537972, + "ewc_loss": 0.06126122176647186, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002830223529599607, + "grad_norm": 7.09034538269043, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8486509919166565, + "num_tokens": 393344087.0, + "step": 10306 + }, + { + "epoch": 1.3111563414323877, + "ewc_loss": 0.06123296916484833, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028273987118154764, + "grad_norm": 7.1163835525512695, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8681623935699463, + "num_tokens": 393381678.0, + "step": 10307 + }, + { + "epoch": 1.3112835517109782, + "ewc_loss": 0.06127370521426201, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002831472083926201, + "grad_norm": 7.138415813446045, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8621604442596436, + "num_tokens": 393415740.0, + "step": 10308 + }, + { + "epoch": 1.3114107619895687, + "ewc_loss": 0.06150361895561218, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002830049197655171, + "grad_norm": 7.082298755645752, + "learning_rate": 1e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8380237817764282, + "num_tokens": 393457692.0, + "step": 10309 + }, + { + "epoch": 1.3115379722681593, + "ewc_loss": 0.06125868111848831, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028299697441980243, + "grad_norm": 7.097934246063232, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.858575701713562, + "num_tokens": 393499003.0, + "step": 10310 + }, + { + "epoch": 1.3116651825467498, + "ewc_loss": 0.06123562902212143, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002827664720825851, + "grad_norm": 7.1612162590026855, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8518376350402832, + "num_tokens": 393533842.0, + "step": 10311 + }, + { + "epoch": 1.3117923928253403, + "ewc_loss": 0.06121479719877243, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002825581468641758, + "grad_norm": 7.044699192047119, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.870520830154419, + "num_tokens": 393573906.0, + "step": 10312 + }, + { + "epoch": 1.3119196031039309, + "ewc_loss": 0.06124001741409302, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002828103315550834, + "grad_norm": 7.127374649047852, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8606021404266357, + "num_tokens": 393615298.0, + "step": 10313 + }, + { + "epoch": 1.3120468133825214, + "ewc_loss": 0.061455875635147095, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028252750053070486, + "grad_norm": 7.145136833190918, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8527557849884033, + "num_tokens": 393654690.0, + "step": 10314 + }, + { + "epoch": 1.312174023661112, + "ewc_loss": 0.06144386902451515, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028240744723007083, + "grad_norm": 7.070374488830566, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8634178638458252, + "num_tokens": 393696222.0, + "step": 10315 + }, + { + "epoch": 1.3123012339397024, + "ewc_loss": 0.061202406883239746, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000282434222754091, + "grad_norm": 7.113209247589111, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8593019247055054, + "num_tokens": 393736748.0, + "step": 10316 + }, + { + "epoch": 1.312428444218293, + "ewc_loss": 0.06112358719110489, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002816460037138313, + "grad_norm": 7.103693008422852, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.854902982711792, + "num_tokens": 393773077.0, + "step": 10317 + }, + { + "epoch": 1.3125556544968833, + "ewc_loss": 0.061193257570266724, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028234272031113505, + "grad_norm": 7.141447067260742, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8737869262695312, + "num_tokens": 393806466.0, + "step": 10318 + }, + { + "epoch": 1.3126828647754738, + "ewc_loss": 0.06112435460090637, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002816536871250719, + "grad_norm": 7.174508571624756, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8589401245117188, + "num_tokens": 393840922.0, + "step": 10319 + }, + { + "epoch": 1.3128100750540643, + "ewc_loss": 0.06110600009560585, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002814701583702117, + "grad_norm": 7.125986099243164, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8674246668815613, + "num_tokens": 393874544.0, + "step": 10320 + }, + { + "epoch": 1.3129372853326549, + "ewc_loss": 0.06110434979200363, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028145365649834275, + "grad_norm": 7.04344367980957, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.859082043170929, + "num_tokens": 393917773.0, + "step": 10321 + }, + { + "epoch": 1.3130644956112454, + "ewc_loss": 0.06115596741437912, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002819698420353234, + "grad_norm": 7.149253845214844, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8568196296691895, + "num_tokens": 393951525.0, + "step": 10322 + }, + { + "epoch": 1.313191705889836, + "ewc_loss": 0.061108555644750595, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028149571153335273, + "grad_norm": 7.0391764640808105, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8665257692337036, + "num_tokens": 393989796.0, + "step": 10323 + }, + { + "epoch": 1.3133189161684264, + "ewc_loss": 0.06124068796634674, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002828170545399189, + "grad_norm": 7.202429294586182, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8591809868812561, + "num_tokens": 394028746.0, + "step": 10324 + }, + { + "epoch": 1.313446126447017, + "ewc_loss": 0.0609867200255394, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028027736698277295, + "grad_norm": 6.9973835945129395, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8699117302894592, + "num_tokens": 394065241.0, + "step": 10325 + }, + { + "epoch": 1.3135733367256075, + "ewc_loss": 0.0613650307059288, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002840604865923524, + "grad_norm": 7.135337829589844, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8570170402526855, + "num_tokens": 394110113.0, + "step": 10326 + }, + { + "epoch": 1.3137005470041978, + "ewc_loss": 0.06107950210571289, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028120516799390316, + "grad_norm": 7.053788185119629, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8447486162185669, + "num_tokens": 394146577.0, + "step": 10327 + }, + { + "epoch": 1.3138277572827883, + "ewc_loss": 0.06128420680761337, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028325224411673844, + "grad_norm": 7.103965759277344, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8623533248901367, + "num_tokens": 394185392.0, + "step": 10328 + }, + { + "epoch": 1.3139549675613789, + "ewc_loss": 0.061166174709796906, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028207190916873515, + "grad_norm": 7.023418426513672, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8535714149475098, + "num_tokens": 394226301.0, + "step": 10329 + }, + { + "epoch": 1.3140821778399694, + "ewc_loss": 0.061374731361866, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002841574605554342, + "grad_norm": 7.124804496765137, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8540160655975342, + "num_tokens": 394262654.0, + "step": 10330 + }, + { + "epoch": 1.31420938811856, + "ewc_loss": 0.061200208961963654, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028241222025826573, + "grad_norm": 7.064239978790283, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8740208148956299, + "num_tokens": 394299185.0, + "step": 10331 + }, + { + "epoch": 1.3143365983971504, + "ewc_loss": 0.061578281223773956, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002837515785358846, + "grad_norm": 7.0923848152160645, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8639671802520752, + "num_tokens": 394331364.0, + "step": 10332 + }, + { + "epoch": 1.314463808675741, + "ewc_loss": 0.06153709813952446, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002833397302310914, + "grad_norm": 7.0803937911987305, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.849521279335022, + "num_tokens": 394375479.0, + "step": 10333 + }, + { + "epoch": 1.3145910189543315, + "ewc_loss": 0.06165314465761185, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002845002163667232, + "grad_norm": 7.184528350830078, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8681376576423645, + "num_tokens": 394414913.0, + "step": 10334 + }, + { + "epoch": 1.314718229232922, + "ewc_loss": 0.061501726508140564, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002829860313795507, + "grad_norm": 7.068235874176025, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.862897515296936, + "num_tokens": 394450889.0, + "step": 10335 + }, + { + "epoch": 1.3148454395115126, + "ewc_loss": 0.06192854419350624, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002848127915058285, + "grad_norm": 7.092337131500244, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8463796973228455, + "num_tokens": 394495280.0, + "step": 10336 + }, + { + "epoch": 1.314972649790103, + "ewc_loss": 0.06152919679880142, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002832607424352318, + "grad_norm": 7.0468621253967285, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8711729049682617, + "num_tokens": 394536811.0, + "step": 10337 + }, + { + "epoch": 1.3150998600686936, + "ewc_loss": 0.06152893602848053, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002832581230904907, + "grad_norm": 7.069644927978516, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8596650958061218, + "num_tokens": 394573353.0, + "step": 10338 + }, + { + "epoch": 1.3152270703472841, + "ewc_loss": 0.06132892519235611, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002836993953678757, + "grad_norm": 7.162627696990967, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8551949262619019, + "num_tokens": 394603517.0, + "step": 10339 + }, + { + "epoch": 1.3153542806258747, + "ewc_loss": 0.061143480241298676, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028184495749883354, + "grad_norm": 6.939296245574951, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8689354658126831, + "num_tokens": 394650547.0, + "step": 10340 + }, + { + "epoch": 1.3154814909044652, + "ewc_loss": 0.06157936155796051, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028620377997867763, + "grad_norm": 7.185763835906982, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8670050501823425, + "num_tokens": 394684460.0, + "step": 10341 + }, + { + "epoch": 1.3156087011830557, + "ewc_loss": 0.06116113066673279, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028202144312672317, + "grad_norm": 7.098196983337402, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8781948685646057, + "num_tokens": 394733520.0, + "step": 10342 + }, + { + "epoch": 1.315735911461646, + "ewc_loss": 0.06136603653430939, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028407052741385996, + "grad_norm": 7.041890621185303, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.856336236000061, + "num_tokens": 394775498.0, + "step": 10343 + }, + { + "epoch": 1.3158631217402366, + "ewc_loss": 0.061315521597862244, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028356537222862244, + "grad_norm": 7.066420078277588, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8494269251823425, + "num_tokens": 394815138.0, + "step": 10344 + }, + { + "epoch": 1.315990332018827, + "ewc_loss": 0.06136506423354149, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002840608067344874, + "grad_norm": 7.1974921226501465, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.854921817779541, + "num_tokens": 394850447.0, + "step": 10345 + }, + { + "epoch": 1.3161175422974176, + "ewc_loss": 0.06124667823314667, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002828769211191684, + "grad_norm": 7.057233810424805, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8563467264175415, + "num_tokens": 394891800.0, + "step": 10346 + }, + { + "epoch": 1.3162447525760081, + "ewc_loss": 0.06144223362207413, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028483246569521725, + "grad_norm": 7.178643226623535, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8678008913993835, + "num_tokens": 394930087.0, + "step": 10347 + }, + { + "epoch": 1.3163719628545987, + "ewc_loss": 0.06113201379776001, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000281730288406834, + "grad_norm": 7.047916889190674, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8724045753479004, + "num_tokens": 394964497.0, + "step": 10348 + }, + { + "epoch": 1.3164991731331892, + "ewc_loss": 0.061427436769008636, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002846845018211752, + "grad_norm": 7.1816582679748535, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.844369649887085, + "num_tokens": 395003127.0, + "step": 10349 + }, + { + "epoch": 1.3166263834117797, + "ewc_loss": 0.06110972538590431, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028150741127319634, + "grad_norm": 7.097907066345215, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8531244397163391, + "num_tokens": 395040570.0, + "step": 10350 + }, + { + "epoch": 1.3167535936903703, + "ewc_loss": 0.06133127957582474, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028372296947054565, + "grad_norm": 7.129033088684082, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8640216588973999, + "num_tokens": 395075205.0, + "step": 10351 + }, + { + "epoch": 1.3168808039689606, + "ewc_loss": 0.0612068772315979, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028247892623767257, + "grad_norm": 7.083811283111572, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8612135648727417, + "num_tokens": 395117492.0, + "step": 10352 + }, + { + "epoch": 1.317008014247551, + "ewc_loss": 0.061234861612319946, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002827587886713445, + "grad_norm": 7.057704925537109, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8600424528121948, + "num_tokens": 395162449.0, + "step": 10353 + }, + { + "epoch": 1.3171352245261416, + "ewc_loss": 0.06122475862503052, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028265774017199874, + "grad_norm": 7.096628665924072, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8710315227508545, + "num_tokens": 395197310.0, + "step": 10354 + }, + { + "epoch": 1.3172624348047322, + "ewc_loss": 0.06132097914814949, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002836199419107288, + "grad_norm": 7.120417594909668, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8732831478118896, + "num_tokens": 395232420.0, + "step": 10355 + }, + { + "epoch": 1.3173896450833227, + "ewc_loss": 0.061267703771591187, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028308716719038785, + "grad_norm": 7.109573841094971, + "learning_rate": 1e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8413619995117188, + "num_tokens": 395269392.0, + "step": 10356 + }, + { + "epoch": 1.3175168553619132, + "ewc_loss": 0.06126239150762558, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028303408180363476, + "grad_norm": 7.114233016967773, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8499314188957214, + "num_tokens": 395305496.0, + "step": 10357 + }, + { + "epoch": 1.3176440656405037, + "ewc_loss": 0.06118950992822647, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028230526368133724, + "grad_norm": 7.034980297088623, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8717104196548462, + "num_tokens": 395347089.0, + "step": 10358 + }, + { + "epoch": 1.3177712759190943, + "ewc_loss": 0.06138649955391884, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028427515644580126, + "grad_norm": 7.100482940673828, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8730072975158691, + "num_tokens": 395382885.0, + "step": 10359 + }, + { + "epoch": 1.3178984861976848, + "ewc_loss": 0.06124245375394821, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028283472056500614, + "grad_norm": 7.106476783752441, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.876255214214325, + "num_tokens": 395416260.0, + "step": 10360 + }, + { + "epoch": 1.3180256964762753, + "ewc_loss": 0.06121625006198883, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028257264057174325, + "grad_norm": 7.072586536407471, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8540980815887451, + "num_tokens": 395454179.0, + "step": 10361 + }, + { + "epoch": 1.3181529067548658, + "ewc_loss": 0.06136426329612732, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002840528031811118, + "grad_norm": 7.111304759979248, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8525762557983398, + "num_tokens": 395489173.0, + "step": 10362 + }, + { + "epoch": 1.3182801170334564, + "ewc_loss": 0.06127067655324936, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002831169113051146, + "grad_norm": 7.085671424865723, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8556028604507446, + "num_tokens": 395532057.0, + "step": 10363 + }, + { + "epoch": 1.318407327312047, + "ewc_loss": 0.061425842344760895, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002846685820259154, + "grad_norm": 7.117480754852295, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.869504451751709, + "num_tokens": 395570321.0, + "step": 10364 + }, + { + "epoch": 1.3185345375906374, + "ewc_loss": 0.06147361546754837, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002827049174811691, + "grad_norm": 7.054253101348877, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8824372291564941, + "num_tokens": 395609041.0, + "step": 10365 + }, + { + "epoch": 1.318661747869228, + "ewc_loss": 0.06157606467604637, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002837294014170766, + "grad_norm": 7.125798225402832, + "learning_rate": 1e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.839189887046814, + "num_tokens": 395649961.0, + "step": 10366 + }, + { + "epoch": 1.3187889581478183, + "ewc_loss": 0.06126663088798523, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002830764860846102, + "grad_norm": 7.006105422973633, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8633894920349121, + "num_tokens": 395691882.0, + "step": 10367 + }, + { + "epoch": 1.3189161684264088, + "ewc_loss": 0.061420686542987823, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002846170391421765, + "grad_norm": 7.1142168045043945, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8490452766418457, + "num_tokens": 395731496.0, + "step": 10368 + }, + { + "epoch": 1.3190433787049993, + "ewc_loss": 0.06131422892212868, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028355245012789965, + "grad_norm": 7.050328254699707, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8663229942321777, + "num_tokens": 395772618.0, + "step": 10369 + }, + { + "epoch": 1.3191705889835899, + "ewc_loss": 0.06146465986967087, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028505673981271684, + "grad_norm": 7.140347003936768, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8449311256408691, + "num_tokens": 395811907.0, + "step": 10370 + }, + { + "epoch": 1.3192977992621804, + "ewc_loss": 0.06129520758986473, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028336222749203444, + "grad_norm": 7.0782246589660645, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8557904362678528, + "num_tokens": 395851186.0, + "step": 10371 + }, + { + "epoch": 1.319425009540771, + "ewc_loss": 0.061508700251579285, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028549713897518814, + "grad_norm": 7.114733695983887, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8616002202033997, + "num_tokens": 395893868.0, + "step": 10372 + }, + { + "epoch": 1.3195522198193614, + "ewc_loss": 0.06140970066189766, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028450717218220234, + "grad_norm": 7.178703784942627, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8724184036254883, + "num_tokens": 395930519.0, + "step": 10373 + }, + { + "epoch": 1.319679430097952, + "ewc_loss": 0.06185126304626465, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00028403999749571085, + "grad_norm": 7.117521286010742, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8775316476821899, + "num_tokens": 395965692.0, + "step": 10374 + }, + { + "epoch": 1.3198066403765425, + "ewc_loss": 0.061353228986263275, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002839424414560199, + "grad_norm": 7.124668598175049, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8632106184959412, + "num_tokens": 396002316.0, + "step": 10375 + }, + { + "epoch": 1.3199338506551328, + "ewc_loss": 0.061302732676267624, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028343748999759555, + "grad_norm": 7.099189758300781, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8681924343109131, + "num_tokens": 396043392.0, + "step": 10376 + }, + { + "epoch": 1.3200610609337233, + "ewc_loss": 0.06134886294603348, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002838987857103348, + "grad_norm": 7.140711307525635, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8600220680236816, + "num_tokens": 396078603.0, + "step": 10377 + }, + { + "epoch": 1.3201882712123139, + "ewc_loss": 0.06124619022011757, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002828720607794821, + "grad_norm": 7.086597442626953, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8529355525970459, + "num_tokens": 396121132.0, + "step": 10378 + }, + { + "epoch": 1.3203154814909044, + "ewc_loss": 0.0614236444234848, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028464660863392055, + "grad_norm": 7.180474281311035, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.871103823184967, + "num_tokens": 396155730.0, + "step": 10379 + }, + { + "epoch": 1.320442691769495, + "ewc_loss": 0.061312705278396606, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002835372288245708, + "grad_norm": 7.12143087387085, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8476929068565369, + "num_tokens": 396190976.0, + "step": 10380 + }, + { + "epoch": 1.3205699020480854, + "ewc_loss": 0.061337828636169434, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002837884530890733, + "grad_norm": 7.117780685424805, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8622824549674988, + "num_tokens": 396229571.0, + "step": 10381 + }, + { + "epoch": 1.320697112326676, + "ewc_loss": 0.061359383165836334, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028400399605743587, + "grad_norm": 7.146689414978027, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8622878789901733, + "num_tokens": 396265586.0, + "step": 10382 + }, + { + "epoch": 1.3208243226052665, + "ewc_loss": 0.06124288588762283, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028283902793191373, + "grad_norm": 7.157103061676025, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.867840051651001, + "num_tokens": 396303810.0, + "step": 10383 + }, + { + "epoch": 1.320951532883857, + "ewc_loss": 0.06133931130170822, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028380329604260623, + "grad_norm": 7.146819114685059, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8811531066894531, + "num_tokens": 396345671.0, + "step": 10384 + }, + { + "epoch": 1.3210787431624476, + "ewc_loss": 0.0611640140414238, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028205031412653625, + "grad_norm": 7.065569877624512, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8783963918685913, + "num_tokens": 396388062.0, + "step": 10385 + }, + { + "epoch": 1.321205953441038, + "ewc_loss": 0.06128178536891937, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000283228000625968, + "grad_norm": 7.205509662628174, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.875569760799408, + "num_tokens": 396425189.0, + "step": 10386 + }, + { + "epoch": 1.3213331637196286, + "ewc_loss": 0.06109727919101715, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028138296329416335, + "grad_norm": 7.070329666137695, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8517436981201172, + "num_tokens": 396461861.0, + "step": 10387 + }, + { + "epoch": 1.3214603739982191, + "ewc_loss": 0.0613899864256382, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002843100228346884, + "grad_norm": 7.205548286437988, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.872953474521637, + "num_tokens": 396506190.0, + "step": 10388 + }, + { + "epoch": 1.3215875842768097, + "ewc_loss": 0.06112638860940933, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002816740598063916, + "grad_norm": 7.12953519821167, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.864409327507019, + "num_tokens": 396538900.0, + "step": 10389 + }, + { + "epoch": 1.3217147945554002, + "ewc_loss": 0.061275795102119446, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000283168104942888, + "grad_norm": 7.111554145812988, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8706400990486145, + "num_tokens": 396577029.0, + "step": 10390 + }, + { + "epoch": 1.3218420048339907, + "ewc_loss": 0.06132739037275314, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000283684057649225, + "grad_norm": 7.136519432067871, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8480243682861328, + "num_tokens": 396616917.0, + "step": 10391 + }, + { + "epoch": 1.321969215112581, + "ewc_loss": 0.06122748926281929, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028268503956496716, + "grad_norm": 7.12140417098999, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8708550930023193, + "num_tokens": 396651726.0, + "step": 10392 + }, + { + "epoch": 1.3220964253911716, + "ewc_loss": 0.06126433610916138, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002830534940585494, + "grad_norm": 7.163585662841797, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8507308959960938, + "num_tokens": 396693796.0, + "step": 10393 + }, + { + "epoch": 1.322223635669762, + "ewc_loss": 0.061243634670972824, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002828465076163411, + "grad_norm": 7.129191875457764, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8513641357421875, + "num_tokens": 396731947.0, + "step": 10394 + }, + { + "epoch": 1.3223508459483526, + "ewc_loss": 0.061300769448280334, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028341784491203725, + "grad_norm": 7.099974632263184, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8660738468170166, + "num_tokens": 396767062.0, + "step": 10395 + }, + { + "epoch": 1.3224780562269431, + "ewc_loss": 0.06129435449838638, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002833537000697106, + "grad_norm": 7.165022373199463, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8662810921669006, + "num_tokens": 396803918.0, + "step": 10396 + }, + { + "epoch": 1.3226052665055337, + "ewc_loss": 0.06133146584033966, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002837248321156949, + "grad_norm": 7.1065592765808105, + "learning_rate": 1e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8362988233566284, + "num_tokens": 396850314.0, + "step": 10397 + }, + { + "epoch": 1.3227324767841242, + "ewc_loss": 0.06135173887014389, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028392754029482603, + "grad_norm": 7.2061920166015625, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8551350831985474, + "num_tokens": 396887920.0, + "step": 10398 + }, + { + "epoch": 1.3228596870627147, + "ewc_loss": 0.06121228635311127, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002825330011546612, + "grad_norm": 7.15449857711792, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8527839183807373, + "num_tokens": 396923369.0, + "step": 10399 + }, + { + "epoch": 1.3229868973413053, + "ewc_loss": 0.06137378141283989, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028414797270670533, + "grad_norm": 7.139761447906494, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8598430156707764, + "num_tokens": 396962338.0, + "step": 10400 + }, + { + "epoch": 1.3231141076198956, + "ewc_loss": 0.06128169968724251, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028322715661488473, + "grad_norm": 7.08268928527832, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8732352256774902, + "num_tokens": 396998275.0, + "step": 10401 + }, + { + "epoch": 1.323241317898486, + "ewc_loss": 0.0614200197160244, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028461034526117146, + "grad_norm": 7.196513652801514, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8593441247940063, + "num_tokens": 397040034.0, + "step": 10402 + }, + { + "epoch": 1.3233685281770766, + "ewc_loss": 0.061259470880031586, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002830048615578562, + "grad_norm": 7.10081672668457, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8824549913406372, + "num_tokens": 397080549.0, + "step": 10403 + }, + { + "epoch": 1.3234957384556671, + "ewc_loss": 0.06170399487018585, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002850086893886328, + "grad_norm": 7.217899322509766, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8755810856819153, + "num_tokens": 397120803.0, + "step": 10404 + }, + { + "epoch": 1.3236229487342577, + "ewc_loss": 0.061204563826322556, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028245578869245946, + "grad_norm": 7.045815944671631, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8668861389160156, + "num_tokens": 397160463.0, + "step": 10405 + }, + { + "epoch": 1.3237501590128482, + "ewc_loss": 0.06147192791104317, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028512944118119776, + "grad_norm": 7.235225200653076, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8562919497489929, + "num_tokens": 397205425.0, + "step": 10406 + }, + { + "epoch": 1.3238773692914387, + "ewc_loss": 0.06113618612289429, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002817719941958785, + "grad_norm": 7.120449066162109, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8569111227989197, + "num_tokens": 397238907.0, + "step": 10407 + }, + { + "epoch": 1.3240045795700293, + "ewc_loss": 0.06145303696393967, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028494049911387265, + "grad_norm": 7.170141220092773, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.863113522529602, + "num_tokens": 397281639.0, + "step": 10408 + }, + { + "epoch": 1.3241317898486198, + "ewc_loss": 0.06121194735169411, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002825296251103282, + "grad_norm": 7.111811637878418, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8514208793640137, + "num_tokens": 397323350.0, + "step": 10409 + }, + { + "epoch": 1.3242590001272103, + "ewc_loss": 0.06134558469057083, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002838660147972405, + "grad_norm": 7.155301570892334, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8564682006835938, + "num_tokens": 397364438.0, + "step": 10410 + }, + { + "epoch": 1.3243862104058008, + "ewc_loss": 0.061320655047893524, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002836167113855481, + "grad_norm": 7.227227210998535, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8491725921630859, + "num_tokens": 397406881.0, + "step": 10411 + }, + { + "epoch": 1.3245134206843914, + "ewc_loss": 0.0612032487988472, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002824426628649235, + "grad_norm": 7.109021186828613, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8642476201057434, + "num_tokens": 397451190.0, + "step": 10412 + }, + { + "epoch": 1.324640630962982, + "ewc_loss": 0.061467453837394714, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002850846794899553, + "grad_norm": 7.195485591888428, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8626044988632202, + "num_tokens": 397491063.0, + "step": 10413 + }, + { + "epoch": 1.3247678412415724, + "ewc_loss": 0.061160434037446976, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000282014487311244, + "grad_norm": 7.159495830535889, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8632585406303406, + "num_tokens": 397529961.0, + "step": 10414 + }, + { + "epoch": 1.324895051520163, + "ewc_loss": 0.06137113645672798, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028412151732482016, + "grad_norm": 7.1942973136901855, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8736292123794556, + "num_tokens": 397565616.0, + "step": 10415 + }, + { + "epoch": 1.3250222617987533, + "ewc_loss": 0.06123553588986397, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028276551165618, + "grad_norm": 7.161871433258057, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8691239356994629, + "num_tokens": 397602223.0, + "step": 10416 + }, + { + "epoch": 1.3251494720773438, + "ewc_loss": 0.06130138039588928, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002834239858202636, + "grad_norm": 7.211639404296875, + "learning_rate": 1e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8409632444381714, + "num_tokens": 397638910.0, + "step": 10417 + }, + { + "epoch": 1.3252766823559343, + "ewc_loss": 0.06127037853002548, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028311394271440804, + "grad_norm": 7.1032538414001465, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8554567098617554, + "num_tokens": 397681200.0, + "step": 10418 + }, + { + "epoch": 1.3254038926345248, + "ewc_loss": 0.06147010624408722, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002851111930795014, + "grad_norm": 7.155839443206787, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8639196157455444, + "num_tokens": 397724832.0, + "step": 10419 + }, + { + "epoch": 1.3255311029131154, + "ewc_loss": 0.06130693107843399, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028347945772111416, + "grad_norm": 7.1460652351379395, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8549100160598755, + "num_tokens": 397765537.0, + "step": 10420 + }, + { + "epoch": 1.325658313191706, + "ewc_loss": 0.06153196468949318, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028572979499585927, + "grad_norm": 7.237689971923828, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8555736541748047, + "num_tokens": 397800945.0, + "step": 10421 + }, + { + "epoch": 1.3257855234702964, + "ewc_loss": 0.06137840077280998, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028419416048564017, + "grad_norm": 7.174134731292725, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8671531081199646, + "num_tokens": 397840786.0, + "step": 10422 + }, + { + "epoch": 1.325912733748887, + "ewc_loss": 0.06140701472759247, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002844803093466908, + "grad_norm": 7.23519229888916, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8672448992729187, + "num_tokens": 397875607.0, + "step": 10423 + }, + { + "epoch": 1.3260399440274775, + "ewc_loss": 0.06130512058734894, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028346135513857007, + "grad_norm": 7.178255081176758, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8781256675720215, + "num_tokens": 397909524.0, + "step": 10424 + }, + { + "epoch": 1.3261671543060678, + "ewc_loss": 0.06137606501579285, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028417081921361387, + "grad_norm": 7.176488876342773, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8467075228691101, + "num_tokens": 397947862.0, + "step": 10425 + }, + { + "epoch": 1.3262943645846583, + "ewc_loss": 0.061279743909835815, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028320756973698735, + "grad_norm": 7.251919269561768, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8759033679962158, + "num_tokens": 397980958.0, + "step": 10426 + }, + { + "epoch": 1.3264215748632489, + "ewc_loss": 0.06120063364505768, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002824164985213429, + "grad_norm": 7.128992557525635, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8600435853004456, + "num_tokens": 398021231.0, + "step": 10427 + }, + { + "epoch": 1.3265487851418394, + "ewc_loss": 0.061505891382694244, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002854690537787974, + "grad_norm": 7.190065860748291, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8597744107246399, + "num_tokens": 398059202.0, + "step": 10428 + }, + { + "epoch": 1.32667599542043, + "ewc_loss": 0.06128327548503876, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002832429017871618, + "grad_norm": 7.126861095428467, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8835780620574951, + "num_tokens": 398097121.0, + "step": 10429 + }, + { + "epoch": 1.3268032056990204, + "ewc_loss": 0.061435725539922714, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028476741863414645, + "grad_norm": 7.216746807098389, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.862265408039093, + "num_tokens": 398130413.0, + "step": 10430 + }, + { + "epoch": 1.326930415977611, + "ewc_loss": 0.061301760375499725, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028342774021439254, + "grad_norm": 7.141994476318359, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8606427907943726, + "num_tokens": 398172535.0, + "step": 10431 + }, + { + "epoch": 1.3270576262562015, + "ewc_loss": 0.0614250972867012, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028466113144531846, + "grad_norm": 7.232931137084961, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8630657196044922, + "num_tokens": 398206681.0, + "step": 10432 + }, + { + "epoch": 1.327184836534792, + "ewc_loss": 0.061242081224918365, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002828309952747077, + "grad_norm": 7.232918739318848, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8656233549118042, + "num_tokens": 398245206.0, + "step": 10433 + }, + { + "epoch": 1.3273120468133826, + "ewc_loss": 0.061337340623140335, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028378356364555657, + "grad_norm": 7.188419342041016, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8579772710800171, + "num_tokens": 398278902.0, + "step": 10434 + }, + { + "epoch": 1.327439257091973, + "ewc_loss": 0.06133661046624184, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028377625858411193, + "grad_norm": 7.121549129486084, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8623340129852295, + "num_tokens": 398315833.0, + "step": 10435 + }, + { + "epoch": 1.3275664673705636, + "ewc_loss": 0.061348408460617065, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028389421640895307, + "grad_norm": 7.175613880157471, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8452054262161255, + "num_tokens": 398356496.0, + "step": 10436 + }, + { + "epoch": 1.3276936776491541, + "ewc_loss": 0.061349477618932724, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028390492661856115, + "grad_norm": 7.134081840515137, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.869114875793457, + "num_tokens": 398393661.0, + "step": 10437 + }, + { + "epoch": 1.3278208879277447, + "ewc_loss": 0.06145521253347397, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028496229788288474, + "grad_norm": 7.183847904205322, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8792212009429932, + "num_tokens": 398430463.0, + "step": 10438 + }, + { + "epoch": 1.3279480982063352, + "ewc_loss": 0.06127065792679787, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002831167366821319, + "grad_norm": 7.160045623779297, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8618420362472534, + "num_tokens": 398470382.0, + "step": 10439 + }, + { + "epoch": 1.3280753084849257, + "ewc_loss": 0.06141647696495056, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002845749258995056, + "grad_norm": 7.227306842803955, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8447824716567993, + "num_tokens": 398504793.0, + "step": 10440 + }, + { + "epoch": 1.328202518763516, + "ewc_loss": 0.06140133738517761, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002819821238517761, + "grad_norm": 7.1081862449646, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8543624877929688, + "num_tokens": 398541038.0, + "step": 10441 + }, + { + "epoch": 1.3283297290421066, + "ewc_loss": 0.06173407658934593, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002853095065802336, + "grad_norm": 7.188680648803711, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8457728624343872, + "num_tokens": 398578047.0, + "step": 10442 + }, + { + "epoch": 1.328456939320697, + "ewc_loss": 0.061267200857400894, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002830821613315493, + "grad_norm": 7.116896152496338, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8501955270767212, + "num_tokens": 398618946.0, + "step": 10443 + }, + { + "epoch": 1.3285841495992876, + "ewc_loss": 0.06171003356575966, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002850690798368305, + "grad_norm": 7.1689453125, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8659868240356445, + "num_tokens": 398656895.0, + "step": 10444 + }, + { + "epoch": 1.3287113598778781, + "ewc_loss": 0.06156904250383377, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002836591738741845, + "grad_norm": 7.1154279708862305, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8632847666740417, + "num_tokens": 398696521.0, + "step": 10445 + }, + { + "epoch": 1.3288385701564687, + "ewc_loss": 0.06171989068388939, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028516765451058745, + "grad_norm": 7.161581039428711, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8634917736053467, + "num_tokens": 398735275.0, + "step": 10446 + }, + { + "epoch": 1.3289657804350592, + "ewc_loss": 0.06167808175086975, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002847495779860765, + "grad_norm": 7.141945838928223, + "learning_rate": 1e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8472615480422974, + "num_tokens": 398774753.0, + "step": 10447 + }, + { + "epoch": 1.3290929907136497, + "ewc_loss": 0.06166272982954979, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002845960552804172, + "grad_norm": 7.152739524841309, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8562976121902466, + "num_tokens": 398810893.0, + "step": 10448 + }, + { + "epoch": 1.3292202009922403, + "ewc_loss": 0.06176241487264633, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002855928905773908, + "grad_norm": 7.175599098205566, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8567604422569275, + "num_tokens": 398849741.0, + "step": 10449 + }, + { + "epoch": 1.3293474112708306, + "ewc_loss": 0.06159339100122452, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000283902685623616, + "grad_norm": 7.202078342437744, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8557944297790527, + "num_tokens": 398881186.0, + "step": 10450 + }, + { + "epoch": 1.329474621549421, + "ewc_loss": 0.06158961355686188, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028386490885168314, + "grad_norm": 7.1285529136657715, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8792093992233276, + "num_tokens": 398918677.0, + "step": 10451 + }, + { + "epoch": 1.3296018318280116, + "ewc_loss": 0.06168802082538605, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028484893846325576, + "grad_norm": 7.173669338226318, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8600541949272156, + "num_tokens": 398953435.0, + "step": 10452 + }, + { + "epoch": 1.3297290421066021, + "ewc_loss": 0.06150353327393532, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028300407575443387, + "grad_norm": 7.090155124664307, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8683196306228638, + "num_tokens": 398991799.0, + "step": 10453 + }, + { + "epoch": 1.3298562523851927, + "ewc_loss": 0.06173159182071686, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000285284681012854, + "grad_norm": 7.170546531677246, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8658955097198486, + "num_tokens": 399023211.0, + "step": 10454 + }, + { + "epoch": 1.3299834626637832, + "ewc_loss": 0.06154961138963699, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028346487670205534, + "grad_norm": 7.110917568206787, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8575088977813721, + "num_tokens": 399061431.0, + "step": 10455 + }, + { + "epoch": 1.3301106729423737, + "ewc_loss": 0.06164898723363876, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002844586269930005, + "grad_norm": 7.1508989334106445, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8647180795669556, + "num_tokens": 399097384.0, + "step": 10456 + }, + { + "epoch": 1.3302378832209643, + "ewc_loss": 0.06158986687660217, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002838674408849329, + "grad_norm": 7.056121349334717, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8651812076568604, + "num_tokens": 399133935.0, + "step": 10457 + }, + { + "epoch": 1.3303650934995548, + "ewc_loss": 0.061492957174777985, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002853397454600781, + "grad_norm": 7.1989216804504395, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8670823574066162, + "num_tokens": 399165709.0, + "step": 10458 + }, + { + "epoch": 1.3304923037781453, + "ewc_loss": 0.06150033324956894, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028297206154093146, + "grad_norm": 7.051016330718994, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.859628438949585, + "num_tokens": 399203548.0, + "step": 10459 + }, + { + "epoch": 1.3306195140567358, + "ewc_loss": 0.061882827430963516, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002867970324587077, + "grad_norm": 7.134367942810059, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8606224060058594, + "num_tokens": 399244341.0, + "step": 10460 + }, + { + "epoch": 1.3307467243353264, + "ewc_loss": 0.06160372495651245, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002840060042217374, + "grad_norm": 7.048805236816406, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8679129481315613, + "num_tokens": 399282284.0, + "step": 10461 + }, + { + "epoch": 1.330873934613917, + "ewc_loss": 0.06187018007040024, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002866705763153732, + "grad_norm": 7.1152567863464355, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8767267465591431, + "num_tokens": 399320523.0, + "step": 10462 + }, + { + "epoch": 1.3310011448925074, + "ewc_loss": 0.06174812465906143, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002854500198736787, + "grad_norm": 7.107069492340088, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8627211451530457, + "num_tokens": 399357489.0, + "step": 10463 + }, + { + "epoch": 1.331128355171098, + "ewc_loss": 0.061721064150333405, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000285179412458092, + "grad_norm": 7.111140251159668, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8498357534408569, + "num_tokens": 399393561.0, + "step": 10464 + }, + { + "epoch": 1.3312555654496883, + "ewc_loss": 0.06150698661804199, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002854800259228796, + "grad_norm": 7.160722732543945, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8671280145645142, + "num_tokens": 399424514.0, + "step": 10465 + }, + { + "epoch": 1.3313827757282788, + "ewc_loss": 0.0614023357629776, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002844335394911468, + "grad_norm": 7.0724778175354, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8678193688392639, + "num_tokens": 399461372.0, + "step": 10466 + }, + { + "epoch": 1.3315099860068693, + "ewc_loss": 0.06159443408250809, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002863544796127826, + "grad_norm": 7.190605640411377, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8775230646133423, + "num_tokens": 399502503.0, + "step": 10467 + }, + { + "epoch": 1.3316371962854598, + "ewc_loss": 0.06155005842447281, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002834693295881152, + "grad_norm": 7.200113773345947, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8560383319854736, + "num_tokens": 399537836.0, + "step": 10468 + }, + { + "epoch": 1.3317644065640504, + "ewc_loss": 0.061614297330379486, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028411170933395624, + "grad_norm": 7.056405067443848, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8579044938087463, + "num_tokens": 399575189.0, + "step": 10469 + }, + { + "epoch": 1.331891616842641, + "ewc_loss": 0.06156110763549805, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028602121165022254, + "grad_norm": 7.194567680358887, + "learning_rate": 1e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8429699540138245, + "num_tokens": 399614406.0, + "step": 10470 + }, + { + "epoch": 1.3320188271212314, + "ewc_loss": 0.06159704923629761, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002839392691385001, + "grad_norm": 7.042306900024414, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8678292036056519, + "num_tokens": 399652847.0, + "step": 10471 + }, + { + "epoch": 1.332146037399822, + "ewc_loss": 0.06177614629268646, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002857302315533161, + "grad_norm": 7.133433818817139, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8515999913215637, + "num_tokens": 399691628.0, + "step": 10472 + }, + { + "epoch": 1.3322732476784125, + "ewc_loss": 0.06135908514261246, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002840010274667293, + "grad_norm": 7.079750061035156, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8731290698051453, + "num_tokens": 399732875.0, + "step": 10473 + }, + { + "epoch": 1.3324004579570028, + "ewc_loss": 0.061517052352428436, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028558066696859896, + "grad_norm": 7.145284175872803, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8734369874000549, + "num_tokens": 399769537.0, + "step": 10474 + }, + { + "epoch": 1.3325276682355933, + "ewc_loss": 0.06133757904171944, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000283785950159654, + "grad_norm": 7.130164623260498, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8470205068588257, + "num_tokens": 399807757.0, + "step": 10475 + }, + { + "epoch": 1.3326548785141838, + "ewc_loss": 0.061674997210502625, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002847186988219619, + "grad_norm": 7.18949556350708, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8688567280769348, + "num_tokens": 399837351.0, + "step": 10476 + }, + { + "epoch": 1.3327820887927744, + "ewc_loss": 0.06154334172606468, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002834021579474211, + "grad_norm": 7.115830421447754, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8810759782791138, + "num_tokens": 399873678.0, + "step": 10477 + }, + { + "epoch": 1.332909299071365, + "ewc_loss": 0.061669476330280304, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002846635179594159, + "grad_norm": 7.105536937713623, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8634271621704102, + "num_tokens": 399916206.0, + "step": 10478 + }, + { + "epoch": 1.3330365093499554, + "ewc_loss": 0.06165847182273865, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028455344727262855, + "grad_norm": 7.138280391693115, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8747838735580444, + "num_tokens": 399954144.0, + "step": 10479 + }, + { + "epoch": 1.333163719628546, + "ewc_loss": 0.061672043055295944, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028468918753787875, + "grad_norm": 7.142886638641357, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8714534044265747, + "num_tokens": 399995865.0, + "step": 10480 + }, + { + "epoch": 1.3332909299071365, + "ewc_loss": 0.061744704842567444, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000285415822872892, + "grad_norm": 7.1389851570129395, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8636415004730225, + "num_tokens": 400033234.0, + "step": 10481 + }, + { + "epoch": 1.333418140185727, + "ewc_loss": 0.061609622091054916, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028406496858224273, + "grad_norm": 7.171164035797119, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8706178665161133, + "num_tokens": 400066842.0, + "step": 10482 + }, + { + "epoch": 1.3335453504643175, + "ewc_loss": 0.061616960912942886, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002841383684426546, + "grad_norm": 7.115767955780029, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8735721707344055, + "num_tokens": 400099380.0, + "step": 10483 + }, + { + "epoch": 1.333672560742908, + "ewc_loss": 0.061693329364061356, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002849020529538393, + "grad_norm": 7.100405216217041, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8744724988937378, + "num_tokens": 400137419.0, + "step": 10484 + }, + { + "epoch": 1.3337997710214986, + "ewc_loss": 0.061717789620161057, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002851466415449977, + "grad_norm": 7.185887336730957, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8630579710006714, + "num_tokens": 400171086.0, + "step": 10485 + }, + { + "epoch": 1.3339269813000891, + "ewc_loss": 0.06152793765068054, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002832481113728136, + "grad_norm": 7.080679893493652, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8606700897216797, + "num_tokens": 400206763.0, + "step": 10486 + }, + { + "epoch": 1.3340541915786797, + "ewc_loss": 0.06171919405460358, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002851606986951083, + "grad_norm": 7.2101731300354, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8472820520401001, + "num_tokens": 400242708.0, + "step": 10487 + }, + { + "epoch": 1.3341814018572702, + "ewc_loss": 0.06146407872438431, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002826095442287624, + "grad_norm": 7.1040544509887695, + "learning_rate": 1e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8424744009971619, + "num_tokens": 400277417.0, + "step": 10488 + }, + { + "epoch": 1.3343086121358605, + "ewc_loss": 0.061854287981987, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002840702363755554, + "grad_norm": 7.108111381530762, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8612141013145447, + "num_tokens": 400315811.0, + "step": 10489 + }, + { + "epoch": 1.334435822414451, + "ewc_loss": 0.06160924583673477, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028406118508428335, + "grad_norm": 7.0513224601745605, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8435106873512268, + "num_tokens": 400356321.0, + "step": 10490 + }, + { + "epoch": 1.3345630326930416, + "ewc_loss": 0.06197467818856239, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002852741163223982, + "grad_norm": 7.11655330657959, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8614687919616699, + "num_tokens": 400392382.0, + "step": 10491 + }, + { + "epoch": 1.334690242971632, + "ewc_loss": 0.0616515688598156, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028448444209061563, + "grad_norm": 7.068282127380371, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8699897527694702, + "num_tokens": 400434947.0, + "step": 10492 + }, + { + "epoch": 1.3348174532502226, + "ewc_loss": 0.06201166659593582, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002856439969036728, + "grad_norm": 7.1205153465271, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8441171646118164, + "num_tokens": 400472742.0, + "step": 10493 + }, + { + "epoch": 1.3349446635288131, + "ewc_loss": 0.06169132888317108, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002848820586223155, + "grad_norm": 7.067528247833252, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8756014108657837, + "num_tokens": 400510029.0, + "step": 10494 + }, + { + "epoch": 1.3350718738074037, + "ewc_loss": 0.06179514527320862, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002859202213585377, + "grad_norm": 7.111146926879883, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8594046235084534, + "num_tokens": 400547383.0, + "step": 10495 + }, + { + "epoch": 1.3351990840859942, + "ewc_loss": 0.06173225864768028, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028529134579002857, + "grad_norm": 7.081634521484375, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.87389075756073, + "num_tokens": 400585242.0, + "step": 10496 + }, + { + "epoch": 1.3353262943645847, + "ewc_loss": 0.061823293566703796, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028620168450288475, + "grad_norm": 7.1431355476379395, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8743664026260376, + "num_tokens": 400623117.0, + "step": 10497 + }, + { + "epoch": 1.3354535046431752, + "ewc_loss": 0.06164885312318802, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028445725911296904, + "grad_norm": 7.032971382141113, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8525468707084656, + "num_tokens": 400660170.0, + "step": 10498 + }, + { + "epoch": 1.3355807149217656, + "ewc_loss": 0.061976730823516846, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002877360675483942, + "grad_norm": 7.208573818206787, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8691059947013855, + "num_tokens": 400700467.0, + "step": 10499 + }, + { + "epoch": 1.335707925200356, + "ewc_loss": 0.061582282185554504, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028379156719893217, + "grad_norm": 7.043508529663086, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8562413454055786, + "num_tokens": 400740269.0, + "step": 10500 + }, + { + "epoch": 1.3358351354789466, + "ewc_loss": 0.062014270573854446, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028811144875362515, + "grad_norm": 7.190164566040039, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8729191422462463, + "num_tokens": 400776465.0, + "step": 10501 + }, + { + "epoch": 1.3359623457575371, + "ewc_loss": 0.06168307363986969, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028479949105530977, + "grad_norm": 7.1585893630981445, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.857650876045227, + "num_tokens": 400811210.0, + "step": 10502 + }, + { + "epoch": 1.3360895560361277, + "ewc_loss": 0.0617280974984169, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028524972731247544, + "grad_norm": 7.109151363372803, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8549541234970093, + "num_tokens": 400848820.0, + "step": 10503 + }, + { + "epoch": 1.3362167663147182, + "ewc_loss": 0.061889924108982086, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028686801670119166, + "grad_norm": 7.103970527648926, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8645783066749573, + "num_tokens": 400888774.0, + "step": 10504 + }, + { + "epoch": 1.3363439765933087, + "ewc_loss": 0.06169307231903076, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028489946271292865, + "grad_norm": 7.054177284240723, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8640285730361938, + "num_tokens": 400935067.0, + "step": 10505 + }, + { + "epoch": 1.3364711868718993, + "ewc_loss": 0.06188163906335831, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002867851289920509, + "grad_norm": 7.121299743652344, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8727856874465942, + "num_tokens": 400974122.0, + "step": 10506 + }, + { + "epoch": 1.3365983971504898, + "ewc_loss": 0.06175627559423447, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028553151059895754, + "grad_norm": 7.100540637969971, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8717702627182007, + "num_tokens": 401008911.0, + "step": 10507 + }, + { + "epoch": 1.3367256074290803, + "ewc_loss": 0.06184743344783783, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028644310077652335, + "grad_norm": 7.167309761047363, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8693044185638428, + "num_tokens": 401039453.0, + "step": 10508 + }, + { + "epoch": 1.3368528177076708, + "ewc_loss": 0.06181560456752777, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028612479218281806, + "grad_norm": 7.115522384643555, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8729963302612305, + "num_tokens": 401075854.0, + "step": 10509 + }, + { + "epoch": 1.3369800279862614, + "ewc_loss": 0.0618164949119091, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002861336979549378, + "grad_norm": 7.133865833282471, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8619275689125061, + "num_tokens": 401114922.0, + "step": 10510 + }, + { + "epoch": 1.337107238264852, + "ewc_loss": 0.061816319823265076, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002861319517251104, + "grad_norm": 7.11501407623291, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8720113635063171, + "num_tokens": 401155833.0, + "step": 10511 + }, + { + "epoch": 1.3372344485434424, + "ewc_loss": 0.06194916367530823, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028746036696247756, + "grad_norm": 7.215213775634766, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8687782287597656, + "num_tokens": 401196971.0, + "step": 10512 + }, + { + "epoch": 1.337361658822033, + "ewc_loss": 0.06167290732264519, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002846978313755244, + "grad_norm": 7.110800743103027, + "learning_rate": 1e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8308823108673096, + "num_tokens": 401237312.0, + "step": 10513 + }, + { + "epoch": 1.3374888691006233, + "ewc_loss": 0.06187751144170761, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002867438888642937, + "grad_norm": 7.1857590675354, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.86053067445755, + "num_tokens": 401274630.0, + "step": 10514 + }, + { + "epoch": 1.3376160793792138, + "ewc_loss": 0.06174784153699875, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002854471676982939, + "grad_norm": 7.109588623046875, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8672689199447632, + "num_tokens": 401311780.0, + "step": 10515 + }, + { + "epoch": 1.3377432896578043, + "ewc_loss": 0.06194949150085449, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002874636556953192, + "grad_norm": 7.181464195251465, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8693902492523193, + "num_tokens": 401352614.0, + "step": 10516 + }, + { + "epoch": 1.3378704999363948, + "ewc_loss": 0.06186066195368767, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002865753776859492, + "grad_norm": 7.142995357513428, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8658708333969116, + "num_tokens": 401394495.0, + "step": 10517 + }, + { + "epoch": 1.3379977102149854, + "ewc_loss": 0.061791472136974335, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002858834923245013, + "grad_norm": 7.214295864105225, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.848885178565979, + "num_tokens": 401427758.0, + "step": 10518 + }, + { + "epoch": 1.338124920493576, + "ewc_loss": 0.06159142404794693, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002863244153559208, + "grad_norm": 7.089449882507324, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8473235368728638, + "num_tokens": 401473868.0, + "step": 10519 + }, + { + "epoch": 1.3382521307721664, + "ewc_loss": 0.061975087970495224, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028771962388418615, + "grad_norm": 7.21768045425415, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8462436199188232, + "num_tokens": 401510459.0, + "step": 10520 + }, + { + "epoch": 1.338379341050757, + "ewc_loss": 0.06173570826649666, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002853258338291198, + "grad_norm": 7.140372276306152, + "learning_rate": 1e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.8344120979309082, + "num_tokens": 401545283.0, + "step": 10521 + }, + { + "epoch": 1.3385065513293475, + "ewc_loss": 0.0619591549038887, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000287560309516266, + "grad_norm": 7.274886608123779, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8636999726295471, + "num_tokens": 401577166.0, + "step": 10522 + }, + { + "epoch": 1.3386337616079378, + "ewc_loss": 0.061825230717659, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028622106765396893, + "grad_norm": 7.100480079650879, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.875575602054596, + "num_tokens": 401619075.0, + "step": 10523 + }, + { + "epoch": 1.3387609718865283, + "ewc_loss": 0.06190424785017967, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028701123665086925, + "grad_norm": 7.147243022918701, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.857169508934021, + "num_tokens": 401661286.0, + "step": 10524 + }, + { + "epoch": 1.3388881821651188, + "ewc_loss": 0.06172150373458862, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028518380713649094, + "grad_norm": 7.104040622711182, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8550330400466919, + "num_tokens": 401699672.0, + "step": 10525 + }, + { + "epoch": 1.3390153924437094, + "ewc_loss": 0.061904940754175186, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028701816336251795, + "grad_norm": 7.178681373596191, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8558712005615234, + "num_tokens": 401732676.0, + "step": 10526 + }, + { + "epoch": 1.3391426027223, + "ewc_loss": 0.06184614449739456, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000286430207779631, + "grad_norm": 7.145660400390625, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8606334328651428, + "num_tokens": 401771593.0, + "step": 10527 + }, + { + "epoch": 1.3392698130008904, + "ewc_loss": 0.061881035566329956, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028677910449914634, + "grad_norm": 7.20660924911499, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8478697538375854, + "num_tokens": 401809605.0, + "step": 10528 + }, + { + "epoch": 1.339397023279481, + "ewc_loss": 0.06184759736061096, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028644470148719847, + "grad_norm": 7.121929168701172, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8543659448623657, + "num_tokens": 401845349.0, + "step": 10529 + }, + { + "epoch": 1.3395242335580715, + "ewc_loss": 0.06192774698138237, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002872462209779769, + "grad_norm": 7.161156177520752, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8851431012153625, + "num_tokens": 401883186.0, + "step": 10530 + }, + { + "epoch": 1.339651443836662, + "ewc_loss": 0.06182760000228882, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028624472906813025, + "grad_norm": 7.209968566894531, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.847928524017334, + "num_tokens": 401921149.0, + "step": 10531 + }, + { + "epoch": 1.3397786541152525, + "ewc_loss": 0.06199566647410393, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00028548401314765215, + "grad_norm": 7.116291522979736, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8559123277664185, + "num_tokens": 401961813.0, + "step": 10532 + }, + { + "epoch": 1.339905864393843, + "ewc_loss": 0.061864010989665985, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002866088761948049, + "grad_norm": 7.164890766143799, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8454526662826538, + "num_tokens": 402001212.0, + "step": 10533 + }, + { + "epoch": 1.3400330746724336, + "ewc_loss": 0.061744481325149536, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002854135527741164, + "grad_norm": 7.057008266448975, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8564660549163818, + "num_tokens": 402044648.0, + "step": 10534 + }, + { + "epoch": 1.3401602849510241, + "ewc_loss": 0.06189341098070145, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028690288309007883, + "grad_norm": 7.170807361602783, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8648688793182373, + "num_tokens": 402087391.0, + "step": 10535 + }, + { + "epoch": 1.3402874952296147, + "ewc_loss": 0.06174545735120773, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002854233025573194, + "grad_norm": 7.101787567138672, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8814847469329834, + "num_tokens": 402119598.0, + "step": 10536 + }, + { + "epoch": 1.3404147055082052, + "ewc_loss": 0.0617997832596302, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028596658376045525, + "grad_norm": 7.187045097351074, + "learning_rate": 1e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.8371744155883789, + "num_tokens": 402153628.0, + "step": 10537 + }, + { + "epoch": 1.3405419157867955, + "ewc_loss": 0.06169889122247696, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002849576703738421, + "grad_norm": 7.063128471374512, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8794646263122559, + "num_tokens": 402190876.0, + "step": 10538 + }, + { + "epoch": 1.340669126065386, + "ewc_loss": 0.061815276741981506, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002861215325538069, + "grad_norm": 7.147245407104492, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8708994388580322, + "num_tokens": 402227168.0, + "step": 10539 + }, + { + "epoch": 1.3407963363439765, + "ewc_loss": 0.06165684759616852, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028453723643906415, + "grad_norm": 7.046093463897705, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.87083899974823, + "num_tokens": 402262731.0, + "step": 10540 + }, + { + "epoch": 1.340923546622567, + "ewc_loss": 0.06185632944107056, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002865320711862296, + "grad_norm": 7.191683769226074, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8642768263816833, + "num_tokens": 402299992.0, + "step": 10541 + }, + { + "epoch": 1.3410507569011576, + "ewc_loss": 0.06162020191550255, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028417076100595295, + "grad_norm": 7.0983147621154785, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8552635908126831, + "num_tokens": 402339129.0, + "step": 10542 + }, + { + "epoch": 1.3411779671797481, + "ewc_loss": 0.06187731400132179, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002867418806999922, + "grad_norm": 7.221716403961182, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8452800512313843, + "num_tokens": 402375380.0, + "step": 10543 + }, + { + "epoch": 1.3413051774583387, + "ewc_loss": 0.06166917458176613, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002846604911610484, + "grad_norm": 7.103224277496338, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8519231081008911, + "num_tokens": 402412159.0, + "step": 10544 + }, + { + "epoch": 1.3414323877369292, + "ewc_loss": 0.06181640923023224, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002861328248400241, + "grad_norm": 7.1635565757751465, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8589083552360535, + "num_tokens": 402446676.0, + "step": 10545 + }, + { + "epoch": 1.3415595980155197, + "ewc_loss": 0.06168818473815918, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002848505973815918, + "grad_norm": 7.040209770202637, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8648048043251038, + "num_tokens": 402490741.0, + "step": 10546 + }, + { + "epoch": 1.3416868082941102, + "ewc_loss": 0.06189168989658356, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028688565362244844, + "grad_norm": 7.1892218589782715, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8721935749053955, + "num_tokens": 402527398.0, + "step": 10547 + }, + { + "epoch": 1.3418140185727006, + "ewc_loss": 0.0617394857108593, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028536361060105264, + "grad_norm": 7.119486331939697, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.862169623374939, + "num_tokens": 402565914.0, + "step": 10548 + }, + { + "epoch": 1.341941228851291, + "ewc_loss": 0.06178228557109833, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002857915824279189, + "grad_norm": 7.125033378601074, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8661595582962036, + "num_tokens": 402600671.0, + "step": 10549 + }, + { + "epoch": 1.3420684391298816, + "ewc_loss": 0.06167859584093094, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002847547293640673, + "grad_norm": 7.066450119018555, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8749220371246338, + "num_tokens": 402637381.0, + "step": 10550 + }, + { + "epoch": 1.3421956494084721, + "ewc_loss": 0.06176542490720749, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028562298393808305, + "grad_norm": 7.129669189453125, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8676595687866211, + "num_tokens": 402671190.0, + "step": 10551 + }, + { + "epoch": 1.3423228596870627, + "ewc_loss": 0.06171908974647522, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002851596218533814, + "grad_norm": 7.051276683807373, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8629343509674072, + "num_tokens": 402721518.0, + "step": 10552 + }, + { + "epoch": 1.3424500699656532, + "ewc_loss": 0.06178602576255798, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028582900995388627, + "grad_norm": 7.166520118713379, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8624749779701233, + "num_tokens": 402758949.0, + "step": 10553 + }, + { + "epoch": 1.3425772802442437, + "ewc_loss": 0.061651427298784256, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028448301600292325, + "grad_norm": 7.126514434814453, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8567630052566528, + "num_tokens": 402794342.0, + "step": 10554 + }, + { + "epoch": 1.3427044905228342, + "ewc_loss": 0.061828263103961945, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028625139384530485, + "grad_norm": 7.113490581512451, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8742841482162476, + "num_tokens": 402831031.0, + "step": 10555 + }, + { + "epoch": 1.3428317008014248, + "ewc_loss": 0.061772365123033524, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028569239657372236, + "grad_norm": 7.084567546844482, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8732137680053711, + "num_tokens": 402871486.0, + "step": 10556 + }, + { + "epoch": 1.3429589110800153, + "ewc_loss": 0.06177914887666702, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000285760237602517, + "grad_norm": 7.121438026428223, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.875114917755127, + "num_tokens": 402916024.0, + "step": 10557 + }, + { + "epoch": 1.3430861213586058, + "ewc_loss": 0.06184927374124527, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000286461494397372, + "grad_norm": 7.224330425262451, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8448183536529541, + "num_tokens": 402952116.0, + "step": 10558 + }, + { + "epoch": 1.3432133316371964, + "ewc_loss": 0.061856552958488464, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002865342830773443, + "grad_norm": 7.107747554779053, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8613991737365723, + "num_tokens": 402998064.0, + "step": 10559 + }, + { + "epoch": 1.343340541915787, + "ewc_loss": 0.061888016760349274, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002868488954845816, + "grad_norm": 7.163656234741211, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8664683103561401, + "num_tokens": 403041763.0, + "step": 10560 + }, + { + "epoch": 1.3434677521943774, + "ewc_loss": 0.061742350459098816, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028539224877022207, + "grad_norm": 7.137576103210449, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8745659589767456, + "num_tokens": 403080956.0, + "step": 10561 + }, + { + "epoch": 1.343594962472968, + "ewc_loss": 0.06202353537082672, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028820408624596894, + "grad_norm": 7.2782063484191895, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8714224696159363, + "num_tokens": 403117200.0, + "step": 10562 + }, + { + "epoch": 1.3437221727515583, + "ewc_loss": 0.06166572868824005, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002846260613296181, + "grad_norm": 7.120846271514893, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8635997772216797, + "num_tokens": 403154112.0, + "step": 10563 + }, + { + "epoch": 1.3438493830301488, + "ewc_loss": 0.061943817883729935, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000287406932329759, + "grad_norm": 7.186936855316162, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.860668957233429, + "num_tokens": 403188265.0, + "step": 10564 + }, + { + "epoch": 1.3439765933087393, + "ewc_loss": 0.061751991510391235, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002854886697605252, + "grad_norm": 7.0956034660339355, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.840067982673645, + "num_tokens": 403231236.0, + "step": 10565 + }, + { + "epoch": 1.3441038035873298, + "ewc_loss": 0.06195037066936493, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002874724450521171, + "grad_norm": 7.159703254699707, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8646548986434937, + "num_tokens": 403268642.0, + "step": 10566 + }, + { + "epoch": 1.3442310138659204, + "ewc_loss": 0.06189616024494171, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028693035710603, + "grad_norm": 7.135545253753662, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8712368607521057, + "num_tokens": 403307679.0, + "step": 10567 + }, + { + "epoch": 1.344358224144511, + "ewc_loss": 0.06188041344285011, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002867728762794286, + "grad_norm": 7.1733717918396, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8680521249771118, + "num_tokens": 403346395.0, + "step": 10568 + }, + { + "epoch": 1.3444854344231014, + "ewc_loss": 0.06162241846323013, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028663434204645455, + "grad_norm": 7.149745464324951, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8708958029747009, + "num_tokens": 403383001.0, + "step": 10569 + }, + { + "epoch": 1.344612644701692, + "ewc_loss": 0.061937764286994934, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028734636725857854, + "grad_norm": 7.171577453613281, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8543359041213989, + "num_tokens": 403426090.0, + "step": 10570 + }, + { + "epoch": 1.3447398549802825, + "ewc_loss": 0.06174783036112785, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002854470512829721, + "grad_norm": 7.081818580627441, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8558616638183594, + "num_tokens": 403470231.0, + "step": 10571 + }, + { + "epoch": 1.3448670652588728, + "ewc_loss": 0.0619383305311203, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028735207160934806, + "grad_norm": 7.1504292488098145, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8686632513999939, + "num_tokens": 403508677.0, + "step": 10572 + }, + { + "epoch": 1.3449942755374633, + "ewc_loss": 0.06180715188384056, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002860402746591717, + "grad_norm": 7.204520225524902, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8433902263641357, + "num_tokens": 403545175.0, + "step": 10573 + }, + { + "epoch": 1.3451214858160538, + "ewc_loss": 0.06193477287888527, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002873164776246995, + "grad_norm": 7.1414875984191895, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8771721124649048, + "num_tokens": 403583321.0, + "step": 10574 + }, + { + "epoch": 1.3452486960946444, + "ewc_loss": 0.06192086637020111, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028717739041894674, + "grad_norm": 7.176749229431152, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8559887409210205, + "num_tokens": 403618547.0, + "step": 10575 + }, + { + "epoch": 1.345375906373235, + "ewc_loss": 0.061555974185466766, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028596987249329686, + "grad_norm": 7.1300578117370605, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8664107322692871, + "num_tokens": 403657552.0, + "step": 10576 + }, + { + "epoch": 1.3455031166518254, + "ewc_loss": 0.061751626431941986, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002879264357034117, + "grad_norm": 7.165181636810303, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8648830652236938, + "num_tokens": 403697021.0, + "step": 10577 + }, + { + "epoch": 1.345630326930416, + "ewc_loss": 0.06147949770092964, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002852051402442157, + "grad_norm": 7.120011329650879, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8527432084083557, + "num_tokens": 403736480.0, + "step": 10578 + }, + { + "epoch": 1.3457575372090065, + "ewc_loss": 0.06171578913927078, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000287568022031337, + "grad_norm": 7.16621208190918, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.870894193649292, + "num_tokens": 403772056.0, + "step": 10579 + }, + { + "epoch": 1.345884747487597, + "ewc_loss": 0.061491191387176514, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002853220503311604, + "grad_norm": 7.122348308563232, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8580107092857361, + "num_tokens": 403813646.0, + "step": 10580 + }, + { + "epoch": 1.3460119577661875, + "ewc_loss": 0.0617060586810112, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002874707570299506, + "grad_norm": 7.128159046173096, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8665029406547546, + "num_tokens": 403853904.0, + "step": 10581 + }, + { + "epoch": 1.346139168044778, + "ewc_loss": 0.06183052808046341, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028627400752156973, + "grad_norm": 7.120222568511963, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8460230827331543, + "num_tokens": 403898528.0, + "step": 10582 + }, + { + "epoch": 1.3462663783233686, + "ewc_loss": 0.06167732924222946, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028718344401568174, + "grad_norm": 7.248006820678711, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8483081459999084, + "num_tokens": 403929518.0, + "step": 10583 + }, + { + "epoch": 1.3463935886019591, + "ewc_loss": 0.0617406852543354, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002853756013792008, + "grad_norm": 7.191966533660889, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8648825287818909, + "num_tokens": 403970323.0, + "step": 10584 + }, + { + "epoch": 1.3465207988805497, + "ewc_loss": 0.061496444046497345, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002853746118489653, + "grad_norm": 7.094494342803955, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8670141100883484, + "num_tokens": 404012499.0, + "step": 10585 + }, + { + "epoch": 1.3466480091591402, + "ewc_loss": 0.06164957582950592, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002869059389922768, + "grad_norm": 7.149016380310059, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8586356043815613, + "num_tokens": 404055370.0, + "step": 10586 + }, + { + "epoch": 1.3467752194377305, + "ewc_loss": 0.061465442180633545, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028506459784694016, + "grad_norm": 7.073723793029785, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.864028811454773, + "num_tokens": 404092498.0, + "step": 10587 + }, + { + "epoch": 1.346902429716321, + "ewc_loss": 0.06200805678963661, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028804931207560003, + "grad_norm": 7.168639183044434, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8687189817428589, + "num_tokens": 404132466.0, + "step": 10588 + }, + { + "epoch": 1.3470296399949115, + "ewc_loss": 0.06181769445538521, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000286145688733086, + "grad_norm": 7.068221092224121, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8628984093666077, + "num_tokens": 404171913.0, + "step": 10589 + }, + { + "epoch": 1.347156850273502, + "ewc_loss": 0.062143340706825256, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002894021454267204, + "grad_norm": 7.198195934295654, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8769792318344116, + "num_tokens": 404205817.0, + "step": 10590 + }, + { + "epoch": 1.3472840605520926, + "ewc_loss": 0.0618482381105423, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002864511334337294, + "grad_norm": 7.063830375671387, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8667466640472412, + "num_tokens": 404243922.0, + "step": 10591 + }, + { + "epoch": 1.3474112708306831, + "ewc_loss": 0.06209234148263931, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002888921881094575, + "grad_norm": 7.188236713409424, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8653368353843689, + "num_tokens": 404283392.0, + "step": 10592 + }, + { + "epoch": 1.3475384811092737, + "ewc_loss": 0.06195472180843353, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002875159843824804, + "grad_norm": 7.109871864318848, + "learning_rate": 1e-06, + "loss": 0.5455, + "mean_token_accuracy": 0.8368827700614929, + "num_tokens": 404323044.0, + "step": 10593 + }, + { + "epoch": 1.3476656913878642, + "ewc_loss": 0.06203535199165344, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002883222477976233, + "grad_norm": 7.193171977996826, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8681216239929199, + "num_tokens": 404359761.0, + "step": 10594 + }, + { + "epoch": 1.3477929016664547, + "ewc_loss": 0.061932194977998734, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002872906916309148, + "grad_norm": 7.143002986907959, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8493617177009583, + "num_tokens": 404395354.0, + "step": 10595 + }, + { + "epoch": 1.3479201119450452, + "ewc_loss": 0.06196025013923645, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002875712525565177, + "grad_norm": 7.160187244415283, + "learning_rate": 1e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8466829061508179, + "num_tokens": 404432052.0, + "step": 10596 + }, + { + "epoch": 1.3480473222236355, + "ewc_loss": 0.06199953705072403, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028796412516385317, + "grad_norm": 7.148678779602051, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8559368848800659, + "num_tokens": 404472689.0, + "step": 10597 + }, + { + "epoch": 1.348174532502226, + "ewc_loss": 0.06191267445683479, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002870954922400415, + "grad_norm": 7.126290321350098, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8476779460906982, + "num_tokens": 404515916.0, + "step": 10598 + }, + { + "epoch": 1.3483017427808166, + "ewc_loss": 0.06197492778301239, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002877180522773415, + "grad_norm": 7.093927383422852, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8612440824508667, + "num_tokens": 404557399.0, + "step": 10599 + }, + { + "epoch": 1.3484289530594071, + "ewc_loss": 0.061975911259651184, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028772788937203586, + "grad_norm": 7.2098798751831055, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8586810827255249, + "num_tokens": 404592573.0, + "step": 10600 + }, + { + "epoch": 1.3485561633379977, + "ewc_loss": 0.06211652606725693, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002866925788111985, + "grad_norm": 7.162913799285889, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8594473600387573, + "num_tokens": 404635266.0, + "step": 10601 + }, + { + "epoch": 1.3486833736165882, + "ewc_loss": 0.062034137547016144, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002883101406041533, + "grad_norm": 7.090775012969971, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.870948076248169, + "num_tokens": 404673004.0, + "step": 10602 + }, + { + "epoch": 1.3488105838951787, + "ewc_loss": 0.06198328733444214, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002878016384784132, + "grad_norm": 7.231326580047607, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8473992347717285, + "num_tokens": 404705153.0, + "step": 10603 + }, + { + "epoch": 1.3489377941737692, + "ewc_loss": 0.06191246956586838, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002870934549719095, + "grad_norm": 7.133449554443359, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8535455465316772, + "num_tokens": 404737068.0, + "step": 10604 + }, + { + "epoch": 1.3490650044523598, + "ewc_loss": 0.062029361724853516, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002882623521145433, + "grad_norm": 7.182643890380859, + "learning_rate": 1e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8426722288131714, + "num_tokens": 404773680.0, + "step": 10605 + }, + { + "epoch": 1.3491922147309503, + "ewc_loss": 0.06160853058099747, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000286495458567515, + "grad_norm": 7.092807769775391, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8682477474212646, + "num_tokens": 404809937.0, + "step": 10606 + }, + { + "epoch": 1.3493194250095408, + "ewc_loss": 0.06211397051811218, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000289108429569751, + "grad_norm": 7.161382675170898, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8532470464706421, + "num_tokens": 404847146.0, + "step": 10607 + }, + { + "epoch": 1.3494466352881314, + "ewc_loss": 0.06193622946739197, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002873310586437583, + "grad_norm": 7.163536548614502, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8634012937545776, + "num_tokens": 404882952.0, + "step": 10608 + }, + { + "epoch": 1.3495738455667219, + "ewc_loss": 0.061967633664608, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028764508897438645, + "grad_norm": 7.080390453338623, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8724946975708008, + "num_tokens": 404925347.0, + "step": 10609 + }, + { + "epoch": 1.3497010558453124, + "ewc_loss": 0.06211388111114502, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002891075564548373, + "grad_norm": 7.1920061111450195, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8482308387756348, + "num_tokens": 404963411.0, + "step": 10610 + }, + { + "epoch": 1.349828266123903, + "ewc_loss": 0.061937782913446426, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028734657098539174, + "grad_norm": 7.141038417816162, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.86160808801651, + "num_tokens": 404997522.0, + "step": 10611 + }, + { + "epoch": 1.3499554764024932, + "ewc_loss": 0.06215173378586769, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002894860808737576, + "grad_norm": 7.193608283996582, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8598443269729614, + "num_tokens": 405034049.0, + "step": 10612 + }, + { + "epoch": 1.3500826866810838, + "ewc_loss": 0.06193482503294945, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002873170014936477, + "grad_norm": 7.148104667663574, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8593885898590088, + "num_tokens": 405074985.0, + "step": 10613 + }, + { + "epoch": 1.3502098969596743, + "ewc_loss": 0.062056560069322586, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028853435651399195, + "grad_norm": 7.124995231628418, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8775601387023926, + "num_tokens": 405116133.0, + "step": 10614 + }, + { + "epoch": 1.3503371072382648, + "ewc_loss": 0.061999693512916565, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028796569677069783, + "grad_norm": 7.114427089691162, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8609917163848877, + "num_tokens": 405155572.0, + "step": 10615 + }, + { + "epoch": 1.3504643175168554, + "ewc_loss": 0.062029194086790085, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002882606931962073, + "grad_norm": 7.184566497802734, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8654405474662781, + "num_tokens": 405194700.0, + "step": 10616 + }, + { + "epoch": 1.350591527795446, + "ewc_loss": 0.06193840131163597, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000287352770101279, + "grad_norm": 7.139726161956787, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8577017188072205, + "num_tokens": 405233582.0, + "step": 10617 + }, + { + "epoch": 1.3507187380740364, + "ewc_loss": 0.061937421560287476, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002873429621104151, + "grad_norm": 7.115219593048096, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8738796710968018, + "num_tokens": 405274987.0, + "step": 10618 + }, + { + "epoch": 1.350845948352627, + "ewc_loss": 0.061914846301078796, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002871172328013927, + "grad_norm": 7.188255310058594, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8592735528945923, + "num_tokens": 405308151.0, + "step": 10619 + }, + { + "epoch": 1.3509731586312175, + "ewc_loss": 0.0618596076965332, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028656484209932387, + "grad_norm": 7.120128154754639, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8607417345046997, + "num_tokens": 405343939.0, + "step": 10620 + }, + { + "epoch": 1.3511003689098078, + "ewc_loss": 0.06196107715368271, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028757951804436743, + "grad_norm": 7.172968864440918, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8613804578781128, + "num_tokens": 405384211.0, + "step": 10621 + }, + { + "epoch": 1.3512275791883983, + "ewc_loss": 0.0618106909096241, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002860756649170071, + "grad_norm": 7.242579460144043, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8692706227302551, + "num_tokens": 405419971.0, + "step": 10622 + }, + { + "epoch": 1.3513547894669888, + "ewc_loss": 0.06154409795999527, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028585115796886384, + "grad_norm": 7.169429302215576, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8554327487945557, + "num_tokens": 405451322.0, + "step": 10623 + }, + { + "epoch": 1.3514819997455794, + "ewc_loss": 0.0616125762462616, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002865359128918499, + "grad_norm": 7.1633758544921875, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8541730642318726, + "num_tokens": 405496101.0, + "step": 10624 + }, + { + "epoch": 1.35160921002417, + "ewc_loss": 0.061534978449344635, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028575994656421244, + "grad_norm": 7.191735744476318, + "learning_rate": 1e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8453523516654968, + "num_tokens": 405530034.0, + "step": 10625 + }, + { + "epoch": 1.3517364203027604, + "ewc_loss": 0.06146950274705887, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002851051976904273, + "grad_norm": 7.055637836456299, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8741132616996765, + "num_tokens": 405570893.0, + "step": 10626 + }, + { + "epoch": 1.351863630581351, + "ewc_loss": 0.061598144471645355, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002863916161004454, + "grad_norm": 7.133618354797363, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8494527339935303, + "num_tokens": 405610120.0, + "step": 10627 + }, + { + "epoch": 1.3519908408599415, + "ewc_loss": 0.06150916591286659, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028550182469189167, + "grad_norm": 7.125572204589844, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8614521026611328, + "num_tokens": 405651878.0, + "step": 10628 + }, + { + "epoch": 1.352118051138532, + "ewc_loss": 0.06179027259349823, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002858715015463531, + "grad_norm": 7.0926923751831055, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8670439124107361, + "num_tokens": 405694578.0, + "step": 10629 + }, + { + "epoch": 1.3522452614171225, + "ewc_loss": 0.06155729666352272, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002859831147361547, + "grad_norm": 7.095677375793457, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8679221868515015, + "num_tokens": 405734128.0, + "step": 10630 + }, + { + "epoch": 1.352372471695713, + "ewc_loss": 0.061936840415000916, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002873371704481542, + "grad_norm": 7.233703136444092, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8680742979049683, + "num_tokens": 405776588.0, + "step": 10631 + }, + { + "epoch": 1.3524996819743036, + "ewc_loss": 0.061476461589336395, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028517478494904935, + "grad_norm": 7.086262226104736, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8580397963523865, + "num_tokens": 405810867.0, + "step": 10632 + }, + { + "epoch": 1.3526268922528941, + "ewc_loss": 0.06175637245178223, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002879738749470562, + "grad_norm": 7.172982215881348, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8798925876617432, + "num_tokens": 405848884.0, + "step": 10633 + }, + { + "epoch": 1.3527541025314846, + "ewc_loss": 0.06150294840335846, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028543962980620563, + "grad_norm": 7.072483062744141, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8800760507583618, + "num_tokens": 405888759.0, + "step": 10634 + }, + { + "epoch": 1.3528813128100752, + "ewc_loss": 0.06178486719727516, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028825883055105805, + "grad_norm": 7.180984020233154, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8753763437271118, + "num_tokens": 405925686.0, + "step": 10635 + }, + { + "epoch": 1.3530085230886655, + "ewc_loss": 0.06153407692909241, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028575092437677085, + "grad_norm": 7.168488025665283, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8759924173355103, + "num_tokens": 405960836.0, + "step": 10636 + }, + { + "epoch": 1.353135733367256, + "ewc_loss": 0.06170745939016342, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028748472686856985, + "grad_norm": 7.181109428405762, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.873531699180603, + "num_tokens": 406000314.0, + "step": 10637 + }, + { + "epoch": 1.3532629436458465, + "ewc_loss": 0.061611469835042953, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002865248534362763, + "grad_norm": 7.112630844116211, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8723458051681519, + "num_tokens": 406039468.0, + "step": 10638 + }, + { + "epoch": 1.353390153924437, + "ewc_loss": 0.06176454573869705, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028805562760680914, + "grad_norm": 7.168119430541992, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8863722085952759, + "num_tokens": 406076828.0, + "step": 10639 + }, + { + "epoch": 1.3535173642030276, + "ewc_loss": 0.061491891741752625, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028532909345813096, + "grad_norm": 7.127234935760498, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.861703634262085, + "num_tokens": 406116504.0, + "step": 10640 + }, + { + "epoch": 1.3536445744816181, + "ewc_loss": 0.06167703494429588, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002871805045288056, + "grad_norm": 7.1315016746521, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8613165020942688, + "num_tokens": 406155184.0, + "step": 10641 + }, + { + "epoch": 1.3537717847602087, + "ewc_loss": 0.06199119612574577, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028788071358576417, + "grad_norm": 7.209802627563477, + "learning_rate": 1e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8431211709976196, + "num_tokens": 406195411.0, + "step": 10642 + }, + { + "epoch": 1.3538989950387992, + "ewc_loss": 0.061664991080760956, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002870600437745452, + "grad_norm": 7.168029308319092, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8531560897827148, + "num_tokens": 406230159.0, + "step": 10643 + }, + { + "epoch": 1.3540262053173897, + "ewc_loss": 0.061770372092723846, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028811386437155306, + "grad_norm": 7.200504302978516, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8508391380310059, + "num_tokens": 406264541.0, + "step": 10644 + }, + { + "epoch": 1.3541534155959802, + "ewc_loss": 0.06166671961545944, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002870773314498365, + "grad_norm": 7.129233360290527, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8693544268608093, + "num_tokens": 406301869.0, + "step": 10645 + }, + { + "epoch": 1.3542806258745705, + "ewc_loss": 0.06175784766674042, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028798863058909774, + "grad_norm": 7.150782108306885, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8648030757904053, + "num_tokens": 406338819.0, + "step": 10646 + }, + { + "epoch": 1.354407836153161, + "ewc_loss": 0.0617251880466938, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028766202740371227, + "grad_norm": 7.183924198150635, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8642761707305908, + "num_tokens": 406375914.0, + "step": 10647 + }, + { + "epoch": 1.3545350464317516, + "ewc_loss": 0.061625100672245026, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002866611466743052, + "grad_norm": 7.113931655883789, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8681718111038208, + "num_tokens": 406418543.0, + "step": 10648 + }, + { + "epoch": 1.3546622567103421, + "ewc_loss": 0.06171827018260956, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002875928767025471, + "grad_norm": 7.184156894683838, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8602839708328247, + "num_tokens": 406455731.0, + "step": 10649 + }, + { + "epoch": 1.3547894669889327, + "ewc_loss": 0.061647333204746246, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028688349993899465, + "grad_norm": 7.145287990570068, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8628783822059631, + "num_tokens": 406496266.0, + "step": 10650 + }, + { + "epoch": 1.3549166772675232, + "ewc_loss": 0.06194273382425308, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002873961057048291, + "grad_norm": 7.154986381530762, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8576369881629944, + "num_tokens": 406541676.0, + "step": 10651 + }, + { + "epoch": 1.3550438875461137, + "ewc_loss": 0.06192181259393692, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002871868491638452, + "grad_norm": 7.164453506469727, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.858031153678894, + "num_tokens": 406578997.0, + "step": 10652 + }, + { + "epoch": 1.3551710978247042, + "ewc_loss": 0.062004148960113525, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028801022563129663, + "grad_norm": 7.191368103027344, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8671532273292542, + "num_tokens": 406619404.0, + "step": 10653 + }, + { + "epoch": 1.3552983081032948, + "ewc_loss": 0.06192667409777641, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002872354816645384, + "grad_norm": 7.144370079040527, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8815990686416626, + "num_tokens": 406656092.0, + "step": 10654 + }, + { + "epoch": 1.3554255183818853, + "ewc_loss": 0.06210402026772499, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002890089526772499, + "grad_norm": 7.220818996429443, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8457788228988647, + "num_tokens": 406701600.0, + "step": 10655 + }, + { + "epoch": 1.3555527286604758, + "ewc_loss": 0.061825014650821686, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002862188848666847, + "grad_norm": 7.112587928771973, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8619474768638611, + "num_tokens": 406739649.0, + "step": 10656 + }, + { + "epoch": 1.3556799389390664, + "ewc_loss": 0.06204819306731224, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028845068300142884, + "grad_norm": 7.210165977478027, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8810778856277466, + "num_tokens": 406774962.0, + "step": 10657 + }, + { + "epoch": 1.3558071492176569, + "ewc_loss": 0.06187160313129425, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002866848080884665, + "grad_norm": 7.138874053955078, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8584945797920227, + "num_tokens": 406818047.0, + "step": 10658 + }, + { + "epoch": 1.3559343594962474, + "ewc_loss": 0.06200987100601196, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028806747286580503, + "grad_norm": 7.151252746582031, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8556101322174072, + "num_tokens": 406859464.0, + "step": 10659 + }, + { + "epoch": 1.356061569774838, + "ewc_loss": 0.06191223859786987, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000287091126665473, + "grad_norm": 7.1120147705078125, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.861351728439331, + "num_tokens": 406896444.0, + "step": 10660 + }, + { + "epoch": 1.3561887800534282, + "ewc_loss": 0.062077462673187256, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028874335112050176, + "grad_norm": 7.1840362548828125, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8570776581764221, + "num_tokens": 406934623.0, + "step": 10661 + }, + { + "epoch": 1.3563159903320188, + "ewc_loss": 0.06190437823534012, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002870125463232398, + "grad_norm": 7.1013946533203125, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8553965091705322, + "num_tokens": 406974290.0, + "step": 10662 + }, + { + "epoch": 1.3564432006106093, + "ewc_loss": 0.06214572489261627, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002894260105676949, + "grad_norm": 7.181281089782715, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8806639313697815, + "num_tokens": 407011279.0, + "step": 10663 + }, + { + "epoch": 1.3565704108891998, + "ewc_loss": 0.06196754053235054, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028764415765181184, + "grad_norm": 7.147426128387451, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.851658821105957, + "num_tokens": 407054733.0, + "step": 10664 + }, + { + "epoch": 1.3566976211677904, + "ewc_loss": 0.0620422288775444, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028839102014899254, + "grad_norm": 7.142723083496094, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8633973598480225, + "num_tokens": 407098410.0, + "step": 10665 + }, + { + "epoch": 1.3568248314463809, + "ewc_loss": 0.06215107440948486, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002894794743042439, + "grad_norm": 7.179802417755127, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8571052551269531, + "num_tokens": 407134231.0, + "step": 10666 + }, + { + "epoch": 1.3569520417249714, + "ewc_loss": 0.062059253454208374, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028856127755716443, + "grad_norm": 7.208798408508301, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8476429581642151, + "num_tokens": 407167231.0, + "step": 10667 + }, + { + "epoch": 1.357079252003562, + "ewc_loss": 0.062068551778793335, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028865429339930415, + "grad_norm": 7.160418510437012, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8644330501556396, + "num_tokens": 407206604.0, + "step": 10668 + }, + { + "epoch": 1.3572064622821525, + "ewc_loss": 0.062069833278656006, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002886670990847051, + "grad_norm": 7.157474994659424, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8649629950523376, + "num_tokens": 407248247.0, + "step": 10669 + }, + { + "epoch": 1.3573336725607428, + "ewc_loss": 0.06219121068716049, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028988084523007274, + "grad_norm": 7.195596694946289, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8731626868247986, + "num_tokens": 407282870.0, + "step": 10670 + }, + { + "epoch": 1.3574608828393333, + "ewc_loss": 0.06204312667250633, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028840001323260367, + "grad_norm": 7.1762518882751465, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8685297966003418, + "num_tokens": 407318704.0, + "step": 10671 + }, + { + "epoch": 1.3575880931179238, + "ewc_loss": 0.062120310962200165, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028917184681631625, + "grad_norm": 7.1749701499938965, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8652952909469604, + "num_tokens": 407360003.0, + "step": 10672 + }, + { + "epoch": 1.3577153033965144, + "ewc_loss": 0.06218212842941284, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028979001217521727, + "grad_norm": 7.2502899169921875, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.856453537940979, + "num_tokens": 407399473.0, + "step": 10673 + }, + { + "epoch": 1.357842513675105, + "ewc_loss": 0.0619565024971962, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002875337959267199, + "grad_norm": 7.120194435119629, + "learning_rate": 1e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8325721025466919, + "num_tokens": 407440576.0, + "step": 10674 + }, + { + "epoch": 1.3579697239536954, + "ewc_loss": 0.06218995526432991, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002898683014791459, + "grad_norm": 7.182372093200684, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8598393797874451, + "num_tokens": 407481057.0, + "step": 10675 + }, + { + "epoch": 1.358096934232286, + "ewc_loss": 0.061914797872304916, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002871167380362749, + "grad_norm": 7.091026306152344, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.874638557434082, + "num_tokens": 407517104.0, + "step": 10676 + }, + { + "epoch": 1.3582241445108765, + "ewc_loss": 0.062163516879081726, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002896039222832769, + "grad_norm": 7.233141899108887, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8687434792518616, + "num_tokens": 407552604.0, + "step": 10677 + }, + { + "epoch": 1.358351354789467, + "ewc_loss": 0.06202341616153717, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028820292209275067, + "grad_norm": 7.124767780303955, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8769954442977905, + "num_tokens": 407588282.0, + "step": 10678 + }, + { + "epoch": 1.3584785650680575, + "ewc_loss": 0.062187887728214264, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002898476377595216, + "grad_norm": 7.183436393737793, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8587945699691772, + "num_tokens": 407619837.0, + "step": 10679 + }, + { + "epoch": 1.358605775346648, + "ewc_loss": 0.0621112585067749, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002890813339035958, + "grad_norm": 7.162212371826172, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8599390983581543, + "num_tokens": 407656939.0, + "step": 10680 + }, + { + "epoch": 1.3587329856252386, + "ewc_loss": 0.062143921852111816, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028940796619281173, + "grad_norm": 7.123738765716553, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8532116413116455, + "num_tokens": 407702756.0, + "step": 10681 + }, + { + "epoch": 1.3588601959038291, + "ewc_loss": 0.06224297732114792, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029039851506240666, + "grad_norm": 7.1468963623046875, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8714994788169861, + "num_tokens": 407736142.0, + "step": 10682 + }, + { + "epoch": 1.3589874061824196, + "ewc_loss": 0.06220989674329758, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002900677209254354, + "grad_norm": 7.195460796356201, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.859015703201294, + "num_tokens": 407773184.0, + "step": 10683 + }, + { + "epoch": 1.3591146164610102, + "ewc_loss": 0.06221190467476845, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029008780256845057, + "grad_norm": 7.175574779510498, + "learning_rate": 1e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8463500738143921, + "num_tokens": 407806543.0, + "step": 10684 + }, + { + "epoch": 1.3592418267396005, + "ewc_loss": 0.062242552638053894, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029039426590316, + "grad_norm": 7.214718818664551, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8609827160835266, + "num_tokens": 407841371.0, + "step": 10685 + }, + { + "epoch": 1.359369037018191, + "ewc_loss": 0.06208192557096481, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028878802550025284, + "grad_norm": 7.192338943481445, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.866237998008728, + "num_tokens": 407879862.0, + "step": 10686 + }, + { + "epoch": 1.3594962472967815, + "ewc_loss": 0.06217796355485916, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028974839369766414, + "grad_norm": 7.223125457763672, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8565084338188171, + "num_tokens": 407917947.0, + "step": 10687 + }, + { + "epoch": 1.359623457575372, + "ewc_loss": 0.062020935118198395, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028817812562920153, + "grad_norm": 7.12945556640625, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8689413070678711, + "num_tokens": 407953987.0, + "step": 10688 + }, + { + "epoch": 1.3597506678539626, + "ewc_loss": 0.06222524121403694, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002902211563196033, + "grad_norm": 7.227327823638916, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.846213161945343, + "num_tokens": 407993166.0, + "step": 10689 + }, + { + "epoch": 1.3598778781325531, + "ewc_loss": 0.061990074813365936, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028786950861103833, + "grad_norm": 7.096741199493408, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8716692924499512, + "num_tokens": 408037063.0, + "step": 10690 + }, + { + "epoch": 1.3600050884111436, + "ewc_loss": 0.06227089464664459, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002906777081079781, + "grad_norm": 7.226355075836182, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8648731708526611, + "num_tokens": 408078526.0, + "step": 10691 + }, + { + "epoch": 1.3601322986897342, + "ewc_loss": 0.062294039875268936, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002884677378460765, + "grad_norm": 7.19743537902832, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8704883456230164, + "num_tokens": 408116002.0, + "step": 10692 + }, + { + "epoch": 1.3602595089683247, + "ewc_loss": 0.06188208609819412, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002892310149036348, + "grad_norm": 7.172032356262207, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8582606315612793, + "num_tokens": 408155885.0, + "step": 10693 + }, + { + "epoch": 1.3603867192469152, + "ewc_loss": 0.06185770779848099, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002889872412197292, + "grad_norm": 7.204903602600098, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8683804273605347, + "num_tokens": 408190085.0, + "step": 10694 + }, + { + "epoch": 1.3605139295255055, + "ewc_loss": 0.06185302883386612, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028894044226035476, + "grad_norm": 7.169638633728027, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8641542196273804, + "num_tokens": 408227289.0, + "step": 10695 + }, + { + "epoch": 1.360641139804096, + "ewc_loss": 0.06192934513092041, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002897036320064217, + "grad_norm": 7.141531944274902, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8707085847854614, + "num_tokens": 408271853.0, + "step": 10696 + }, + { + "epoch": 1.3607683500826866, + "ewc_loss": 0.06191398203372955, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028954996378161013, + "grad_norm": 7.130609512329102, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8762227296829224, + "num_tokens": 408311133.0, + "step": 10697 + }, + { + "epoch": 1.3608955603612771, + "ewc_loss": 0.061820968985557556, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028861986356787384, + "grad_norm": 7.189966201782227, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8643099665641785, + "num_tokens": 408342775.0, + "step": 10698 + }, + { + "epoch": 1.3610227706398677, + "ewc_loss": 0.061900198459625244, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028941212804056704, + "grad_norm": 7.14716100692749, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8762345910072327, + "num_tokens": 408379063.0, + "step": 10699 + }, + { + "epoch": 1.3611499809184582, + "ewc_loss": 0.06185067445039749, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002889169263653457, + "grad_norm": 7.1575117111206055, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8486739993095398, + "num_tokens": 408415856.0, + "step": 10700 + }, + { + "epoch": 1.3612771911970487, + "ewc_loss": 0.061959315091371536, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029000331414863467, + "grad_norm": 7.154510498046875, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8677610754966736, + "num_tokens": 408451522.0, + "step": 10701 + }, + { + "epoch": 1.3614044014756392, + "ewc_loss": 0.061933740973472595, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028974757879041135, + "grad_norm": 7.133272171020508, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8682075142860413, + "num_tokens": 408494605.0, + "step": 10702 + }, + { + "epoch": 1.3615316117542298, + "ewc_loss": 0.06195954233407974, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002900055842474103, + "grad_norm": 7.201955795288086, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8622289896011353, + "num_tokens": 408536373.0, + "step": 10703 + }, + { + "epoch": 1.3616588220328203, + "ewc_loss": 0.061871401965618134, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002891241747420281, + "grad_norm": 7.154517650604248, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8507533073425293, + "num_tokens": 408574724.0, + "step": 10704 + }, + { + "epoch": 1.3617860323114108, + "ewc_loss": 0.062235910445451736, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002903278509620577, + "grad_norm": 7.215518951416016, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8552595376968384, + "num_tokens": 408615057.0, + "step": 10705 + }, + { + "epoch": 1.3619132425900013, + "ewc_loss": 0.0618755966424942, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028916611336171627, + "grad_norm": 7.187350273132324, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8502222299575806, + "num_tokens": 408651324.0, + "step": 10706 + }, + { + "epoch": 1.3620404528685919, + "ewc_loss": 0.061984188854694366, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002902520354837179, + "grad_norm": 7.182425022125244, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8682694435119629, + "num_tokens": 408690533.0, + "step": 10707 + }, + { + "epoch": 1.3621676631471824, + "ewc_loss": 0.06213216483592987, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002892903867177665, + "grad_norm": 7.228719711303711, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8559028506278992, + "num_tokens": 408725970.0, + "step": 10708 + }, + { + "epoch": 1.362294873425773, + "ewc_loss": 0.061826132237911224, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002886714646592736, + "grad_norm": 7.153364658355713, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8710212707519531, + "num_tokens": 408766550.0, + "step": 10709 + }, + { + "epoch": 1.3624220837043632, + "ewc_loss": 0.062083788216114044, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002912480558734387, + "grad_norm": 7.218383312225342, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8658941388130188, + "num_tokens": 408804320.0, + "step": 10710 + }, + { + "epoch": 1.3625492939829538, + "ewc_loss": 0.062116123735904694, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000289129966404289, + "grad_norm": 7.184051513671875, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8658863306045532, + "num_tokens": 408840902.0, + "step": 10711 + }, + { + "epoch": 1.3626765042615443, + "ewc_loss": 0.062172722071409225, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028969597769901156, + "grad_norm": 7.18581485748291, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8621245622634888, + "num_tokens": 408879346.0, + "step": 10712 + }, + { + "epoch": 1.3628037145401348, + "ewc_loss": 0.06215561181306839, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002895248762797564, + "grad_norm": 7.162722587585449, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8753622174263, + "num_tokens": 408919336.0, + "step": 10713 + }, + { + "epoch": 1.3629309248187254, + "ewc_loss": 0.06226357817649841, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002906045119743794, + "grad_norm": 7.2255353927612305, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8580371141433716, + "num_tokens": 408961547.0, + "step": 10714 + }, + { + "epoch": 1.3630581350973159, + "ewc_loss": 0.06212823837995529, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028925115475431085, + "grad_norm": 7.172701358795166, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8763819336891174, + "num_tokens": 409003573.0, + "step": 10715 + }, + { + "epoch": 1.3631853453759064, + "ewc_loss": 0.06220077723264694, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000289976509520784, + "grad_norm": 7.218477249145508, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8554811477661133, + "num_tokens": 409045560.0, + "step": 10716 + }, + { + "epoch": 1.363312555654497, + "ewc_loss": 0.06212276965379715, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002891964395530522, + "grad_norm": 7.2158379554748535, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8660842180252075, + "num_tokens": 409087245.0, + "step": 10717 + }, + { + "epoch": 1.3634397659330875, + "ewc_loss": 0.062113694846630096, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002891056938096881, + "grad_norm": 7.178321361541748, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8728724122047424, + "num_tokens": 409134177.0, + "step": 10718 + }, + { + "epoch": 1.3635669762116778, + "ewc_loss": 0.06215286999940872, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002894974604714662, + "grad_norm": 7.2365946769714355, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8488472700119019, + "num_tokens": 409169495.0, + "step": 10719 + }, + { + "epoch": 1.3636941864902683, + "ewc_loss": 0.061751656234264374, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002879266976378858, + "grad_norm": 7.239617824554443, + "learning_rate": 1e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8390547633171082, + "num_tokens": 409207781.0, + "step": 10720 + }, + { + "epoch": 1.3638213967688588, + "ewc_loss": 0.0616462379693985, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002868725568987429, + "grad_norm": 7.185459613800049, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8625482320785522, + "num_tokens": 409245411.0, + "step": 10721 + }, + { + "epoch": 1.3639486070474494, + "ewc_loss": 0.061879415065050125, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002892042975872755, + "grad_norm": 7.174260139465332, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8721030354499817, + "num_tokens": 409281757.0, + "step": 10722 + }, + { + "epoch": 1.3640758173260399, + "ewc_loss": 0.06176082417368889, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.000288018403807655, + "grad_norm": 7.1787309646606445, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8506509065628052, + "num_tokens": 409321317.0, + "step": 10723 + }, + { + "epoch": 1.3642030276046304, + "ewc_loss": 0.06173370033502579, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002877471852116287, + "grad_norm": 7.169800281524658, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8694692254066467, + "num_tokens": 409358919.0, + "step": 10724 + }, + { + "epoch": 1.364330237883221, + "ewc_loss": 0.0619516596198082, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002874853671528399, + "grad_norm": 7.190805912017822, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.854356586933136, + "num_tokens": 409394355.0, + "step": 10725 + }, + { + "epoch": 1.3644574481618115, + "ewc_loss": 0.06205064803361893, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002884752466343343, + "grad_norm": 7.214961528778076, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8731325268745422, + "num_tokens": 409433781.0, + "step": 10726 + }, + { + "epoch": 1.364584658440402, + "ewc_loss": 0.06201092153787613, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002880779793485999, + "grad_norm": 7.149628639221191, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8537722826004028, + "num_tokens": 409471728.0, + "step": 10727 + }, + { + "epoch": 1.3647118687189925, + "ewc_loss": 0.062053024768829346, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000288498995359987, + "grad_norm": 7.217113971710205, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8813791275024414, + "num_tokens": 409502294.0, + "step": 10728 + }, + { + "epoch": 1.364839078997583, + "ewc_loss": 0.061966672539711, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002876354556065053, + "grad_norm": 7.1823530197143555, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8636756539344788, + "num_tokens": 409545878.0, + "step": 10729 + }, + { + "epoch": 1.3649662892761736, + "ewc_loss": 0.06204579025506973, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028842667234130204, + "grad_norm": 7.142208576202393, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8654488325119019, + "num_tokens": 409584753.0, + "step": 10730 + }, + { + "epoch": 1.3650934995547641, + "ewc_loss": 0.062029846012592316, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002882672124542296, + "grad_norm": 7.254231929779053, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8678674697875977, + "num_tokens": 409613725.0, + "step": 10731 + }, + { + "epoch": 1.3652207098333546, + "ewc_loss": 0.0619126558303833, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002870952885132283, + "grad_norm": 7.161040782928467, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8555271625518799, + "num_tokens": 409652443.0, + "step": 10732 + }, + { + "epoch": 1.3653479201119452, + "ewc_loss": 0.06220318377017975, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029000057838857174, + "grad_norm": 7.241069316864014, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8690198063850403, + "num_tokens": 409690683.0, + "step": 10733 + }, + { + "epoch": 1.3654751303905355, + "ewc_loss": 0.06196267157793045, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002875954669434577, + "grad_norm": 7.188241481781006, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8633615970611572, + "num_tokens": 409727502.0, + "step": 10734 + }, + { + "epoch": 1.365602340669126, + "ewc_loss": 0.062117837369441986, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000289147108560428, + "grad_norm": 7.218135356903076, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.859228253364563, + "num_tokens": 409764592.0, + "step": 10735 + }, + { + "epoch": 1.3657295509477165, + "ewc_loss": 0.06204593926668167, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002884281275328249, + "grad_norm": 7.19373893737793, + "learning_rate": 1e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8410078287124634, + "num_tokens": 409809645.0, + "step": 10736 + }, + { + "epoch": 1.365856761226307, + "ewc_loss": 0.062031276524066925, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002882815315388143, + "grad_norm": 7.220373630523682, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8613294959068298, + "num_tokens": 409848048.0, + "step": 10737 + }, + { + "epoch": 1.3659839715048976, + "ewc_loss": 0.06206004321575165, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028856919379904866, + "grad_norm": 7.247121810913086, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8585206866264343, + "num_tokens": 409884176.0, + "step": 10738 + }, + { + "epoch": 1.3661111817834881, + "ewc_loss": 0.06192246079444885, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028719333931803703, + "grad_norm": 7.1836771965026855, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8653655052185059, + "num_tokens": 409922630.0, + "step": 10739 + }, + { + "epoch": 1.3662383920620786, + "ewc_loss": 0.06207677349448204, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028873648261651397, + "grad_norm": 7.172587871551514, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.869093656539917, + "num_tokens": 409961876.0, + "step": 10740 + }, + { + "epoch": 1.3663656023406692, + "ewc_loss": 0.062060702592134476, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002885757712647319, + "grad_norm": 7.237408638000488, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8706871271133423, + "num_tokens": 409995432.0, + "step": 10741 + }, + { + "epoch": 1.3664928126192597, + "ewc_loss": 0.06220851093530655, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00028761246358044446, + "grad_norm": 7.2047600746154785, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8615943193435669, + "num_tokens": 410034876.0, + "step": 10742 + }, + { + "epoch": 1.3666200228978502, + "ewc_loss": 0.0621461346745491, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002894301142077893, + "grad_norm": 7.168423175811768, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8663133382797241, + "num_tokens": 410079190.0, + "step": 10743 + }, + { + "epoch": 1.3667472331764405, + "ewc_loss": 0.06202644109725952, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002882331609725952, + "grad_norm": 7.2542595863342285, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8670194149017334, + "num_tokens": 410112475.0, + "step": 10744 + }, + { + "epoch": 1.366874443455031, + "ewc_loss": 0.06201452016830444, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028811395168304443, + "grad_norm": 7.163582801818848, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8786125779151917, + "num_tokens": 410154583.0, + "step": 10745 + }, + { + "epoch": 1.3670016537336216, + "ewc_loss": 0.06219043582677841, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028987310361117125, + "grad_norm": 7.24237060546875, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8629807233810425, + "num_tokens": 410191631.0, + "step": 10746 + }, + { + "epoch": 1.3671288640122121, + "ewc_loss": 0.06199309974908829, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002878997474908829, + "grad_norm": 7.219486236572266, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8405213952064514, + "num_tokens": 410232435.0, + "step": 10747 + }, + { + "epoch": 1.3672560742908026, + "ewc_loss": 0.06211331486701965, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028910188120789826, + "grad_norm": 7.2765069007873535, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8675708770751953, + "num_tokens": 410272940.0, + "step": 10748 + }, + { + "epoch": 1.3673832845693932, + "ewc_loss": 0.06195022165775299, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002874709607567638, + "grad_norm": 7.126922130584717, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8587205410003662, + "num_tokens": 410312807.0, + "step": 10749 + }, + { + "epoch": 1.3675104948479837, + "ewc_loss": 0.06221453845500946, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029011411243118346, + "grad_norm": 7.2559380531311035, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8683494329452515, + "num_tokens": 410352433.0, + "step": 10750 + }, + { + "epoch": 1.3676377051265742, + "ewc_loss": 0.06200997531414032, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028806852060370147, + "grad_norm": 7.212899684906006, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8627382516860962, + "num_tokens": 410394843.0, + "step": 10751 + }, + { + "epoch": 1.3677649154051648, + "ewc_loss": 0.062384773045778275, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002893750788643956, + "grad_norm": 7.163851261138916, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8652905225753784, + "num_tokens": 410439353.0, + "step": 10752 + }, + { + "epoch": 1.3678921256837553, + "ewc_loss": 0.06190628185868263, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028947298415005207, + "grad_norm": 7.254018306732178, + "learning_rate": 1e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8414926528930664, + "num_tokens": 410485789.0, + "step": 10753 + }, + { + "epoch": 1.3680193359623458, + "ewc_loss": 0.06177351623773575, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028814529650844634, + "grad_norm": 7.220912456512451, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8468436002731323, + "num_tokens": 410523385.0, + "step": 10754 + }, + { + "epoch": 1.3681465462409363, + "ewc_loss": 0.061939969658851624, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002898098318837583, + "grad_norm": 7.246792316436768, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.857387363910675, + "num_tokens": 410553340.0, + "step": 10755 + }, + { + "epoch": 1.3682737565195269, + "ewc_loss": 0.06179935857653618, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028840373852290213, + "grad_norm": 7.20245885848999, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8720115423202515, + "num_tokens": 410590041.0, + "step": 10756 + }, + { + "epoch": 1.3684009667981174, + "ewc_loss": 0.061845824122428894, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028886841027997434, + "grad_norm": 7.1650872230529785, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8639200925827026, + "num_tokens": 410627703.0, + "step": 10757 + }, + { + "epoch": 1.368528177076708, + "ewc_loss": 0.06184552609920502, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00028886544168926775, + "grad_norm": 7.185129165649414, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8553467988967896, + "num_tokens": 410662555.0, + "step": 10758 + }, + { + "epoch": 1.3686553873552982, + "ewc_loss": 0.061909712851047516, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002895072684623301, + "grad_norm": 7.198994159698486, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8456158638000488, + "num_tokens": 410706026.0, + "step": 10759 + }, + { + "epoch": 1.3687825976338888, + "ewc_loss": 0.06187792867422104, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002891894255299121, + "grad_norm": 7.167046546936035, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8594898581504822, + "num_tokens": 410743275.0, + "step": 10760 + }, + { + "epoch": 1.3689098079124793, + "ewc_loss": 0.06197816878557205, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002901918487623334, + "grad_norm": 7.181905746459961, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8721907734870911, + "num_tokens": 410781220.0, + "step": 10761 + }, + { + "epoch": 1.3690370181910698, + "ewc_loss": 0.062280330806970596, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029077206272631884, + "grad_norm": 7.229511737823486, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8697962760925293, + "num_tokens": 410820014.0, + "step": 10762 + }, + { + "epoch": 1.3691642284696603, + "ewc_loss": 0.062182847410440445, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028979722992517054, + "grad_norm": 7.158145427703857, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8574218153953552, + "num_tokens": 410858721.0, + "step": 10763 + }, + { + "epoch": 1.3692914387482509, + "ewc_loss": 0.06236433610320091, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000291612115688622, + "grad_norm": 7.169089317321777, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8686915040016174, + "num_tokens": 410898527.0, + "step": 10764 + }, + { + "epoch": 1.3694186490268414, + "ewc_loss": 0.06228160113096237, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029078478110022843, + "grad_norm": 7.2344160079956055, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.863094687461853, + "num_tokens": 410938426.0, + "step": 10765 + }, + { + "epoch": 1.369545859305432, + "ewc_loss": 0.06218836456537247, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002898524107877165, + "grad_norm": 7.2224273681640625, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8679366111755371, + "num_tokens": 410969940.0, + "step": 10766 + }, + { + "epoch": 1.3696730695840225, + "ewc_loss": 0.06229438632726669, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002909126051235944, + "grad_norm": 7.1998515129089355, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8465763330459595, + "num_tokens": 411008859.0, + "step": 10767 + }, + { + "epoch": 1.3698002798626128, + "ewc_loss": 0.06225467100739479, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002905154542531818, + "grad_norm": 7.229304790496826, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.877117395401001, + "num_tokens": 411043516.0, + "step": 10768 + }, + { + "epoch": 1.3699274901412033, + "ewc_loss": 0.0622018426656723, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028998719062656164, + "grad_norm": 7.194299697875977, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8553342223167419, + "num_tokens": 411079092.0, + "step": 10769 + }, + { + "epoch": 1.3700547004197938, + "ewc_loss": 0.06224651634693146, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029043390532024205, + "grad_norm": 7.166728973388672, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8660010099411011, + "num_tokens": 411117360.0, + "step": 10770 + }, + { + "epoch": 1.3701819106983844, + "ewc_loss": 0.062296196818351746, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002909307077061385, + "grad_norm": 7.228057861328125, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8529486656188965, + "num_tokens": 411153615.0, + "step": 10771 + }, + { + "epoch": 1.3703091209769749, + "ewc_loss": 0.062222350388765335, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002901922562159598, + "grad_norm": 7.149041652679443, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8647735118865967, + "num_tokens": 411193571.0, + "step": 10772 + }, + { + "epoch": 1.3704363312555654, + "ewc_loss": 0.0623372346162796, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029134107171557844, + "grad_norm": 7.237757682800293, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8430442810058594, + "num_tokens": 411230427.0, + "step": 10773 + }, + { + "epoch": 1.370563541534156, + "ewc_loss": 0.06216425448656082, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028961128555238247, + "grad_norm": 7.178157806396484, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8611087799072266, + "num_tokens": 411271251.0, + "step": 10774 + }, + { + "epoch": 1.3706907518127465, + "ewc_loss": 0.06230095401406288, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029097829246893525, + "grad_norm": 7.270218849182129, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8646281957626343, + "num_tokens": 411304906.0, + "step": 10775 + }, + { + "epoch": 1.370817962091337, + "ewc_loss": 0.06202535331249237, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028822230524383485, + "grad_norm": 7.116275310516357, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8791642189025879, + "num_tokens": 411341417.0, + "step": 10776 + }, + { + "epoch": 1.3709451723699275, + "ewc_loss": 0.0624205507338047, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002921742561738938, + "grad_norm": 7.271862983703613, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8646992444992065, + "num_tokens": 411382118.0, + "step": 10777 + }, + { + "epoch": 1.371072382648518, + "ewc_loss": 0.06194877624511719, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002874565252568573, + "grad_norm": 7.099515914916992, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8521751165390015, + "num_tokens": 411426702.0, + "step": 10778 + }, + { + "epoch": 1.3711995929271086, + "ewc_loss": 0.062287718057632446, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002908459573518485, + "grad_norm": 7.222081661224365, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8616266250610352, + "num_tokens": 411461689.0, + "step": 10779 + }, + { + "epoch": 1.371326803205699, + "ewc_loss": 0.062154531478881836, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000289514078758657, + "grad_norm": 7.191579341888428, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.864534854888916, + "num_tokens": 411499613.0, + "step": 10780 + }, + { + "epoch": 1.3714540134842896, + "ewc_loss": 0.062228284776210785, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029025159892626107, + "grad_norm": 7.221375465393066, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8612918853759766, + "num_tokens": 411538422.0, + "step": 10781 + }, + { + "epoch": 1.3715812237628802, + "ewc_loss": 0.06217288225889206, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028969754930585623, + "grad_norm": 7.14308500289917, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8764723539352417, + "num_tokens": 411572401.0, + "step": 10782 + }, + { + "epoch": 1.3717084340414705, + "ewc_loss": 0.06200461834669113, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002904563443735242, + "grad_norm": 7.197732925415039, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8691874742507935, + "num_tokens": 411611268.0, + "step": 10783 + }, + { + "epoch": 1.371835644320061, + "ewc_loss": 0.062032196670770645, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002907321322709322, + "grad_norm": 7.209118843078613, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8613303899765015, + "num_tokens": 411648837.0, + "step": 10784 + }, + { + "epoch": 1.3719628545986515, + "ewc_loss": 0.06216961890459061, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002896649530157447, + "grad_norm": 7.162657737731934, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.866584837436676, + "num_tokens": 411690327.0, + "step": 10785 + }, + { + "epoch": 1.372090064877242, + "ewc_loss": 0.06204730272293091, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029088318115100265, + "grad_norm": 7.2328104972839355, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.862328052520752, + "num_tokens": 411729665.0, + "step": 10786 + }, + { + "epoch": 1.3722172751558326, + "ewc_loss": 0.061950452625751495, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002899146929848939, + "grad_norm": 7.125698566436768, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8752196431159973, + "num_tokens": 411764716.0, + "step": 10787 + }, + { + "epoch": 1.3723444854344231, + "ewc_loss": 0.06210935115814209, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029150364571250975, + "grad_norm": 7.241400718688965, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8642323017120361, + "num_tokens": 411805560.0, + "step": 10788 + }, + { + "epoch": 1.3724716957130136, + "ewc_loss": 0.06191793829202652, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002895895449910313, + "grad_norm": 7.1868696212768555, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8672540783882141, + "num_tokens": 411841151.0, + "step": 10789 + }, + { + "epoch": 1.3725989059916042, + "ewc_loss": 0.06205592304468155, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002909693866968155, + "grad_norm": 7.22173547744751, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8688936233520508, + "num_tokens": 411875088.0, + "step": 10790 + }, + { + "epoch": 1.3727261162701947, + "ewc_loss": 0.062153011560440063, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028949888655915856, + "grad_norm": 7.172523498535156, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8623934984207153, + "num_tokens": 411914554.0, + "step": 10791 + }, + { + "epoch": 1.3728533265487852, + "ewc_loss": 0.06233018636703491, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029127064044587314, + "grad_norm": 7.2352399826049805, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8641171455383301, + "num_tokens": 411952237.0, + "step": 10792 + }, + { + "epoch": 1.3729805368273755, + "ewc_loss": 0.06202545017004013, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002906646695919335, + "grad_norm": 7.219850540161133, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8568977117538452, + "num_tokens": 411989676.0, + "step": 10793 + }, + { + "epoch": 1.373107747105966, + "ewc_loss": 0.061976492404937744, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002901750849559903, + "grad_norm": 7.187625885009766, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8756992220878601, + "num_tokens": 412025804.0, + "step": 10794 + }, + { + "epoch": 1.3732349573845566, + "ewc_loss": 0.06204674392938614, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002908776223193854, + "grad_norm": 7.228593826293945, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8779942393302917, + "num_tokens": 412060085.0, + "step": 10795 + }, + { + "epoch": 1.3733621676631471, + "ewc_loss": 0.06219828501343727, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028995159664191306, + "grad_norm": 7.155254364013672, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8691097497940063, + "num_tokens": 412100750.0, + "step": 10796 + }, + { + "epoch": 1.3734893779417376, + "ewc_loss": 0.062101542949676514, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002914255892392248, + "grad_norm": 7.187860488891602, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8653632402420044, + "num_tokens": 412145829.0, + "step": 10797 + }, + { + "epoch": 1.3736165882203282, + "ewc_loss": 0.06206053867936134, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029101554537191987, + "grad_norm": 7.207831859588623, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8687269687652588, + "num_tokens": 412187607.0, + "step": 10798 + }, + { + "epoch": 1.3737437984989187, + "ewc_loss": 0.0623023696243763, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002909924369305372, + "grad_norm": 7.188061237335205, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8494340777397156, + "num_tokens": 412227214.0, + "step": 10799 + }, + { + "epoch": 1.3738710087775092, + "ewc_loss": 0.06239485740661621, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002919172984547913, + "grad_norm": 7.240416526794434, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.865591824054718, + "num_tokens": 412261929.0, + "step": 10800 + }, + { + "epoch": 1.3739982190560998, + "ewc_loss": 0.06205551326274872, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002909652830567211, + "grad_norm": 7.243626117706299, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8582717180252075, + "num_tokens": 412299441.0, + "step": 10801 + }, + { + "epoch": 1.3741254293346903, + "ewc_loss": 0.06201295927166939, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002905397559516132, + "grad_norm": 7.177785873413086, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8615564107894897, + "num_tokens": 412339248.0, + "step": 10802 + }, + { + "epoch": 1.3742526396132808, + "ewc_loss": 0.062252290546894073, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029293305124156177, + "grad_norm": 7.251270294189453, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8616476058959961, + "num_tokens": 412377224.0, + "step": 10803 + }, + { + "epoch": 1.3743798498918713, + "ewc_loss": 0.062230877578258514, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029027750133536756, + "grad_norm": 7.273673057556152, + "learning_rate": 1e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.841153085231781, + "num_tokens": 412415331.0, + "step": 10804 + }, + { + "epoch": 1.3745070601704619, + "ewc_loss": 0.062353603541851044, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029150478076189756, + "grad_norm": 7.2029876708984375, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8621021509170532, + "num_tokens": 412455604.0, + "step": 10805 + }, + { + "epoch": 1.3746342704490524, + "ewc_loss": 0.06237303838133812, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029169913614168763, + "grad_norm": 7.247125148773193, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8698517680168152, + "num_tokens": 412489295.0, + "step": 10806 + }, + { + "epoch": 1.374761480727643, + "ewc_loss": 0.062186937779188156, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028983812080696225, + "grad_norm": 7.192426681518555, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8591575026512146, + "num_tokens": 412525750.0, + "step": 10807 + }, + { + "epoch": 1.3748886910062332, + "ewc_loss": 0.06242497265338898, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029221849399618804, + "grad_norm": 7.192904472351074, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8726094365119934, + "num_tokens": 412567163.0, + "step": 10808 + }, + { + "epoch": 1.3750159012848238, + "ewc_loss": 0.06238603591918945, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002918291138485074, + "grad_norm": 7.21168327331543, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.870008111000061, + "num_tokens": 412598789.0, + "step": 10809 + }, + { + "epoch": 1.3751431115634143, + "ewc_loss": 0.06238983944058418, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029186715255491436, + "grad_norm": 7.239852428436279, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8657559156417847, + "num_tokens": 412632588.0, + "step": 10810 + }, + { + "epoch": 1.3752703218420048, + "ewc_loss": 0.06241687387228012, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029213749803602695, + "grad_norm": 7.192574977874756, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8620432615280151, + "num_tokens": 412676369.0, + "step": 10811 + }, + { + "epoch": 1.3753975321205953, + "ewc_loss": 0.06241889297962189, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029215766699053347, + "grad_norm": 7.209481239318848, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8775252103805542, + "num_tokens": 412712674.0, + "step": 10812 + }, + { + "epoch": 1.3755247423991859, + "ewc_loss": 0.062372878193855286, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029169750632718205, + "grad_norm": 7.225597381591797, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8578633069992065, + "num_tokens": 412753531.0, + "step": 10813 + }, + { + "epoch": 1.3756519526777764, + "ewc_loss": 0.062342170625925064, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002913904609158635, + "grad_norm": 7.2925333976745605, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8605579137802124, + "num_tokens": 412799450.0, + "step": 10814 + }, + { + "epoch": 1.375779162956367, + "ewc_loss": 0.06224922835826874, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002904610591940582, + "grad_norm": 7.154571533203125, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8861263394355774, + "num_tokens": 412836858.0, + "step": 10815 + }, + { + "epoch": 1.3759063732349575, + "ewc_loss": 0.062448933720588684, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002924580767285079, + "grad_norm": 7.247781276702881, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8667749166488647, + "num_tokens": 412871169.0, + "step": 10816 + }, + { + "epoch": 1.3760335835135478, + "ewc_loss": 0.062041737139225006, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029082753462716937, + "grad_norm": 7.199093341827393, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8688557147979736, + "num_tokens": 412909281.0, + "step": 10817 + }, + { + "epoch": 1.3761607937921383, + "ewc_loss": 0.06238821893930435, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029185094172134995, + "grad_norm": 7.216206073760986, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8629229068756104, + "num_tokens": 412950126.0, + "step": 10818 + }, + { + "epoch": 1.3762880040707288, + "ewc_loss": 0.062131959944963455, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002917297533713281, + "grad_norm": 7.234517574310303, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8509222269058228, + "num_tokens": 412988644.0, + "step": 10819 + }, + { + "epoch": 1.3764152143493193, + "ewc_loss": 0.06231974437832832, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002911661867983639, + "grad_norm": 7.255340099334717, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.848759114742279, + "num_tokens": 413028403.0, + "step": 10820 + }, + { + "epoch": 1.3765424246279099, + "ewc_loss": 0.06218084692955017, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029221863951534033, + "grad_norm": 7.201875686645508, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8538944721221924, + "num_tokens": 413067429.0, + "step": 10821 + }, + { + "epoch": 1.3766696349065004, + "ewc_loss": 0.06241534650325775, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002921221894212067, + "grad_norm": 7.301294803619385, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8587585687637329, + "num_tokens": 413101630.0, + "step": 10822 + }, + { + "epoch": 1.376796845185091, + "ewc_loss": 0.0622534304857254, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029050305602140725, + "grad_norm": 7.197906017303467, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8562610149383545, + "num_tokens": 413147757.0, + "step": 10823 + }, + { + "epoch": 1.3769240554636815, + "ewc_loss": 0.062457069754600525, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029253942193463445, + "grad_norm": 7.2358927726745605, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8655909299850464, + "num_tokens": 413186803.0, + "step": 10824 + }, + { + "epoch": 1.377051265742272, + "ewc_loss": 0.062127698212862015, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029168714536353946, + "grad_norm": 7.212152481079102, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8574584722518921, + "num_tokens": 413227201.0, + "step": 10825 + }, + { + "epoch": 1.3771784760208625, + "ewc_loss": 0.06235439330339432, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029151266789995134, + "grad_norm": 7.2374138832092285, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8618581295013428, + "num_tokens": 413270594.0, + "step": 10826 + }, + { + "epoch": 1.377305686299453, + "ewc_loss": 0.06205970048904419, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002910071343649179, + "grad_norm": 7.23328971862793, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8446375131607056, + "num_tokens": 413307345.0, + "step": 10827 + }, + { + "epoch": 1.3774328965780436, + "ewc_loss": 0.062033168971538544, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.00029074185295030475, + "grad_norm": 7.238834381103516, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8691789507865906, + "num_tokens": 413344310.0, + "step": 10828 + }, + { + "epoch": 1.377560106856634, + "ewc_loss": 0.06226801127195358, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000290648837108165, + "grad_norm": 7.1786956787109375, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8804473280906677, + "num_tokens": 413382047.0, + "step": 10829 + }, + { + "epoch": 1.3776873171352246, + "ewc_loss": 0.06241265684366226, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029209532658569515, + "grad_norm": 7.251036643981934, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8604568243026733, + "num_tokens": 413424658.0, + "step": 10830 + }, + { + "epoch": 1.3778145274138152, + "ewc_loss": 0.062213510274887085, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029010383877903223, + "grad_norm": 7.339630603790283, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8715888261795044, + "num_tokens": 413458386.0, + "step": 10831 + }, + { + "epoch": 1.3779417376924055, + "ewc_loss": 0.06228801980614662, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002908489550463855, + "grad_norm": 7.295273303985596, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8499482870101929, + "num_tokens": 413492530.0, + "step": 10832 + }, + { + "epoch": 1.378068947970996, + "ewc_loss": 0.062220849096775055, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029017726774327457, + "grad_norm": 7.182001113891602, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.874406099319458, + "num_tokens": 413528894.0, + "step": 10833 + }, + { + "epoch": 1.3781961582495865, + "ewc_loss": 0.06234996020793915, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029146834276616573, + "grad_norm": 7.332499027252197, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.859459638595581, + "num_tokens": 413564765.0, + "step": 10834 + }, + { + "epoch": 1.378323368528177, + "ewc_loss": 0.06218726560473442, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028984140953980386, + "grad_norm": 7.251831531524658, + "learning_rate": 1e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8371173143386841, + "num_tokens": 413598122.0, + "step": 10835 + }, + { + "epoch": 1.3784505788067676, + "ewc_loss": 0.06222598999738693, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029022866510786116, + "grad_norm": 7.214833736419678, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8465477228164673, + "num_tokens": 413634519.0, + "step": 10836 + }, + { + "epoch": 1.378577789085358, + "ewc_loss": 0.062324561178684235, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002912143536377698, + "grad_norm": 7.220706939697266, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8559929132461548, + "num_tokens": 413674021.0, + "step": 10837 + }, + { + "epoch": 1.3787049993639486, + "ewc_loss": 0.06227879226207733, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002907566668000072, + "grad_norm": 7.1624555587768555, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8690795302391052, + "num_tokens": 413713804.0, + "step": 10838 + }, + { + "epoch": 1.3788322096425392, + "ewc_loss": 0.062426455318927765, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002922333078458905, + "grad_norm": 7.21565055847168, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8698409795761108, + "num_tokens": 413755665.0, + "step": 10839 + }, + { + "epoch": 1.3789594199211297, + "ewc_loss": 0.062312714755535126, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029109587194398046, + "grad_norm": 7.2356462478637695, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8619838356971741, + "num_tokens": 413790872.0, + "step": 10840 + }, + { + "epoch": 1.3790866301997202, + "ewc_loss": 0.06237708777189255, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002917396486736834, + "grad_norm": 7.227449417114258, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8826876282691956, + "num_tokens": 413830798.0, + "step": 10841 + }, + { + "epoch": 1.3792138404783105, + "ewc_loss": 0.06238546222448349, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029182335129007697, + "grad_norm": 7.2864460945129395, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8679420948028564, + "num_tokens": 413865960.0, + "step": 10842 + }, + { + "epoch": 1.379341050756901, + "ewc_loss": 0.062288425862789154, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029085302958264947, + "grad_norm": 7.23344087600708, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8722759485244751, + "num_tokens": 413898222.0, + "step": 10843 + }, + { + "epoch": 1.3794682610354916, + "ewc_loss": 0.0622970275580883, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002909390314016491, + "grad_norm": 7.262075424194336, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8574129939079285, + "num_tokens": 413931839.0, + "step": 10844 + }, + { + "epoch": 1.3795954713140821, + "ewc_loss": 0.06235416233539581, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002915103978011757, + "grad_norm": 7.327392101287842, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8536847233772278, + "num_tokens": 413963815.0, + "step": 10845 + }, + { + "epoch": 1.3797226815926726, + "ewc_loss": 0.06219968944787979, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028996565379202366, + "grad_norm": 7.152244567871094, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8702961802482605, + "num_tokens": 414002476.0, + "step": 10846 + }, + { + "epoch": 1.3798498918712632, + "ewc_loss": 0.06257030367851257, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029367179377004504, + "grad_norm": 7.27821683883667, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.858468234539032, + "num_tokens": 414041121.0, + "step": 10847 + }, + { + "epoch": 1.3799771021498537, + "ewc_loss": 0.06223799288272858, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002903486602008343, + "grad_norm": 7.221360206604004, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8504610061645508, + "num_tokens": 414080447.0, + "step": 10848 + }, + { + "epoch": 1.3801043124284442, + "ewc_loss": 0.062447573989629745, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002924444852396846, + "grad_norm": 7.362157821655273, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8517561554908752, + "num_tokens": 414108925.0, + "step": 10849 + }, + { + "epoch": 1.3802315227070348, + "ewc_loss": 0.06221260875463486, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029009481659159064, + "grad_norm": 7.170665264129639, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8675562143325806, + "num_tokens": 414154296.0, + "step": 10850 + }, + { + "epoch": 1.3803587329856253, + "ewc_loss": 0.06238184869289398, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002917872625403106, + "grad_norm": 7.218504905700684, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8520709276199341, + "num_tokens": 414194612.0, + "step": 10851 + }, + { + "epoch": 1.3804859432642158, + "ewc_loss": 0.06225753575563431, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029054409242235124, + "grad_norm": 7.223273277282715, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8666812777519226, + "num_tokens": 414229939.0, + "step": 10852 + }, + { + "epoch": 1.3806131535428063, + "ewc_loss": 0.06238502264022827, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029181898571550846, + "grad_norm": 7.267220973968506, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8596846461296082, + "num_tokens": 414265193.0, + "step": 10853 + }, + { + "epoch": 1.3807403638213969, + "ewc_loss": 0.062250830233097076, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002904770371969789, + "grad_norm": 7.208230972290039, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8596142530441284, + "num_tokens": 414301471.0, + "step": 10854 + }, + { + "epoch": 1.3808675740999874, + "ewc_loss": 0.0623348243534565, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002913170028477907, + "grad_norm": 7.262608051300049, + "learning_rate": 1e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8346611261367798, + "num_tokens": 414338755.0, + "step": 10855 + }, + { + "epoch": 1.380994784378578, + "ewc_loss": 0.062232065945863724, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029028940480202436, + "grad_norm": 7.174347877502441, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.865355372428894, + "num_tokens": 414379996.0, + "step": 10856 + }, + { + "epoch": 1.3811219946571682, + "ewc_loss": 0.0623621828854084, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000291590578854084, + "grad_norm": 7.1950178146362305, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8650134801864624, + "num_tokens": 414421905.0, + "step": 10857 + }, + { + "epoch": 1.3812492049357588, + "ewc_loss": 0.06231993809342384, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002911681367550045, + "grad_norm": 7.152324199676514, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8677821159362793, + "num_tokens": 414465149.0, + "step": 10858 + }, + { + "epoch": 1.3813764152143493, + "ewc_loss": 0.06255073100328445, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002934760705102235, + "grad_norm": 7.236392021179199, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8715118765830994, + "num_tokens": 414504086.0, + "step": 10859 + }, + { + "epoch": 1.3815036254929398, + "ewc_loss": 0.06229332089424133, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029090195312164724, + "grad_norm": 7.189069747924805, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8476015329360962, + "num_tokens": 414545392.0, + "step": 10860 + }, + { + "epoch": 1.3816308357715303, + "ewc_loss": 0.06256261467933655, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002935948723461479, + "grad_norm": 7.250339508056641, + "learning_rate": 1e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8270251750946045, + "num_tokens": 414586445.0, + "step": 10861 + }, + { + "epoch": 1.3817580460501209, + "ewc_loss": 0.062368690967559814, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029165568412281573, + "grad_norm": 7.273584365844727, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8717654347419739, + "num_tokens": 414618467.0, + "step": 10862 + }, + { + "epoch": 1.3818852563287114, + "ewc_loss": 0.06245090067386627, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029247775091789663, + "grad_norm": 7.2495436668396, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8735170364379883, + "num_tokens": 414651253.0, + "step": 10863 + }, + { + "epoch": 1.382012466607302, + "ewc_loss": 0.06243409588932991, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029230970540083945, + "grad_norm": 7.262838840484619, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.862224280834198, + "num_tokens": 414692016.0, + "step": 10864 + }, + { + "epoch": 1.3821396768858925, + "ewc_loss": 0.06234817951917648, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002914505312219262, + "grad_norm": 7.210733890533447, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8683674335479736, + "num_tokens": 414731840.0, + "step": 10865 + }, + { + "epoch": 1.3822668871644828, + "ewc_loss": 0.062399301677942276, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002919617691077292, + "grad_norm": 7.2397308349609375, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8654186725616455, + "num_tokens": 414767110.0, + "step": 10866 + }, + { + "epoch": 1.3823940974430733, + "ewc_loss": 0.062275804579257965, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002907268062699586, + "grad_norm": 7.217564582824707, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8673338890075684, + "num_tokens": 414799340.0, + "step": 10867 + }, + { + "epoch": 1.3825213077216638, + "ewc_loss": 0.06236008182168007, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029156956588849425, + "grad_norm": 7.210822105407715, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8667194247245789, + "num_tokens": 414840915.0, + "step": 10868 + }, + { + "epoch": 1.3826485180002543, + "ewc_loss": 0.06224111095070839, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002903798595070839, + "grad_norm": 7.27138614654541, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8507044315338135, + "num_tokens": 414877298.0, + "step": 10869 + }, + { + "epoch": 1.3827757282788449, + "ewc_loss": 0.06217870116233826, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002897557569667697, + "grad_norm": 7.2028608322143555, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8694987297058105, + "num_tokens": 414916431.0, + "step": 10870 + }, + { + "epoch": 1.3829029385574354, + "ewc_loss": 0.062419455498456955, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002921633131336421, + "grad_norm": 7.237659454345703, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8607020378112793, + "num_tokens": 414958148.0, + "step": 10871 + }, + { + "epoch": 1.383030148836026, + "ewc_loss": 0.06232965737581253, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029126534354873, + "grad_norm": 7.1776838302612305, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8884154558181763, + "num_tokens": 414997008.0, + "step": 10872 + }, + { + "epoch": 1.3831573591146165, + "ewc_loss": 0.06241213530302048, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002920901170000434, + "grad_norm": 7.248310565948486, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.85074782371521, + "num_tokens": 415042893.0, + "step": 10873 + }, + { + "epoch": 1.383284569393207, + "ewc_loss": 0.06237213313579559, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002916900848504156, + "grad_norm": 7.2597222328186035, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8613205552101135, + "num_tokens": 415077476.0, + "step": 10874 + }, + { + "epoch": 1.3834117796717975, + "ewc_loss": 0.06234738975763321, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029144264408387244, + "grad_norm": 7.222768306732178, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8587926030158997, + "num_tokens": 415124329.0, + "step": 10875 + }, + { + "epoch": 1.383538989950388, + "ewc_loss": 0.062454063445329666, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002925093867816031, + "grad_norm": 7.270644664764404, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8752331137657166, + "num_tokens": 415164101.0, + "step": 10876 + }, + { + "epoch": 1.3836662002289786, + "ewc_loss": 0.06227609142661095, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002907296584453434, + "grad_norm": 7.194167137145996, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8642038702964783, + "num_tokens": 415203792.0, + "step": 10877 + }, + { + "epoch": 1.383793410507569, + "ewc_loss": 0.062437452375888824, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029234326211735606, + "grad_norm": 7.336808681488037, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8598588109016418, + "num_tokens": 415233754.0, + "step": 10878 + }, + { + "epoch": 1.3839206207861596, + "ewc_loss": 0.06225123628973961, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029048111173324287, + "grad_norm": 7.269172191619873, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8715015649795532, + "num_tokens": 415269567.0, + "step": 10879 + }, + { + "epoch": 1.3840478310647502, + "ewc_loss": 0.06240139901638031, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029198272386565804, + "grad_norm": 7.221941947937012, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8658247590065002, + "num_tokens": 415312660.0, + "step": 10880 + }, + { + "epoch": 1.3841750413433405, + "ewc_loss": 0.06225127354264259, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002904814900830388, + "grad_norm": 7.543399333953857, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8739667534828186, + "num_tokens": 415355377.0, + "step": 10881 + }, + { + "epoch": 1.384302251621931, + "ewc_loss": 0.0620153546333313, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028812227537855506, + "grad_norm": 7.107427597045898, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8747988939285278, + "num_tokens": 415393109.0, + "step": 10882 + }, + { + "epoch": 1.3844294619005215, + "ewc_loss": 0.06256123632192612, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002935811353381723, + "grad_norm": 7.263350486755371, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.875341534614563, + "num_tokens": 415428650.0, + "step": 10883 + }, + { + "epoch": 1.384556672179112, + "ewc_loss": 0.06208168715238571, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028878560988232493, + "grad_norm": 7.149118900299072, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8605244159698486, + "num_tokens": 415467166.0, + "step": 10884 + }, + { + "epoch": 1.3846838824577026, + "ewc_loss": 0.0625317394733429, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002932861098088324, + "grad_norm": 7.271888256072998, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8760720491409302, + "num_tokens": 415505633.0, + "step": 10885 + }, + { + "epoch": 1.384811092736293, + "ewc_loss": 0.06228327751159668, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029080151580274105, + "grad_norm": 7.189812183380127, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8628654479980469, + "num_tokens": 415546254.0, + "step": 10886 + }, + { + "epoch": 1.3849383030148836, + "ewc_loss": 0.06251884251832962, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002931571507360786, + "grad_norm": 7.248464107513428, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8620288968086243, + "num_tokens": 415584999.0, + "step": 10887 + }, + { + "epoch": 1.3850655132934742, + "ewc_loss": 0.06237667053937912, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029173545772209764, + "grad_norm": 7.209993839263916, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8648275136947632, + "num_tokens": 415624608.0, + "step": 10888 + }, + { + "epoch": 1.3851927235720647, + "ewc_loss": 0.06254711747169495, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029343992355279624, + "grad_norm": 7.330114841461182, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.849361777305603, + "num_tokens": 415663405.0, + "step": 10889 + }, + { + "epoch": 1.385319933850655, + "ewc_loss": 0.06233987957239151, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029136755620129406, + "grad_norm": 7.210504055023193, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8780825734138489, + "num_tokens": 415698934.0, + "step": 10890 + }, + { + "epoch": 1.3854471441292455, + "ewc_loss": 0.06263922899961472, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029436105978675187, + "grad_norm": 7.326961040496826, + "learning_rate": 1e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8347952365875244, + "num_tokens": 415736973.0, + "step": 10891 + }, + { + "epoch": 1.385574354407836, + "ewc_loss": 0.06231142580509186, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002910830080509186, + "grad_norm": 7.197255611419678, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.86649489402771, + "num_tokens": 415778364.0, + "step": 10892 + }, + { + "epoch": 1.3857015646864266, + "ewc_loss": 0.06263920664787292, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029436079785227776, + "grad_norm": 7.3008317947387695, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8793392181396484, + "num_tokens": 415817502.0, + "step": 10893 + }, + { + "epoch": 1.385828774965017, + "ewc_loss": 0.06230536848306656, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002910224429797381, + "grad_norm": 7.159958839416504, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.858119010925293, + "num_tokens": 415860364.0, + "step": 10894 + }, + { + "epoch": 1.3859559852436076, + "ewc_loss": 0.06271327286958694, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029510146123357117, + "grad_norm": 7.304943561553955, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8656308054924011, + "num_tokens": 415896816.0, + "step": 10895 + }, + { + "epoch": 1.3860831955221982, + "ewc_loss": 0.062360942363739014, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000291578151518479, + "grad_norm": 7.191882133483887, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8452110290527344, + "num_tokens": 415935173.0, + "step": 10896 + }, + { + "epoch": 1.3862104058007887, + "ewc_loss": 0.06275159120559692, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029548464226536453, + "grad_norm": 7.302480220794678, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8731798529624939, + "num_tokens": 415977043.0, + "step": 10897 + }, + { + "epoch": 1.3863376160793792, + "ewc_loss": 0.0625080093741417, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029304882627911866, + "grad_norm": 7.211052894592285, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8722275495529175, + "num_tokens": 416014376.0, + "step": 10898 + }, + { + "epoch": 1.3864648263579697, + "ewc_loss": 0.06268665939569473, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029483536491170526, + "grad_norm": 7.255956172943115, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8513122200965881, + "num_tokens": 416055648.0, + "step": 10899 + }, + { + "epoch": 1.3865920366365603, + "ewc_loss": 0.06220605596899986, + "ewc_loss_diag": 3.2901763916015625e-05, + "ewc_loss_parallel": 0.0002924707077909261, + "grad_norm": 7.291886806488037, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8552645444869995, + "num_tokens": 416087062.0, + "step": 10900 + }, + { + "epoch": 1.3867192469151508, + "ewc_loss": 0.06259177625179291, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002938865509349853, + "grad_norm": 7.254958152770996, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8703963756561279, + "num_tokens": 416122559.0, + "step": 10901 + }, + { + "epoch": 1.3868464571937413, + "ewc_loss": 0.06279981136322021, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029352548881433904, + "grad_norm": 7.343714237213135, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8558430075645447, + "num_tokens": 416155228.0, + "step": 10902 + }, + { + "epoch": 1.3869736674723319, + "ewc_loss": 0.06233380734920502, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029130681650713086, + "grad_norm": 7.296838760375977, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8496804237365723, + "num_tokens": 416191190.0, + "step": 10903 + }, + { + "epoch": 1.3871008777509224, + "ewc_loss": 0.06253427267074585, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029331151745282114, + "grad_norm": 7.2878546714782715, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8670616149902344, + "num_tokens": 416228009.0, + "step": 10904 + }, + { + "epoch": 1.387228088029513, + "ewc_loss": 0.06236816197633743, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029165035812184215, + "grad_norm": 7.252379417419434, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.859734058380127, + "num_tokens": 416263657.0, + "step": 10905 + }, + { + "epoch": 1.3873552983081032, + "ewc_loss": 0.062332604080438614, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029129479662515223, + "grad_norm": 7.265668869018555, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8568146824836731, + "num_tokens": 416305187.0, + "step": 10906 + }, + { + "epoch": 1.3874825085866938, + "ewc_loss": 0.062399208545684814, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002919608086813241, + "grad_norm": 7.263299942016602, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.860228955745697, + "num_tokens": 416342578.0, + "step": 10907 + }, + { + "epoch": 1.3876097188652843, + "ewc_loss": 0.0623275525867939, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029124427237547934, + "grad_norm": 7.246211051940918, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8592713475227356, + "num_tokens": 416378969.0, + "step": 10908 + }, + { + "epoch": 1.3877369291438748, + "ewc_loss": 0.06242965906858444, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029226532205939293, + "grad_norm": 7.233209609985352, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8622523546218872, + "num_tokens": 416412958.0, + "step": 10909 + }, + { + "epoch": 1.3878641394224653, + "ewc_loss": 0.06242342293262482, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002922029816545546, + "grad_norm": 7.266560077667236, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8436225056648254, + "num_tokens": 416445643.0, + "step": 10910 + }, + { + "epoch": 1.3879913497010559, + "ewc_loss": 0.062475211918354034, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002927208843175322, + "grad_norm": 7.231251239776611, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8497004508972168, + "num_tokens": 416484891.0, + "step": 10911 + }, + { + "epoch": 1.3881185599796464, + "ewc_loss": 0.06242404505610466, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029220920987427235, + "grad_norm": 7.280567169189453, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8503204584121704, + "num_tokens": 416520504.0, + "step": 10912 + }, + { + "epoch": 1.388245770258237, + "ewc_loss": 0.06239476054906845, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029191633802838624, + "grad_norm": 7.195549964904785, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.863174557685852, + "num_tokens": 416561877.0, + "step": 10913 + }, + { + "epoch": 1.3883729805368275, + "ewc_loss": 0.062454499304294586, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002925137523561716, + "grad_norm": 7.274697780609131, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8630213737487793, + "num_tokens": 416597676.0, + "step": 10914 + }, + { + "epoch": 1.3885001908154178, + "ewc_loss": 0.062393657863140106, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002919053367804736, + "grad_norm": 7.260967254638672, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8704847097396851, + "num_tokens": 416630557.0, + "step": 10915 + }, + { + "epoch": 1.3886274010940083, + "ewc_loss": 0.06238105148077011, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000291779258986935, + "grad_norm": 7.2066569328308105, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8646343946456909, + "num_tokens": 416668604.0, + "step": 10916 + }, + { + "epoch": 1.3887546113725988, + "ewc_loss": 0.06242550164461136, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029222379089333117, + "grad_norm": 7.271718502044678, + "learning_rate": 1e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8430423736572266, + "num_tokens": 416710246.0, + "step": 10917 + }, + { + "epoch": 1.3888818216511893, + "ewc_loss": 0.06233289837837219, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002912977070081979, + "grad_norm": 7.256948947906494, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8637993335723877, + "num_tokens": 416744962.0, + "step": 10918 + }, + { + "epoch": 1.3890090319297799, + "ewc_loss": 0.0624857172369957, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000292825949145481, + "grad_norm": 7.294960975646973, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8647039532661438, + "num_tokens": 416779483.0, + "step": 10919 + }, + { + "epoch": 1.3891362422083704, + "ewc_loss": 0.06222151964902878, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029018393252044916, + "grad_norm": 7.176637649536133, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8758015036582947, + "num_tokens": 416822135.0, + "step": 10920 + }, + { + "epoch": 1.389263452486961, + "ewc_loss": 0.06272036582231522, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029273098334670067, + "grad_norm": 7.2770867347717285, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8608146905899048, + "num_tokens": 416860383.0, + "step": 10921 + }, + { + "epoch": 1.3893906627655515, + "ewc_loss": 0.06232325732707977, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002912013151217252, + "grad_norm": 7.260691165924072, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8583120107650757, + "num_tokens": 416896960.0, + "step": 10922 + }, + { + "epoch": 1.389517873044142, + "ewc_loss": 0.062440067529678345, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029236942646093667, + "grad_norm": 7.24539041519165, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8741908073425293, + "num_tokens": 416931981.0, + "step": 10923 + }, + { + "epoch": 1.3896450833227325, + "ewc_loss": 0.06268288195133209, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029235618421807885, + "grad_norm": 7.249420166015625, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8721392154693604, + "num_tokens": 416968179.0, + "step": 10924 + }, + { + "epoch": 1.389772293601323, + "ewc_loss": 0.062460899353027344, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029257775167934597, + "grad_norm": 7.234542369842529, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8671855330467224, + "num_tokens": 417008879.0, + "step": 10925 + }, + { + "epoch": 1.3898995038799136, + "ewc_loss": 0.06241223216056824, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029209107742644846, + "grad_norm": 7.243966102600098, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8734169006347656, + "num_tokens": 417042497.0, + "step": 10926 + }, + { + "epoch": 1.390026714158504, + "ewc_loss": 0.06244561821222305, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002924249565694481, + "grad_norm": 7.25511360168457, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8632858395576477, + "num_tokens": 417084396.0, + "step": 10927 + }, + { + "epoch": 1.3901539244370946, + "ewc_loss": 0.06242901086807251, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029225886100903153, + "grad_norm": 7.2383928298950195, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.875436544418335, + "num_tokens": 417121014.0, + "step": 10928 + }, + { + "epoch": 1.3902811347156852, + "ewc_loss": 0.06255746632814407, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029354338767006993, + "grad_norm": 7.272706985473633, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8617314100265503, + "num_tokens": 417160169.0, + "step": 10929 + }, + { + "epoch": 1.3904083449942755, + "ewc_loss": 0.06243520975112915, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002923208521679044, + "grad_norm": 7.304372310638428, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8462961912155151, + "num_tokens": 417195024.0, + "step": 10930 + }, + { + "epoch": 1.390535555272866, + "ewc_loss": 0.062442801892757416, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029239675495773554, + "grad_norm": 7.263701915740967, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8689169883728027, + "num_tokens": 417231344.0, + "step": 10931 + }, + { + "epoch": 1.3906627655514565, + "ewc_loss": 0.06272924691438675, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002928198082372546, + "grad_norm": 7.438523292541504, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8533873558044434, + "num_tokens": 417270266.0, + "step": 10932 + }, + { + "epoch": 1.390789975830047, + "ewc_loss": 0.0622907355427742, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029087610892020166, + "grad_norm": 7.215872287750244, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8604289293289185, + "num_tokens": 417308292.0, + "step": 10933 + }, + { + "epoch": 1.3909171861086376, + "ewc_loss": 0.062381498515605927, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002917837118729949, + "grad_norm": 7.259187698364258, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8591539859771729, + "num_tokens": 417346358.0, + "step": 10934 + }, + { + "epoch": 1.391044396387228, + "ewc_loss": 0.0623592808842659, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029156156233511865, + "grad_norm": 7.27684211730957, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8638973236083984, + "num_tokens": 417381924.0, + "step": 10935 + }, + { + "epoch": 1.3911716066658186, + "ewc_loss": 0.06230536103248596, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029102235566824675, + "grad_norm": 7.283699035644531, + "learning_rate": 1e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8367608189582825, + "num_tokens": 417417779.0, + "step": 10936 + }, + { + "epoch": 1.3912988169444092, + "ewc_loss": 0.06236041337251663, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002915728837251663, + "grad_norm": 7.252288818359375, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8543898463249207, + "num_tokens": 417459076.0, + "step": 10937 + }, + { + "epoch": 1.3914260272229997, + "ewc_loss": 0.062462836503982544, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002925971057265997, + "grad_norm": 7.331364631652832, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8624520897865295, + "num_tokens": 417499422.0, + "step": 10938 + }, + { + "epoch": 1.39155323750159, + "ewc_loss": 0.06235413998365402, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029151016497053206, + "grad_norm": 7.3144402503967285, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8631212115287781, + "num_tokens": 417541776.0, + "step": 10939 + }, + { + "epoch": 1.3916804477801805, + "ewc_loss": 0.062298040837049484, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029094915953464806, + "grad_norm": 7.32489538192749, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8503344655036926, + "num_tokens": 417576469.0, + "step": 10940 + }, + { + "epoch": 1.391807658058771, + "ewc_loss": 0.06222550570964813, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029022383387200534, + "grad_norm": 7.210268974304199, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8634859323501587, + "num_tokens": 417611726.0, + "step": 10941 + }, + { + "epoch": 1.3919348683373616, + "ewc_loss": 0.06265997141599655, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029212707886472344, + "grad_norm": 7.374915599822998, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8552116751670837, + "num_tokens": 417650135.0, + "step": 10942 + }, + { + "epoch": 1.392062078615952, + "ewc_loss": 0.06214381754398346, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002894069184549153, + "grad_norm": 7.242097854614258, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8441328406333923, + "num_tokens": 417690981.0, + "step": 10943 + }, + { + "epoch": 1.3921892888945426, + "ewc_loss": 0.062411580234766006, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029208455816842616, + "grad_norm": 7.315849781036377, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8492951989173889, + "num_tokens": 417732152.0, + "step": 10944 + }, + { + "epoch": 1.3923164991731332, + "ewc_loss": 0.06221519410610199, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002901206898968667, + "grad_norm": 7.249279022216797, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8526421189308167, + "num_tokens": 417771814.0, + "step": 10945 + }, + { + "epoch": 1.3924437094517237, + "ewc_loss": 0.06230901926755905, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029105893918313086, + "grad_norm": 7.339780807495117, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8510831594467163, + "num_tokens": 417810663.0, + "step": 10946 + }, + { + "epoch": 1.3925709197303142, + "ewc_loss": 0.06251853704452515, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000290712749119848, + "grad_norm": 7.25032377243042, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8675357103347778, + "num_tokens": 417845664.0, + "step": 10947 + }, + { + "epoch": 1.3926981300089047, + "ewc_loss": 0.0625840499997139, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029136784723959863, + "grad_norm": 7.352359294891357, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8545418977737427, + "num_tokens": 417889617.0, + "step": 10948 + }, + { + "epoch": 1.3928253402874953, + "ewc_loss": 0.06243886053562164, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00028991594444960356, + "grad_norm": 7.22310209274292, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8587031364440918, + "num_tokens": 417930820.0, + "step": 10949 + }, + { + "epoch": 1.3929525505660858, + "ewc_loss": 0.06231894716620445, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002911582123488188, + "grad_norm": 7.355513095855713, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8635680675506592, + "num_tokens": 417966799.0, + "step": 10950 + }, + { + "epoch": 1.3930797608446763, + "ewc_loss": 0.062162354588508606, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002895922807510942, + "grad_norm": 7.220996379852295, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8535901308059692, + "num_tokens": 418009328.0, + "step": 10951 + }, + { + "epoch": 1.3932069711232669, + "ewc_loss": 0.062289047986269, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029085922869853675, + "grad_norm": 7.44994592666626, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8638330101966858, + "num_tokens": 418048548.0, + "step": 10952 + }, + { + "epoch": 1.3933341814018574, + "ewc_loss": 0.0620696023106575, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002886647707782686, + "grad_norm": 7.181504726409912, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8598220944404602, + "num_tokens": 418094947.0, + "step": 10953 + }, + { + "epoch": 1.393461391680448, + "ewc_loss": 0.062449246644973755, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002924612199421972, + "grad_norm": 7.323854923248291, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8531811237335205, + "num_tokens": 418137144.0, + "step": 10954 + }, + { + "epoch": 1.3935886019590382, + "ewc_loss": 0.06213149428367615, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002892836928367615, + "grad_norm": 7.242532730102539, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8835071325302124, + "num_tokens": 418171449.0, + "step": 10955 + }, + { + "epoch": 1.3937158122376287, + "ewc_loss": 0.06237730011343956, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002917417441494763, + "grad_norm": 7.344437599182129, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8645803928375244, + "num_tokens": 418209854.0, + "step": 10956 + }, + { + "epoch": 1.3938430225162193, + "ewc_loss": 0.06219951808452606, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028996390756219625, + "grad_norm": 7.261744022369385, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8702294826507568, + "num_tokens": 418245121.0, + "step": 10957 + }, + { + "epoch": 1.3939702327948098, + "ewc_loss": 0.06230389326810837, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002910076582338661, + "grad_norm": 7.248254776000977, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8660783171653748, + "num_tokens": 418284158.0, + "step": 10958 + }, + { + "epoch": 1.3940974430734003, + "ewc_loss": 0.06230149790644646, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029098373488523066, + "grad_norm": 7.262369155883789, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8695862293243408, + "num_tokens": 418324091.0, + "step": 10959 + }, + { + "epoch": 1.3942246533519909, + "ewc_loss": 0.06227079778909683, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029067674768157303, + "grad_norm": 7.251492500305176, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8625872135162354, + "num_tokens": 418364764.0, + "step": 10960 + }, + { + "epoch": 1.3943518636305814, + "ewc_loss": 0.062282104045152664, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000290789786959067, + "grad_norm": 7.231929302215576, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8597157001495361, + "num_tokens": 418403974.0, + "step": 10961 + }, + { + "epoch": 1.394479073909172, + "ewc_loss": 0.062372855842113495, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029169730260036886, + "grad_norm": 7.250051975250244, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8510457277297974, + "num_tokens": 418446341.0, + "step": 10962 + }, + { + "epoch": 1.3946062841877624, + "ewc_loss": 0.06230536103248596, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002910223847720772, + "grad_norm": 7.197287559509277, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8623456358909607, + "num_tokens": 418490989.0, + "step": 10963 + }, + { + "epoch": 1.3947334944663528, + "ewc_loss": 0.06249881535768509, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029295688727870584, + "grad_norm": 7.348632335662842, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8571792840957642, + "num_tokens": 418524633.0, + "step": 10964 + }, + { + "epoch": 1.3948607047449433, + "ewc_loss": 0.06227206066250801, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002906893496401608, + "grad_norm": 7.207156658172607, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8611584901809692, + "num_tokens": 418562576.0, + "step": 10965 + }, + { + "epoch": 1.3949879150235338, + "ewc_loss": 0.06249576807022095, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029292641556821764, + "grad_norm": 7.330039978027344, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8614014387130737, + "num_tokens": 418596537.0, + "step": 10966 + }, + { + "epoch": 1.3951151253021243, + "ewc_loss": 0.06225254014134407, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002904941502492875, + "grad_norm": 7.239234924316406, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8595048189163208, + "num_tokens": 418625800.0, + "step": 10967 + }, + { + "epoch": 1.3952423355807149, + "ewc_loss": 0.06254734098911285, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029344213544391096, + "grad_norm": 7.327208995819092, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8473506569862366, + "num_tokens": 418663818.0, + "step": 10968 + }, + { + "epoch": 1.3953695458593054, + "ewc_loss": 0.062310218811035156, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029107092996127903, + "grad_norm": 7.207430839538574, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8674830794334412, + "num_tokens": 418700607.0, + "step": 10969 + }, + { + "epoch": 1.395496756137896, + "ewc_loss": 0.0625690296292305, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000293659046292305, + "grad_norm": 7.2783942222595215, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8553625345230103, + "num_tokens": 418739357.0, + "step": 10970 + }, + { + "epoch": 1.3956239664164865, + "ewc_loss": 0.06237093359231949, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002916780940722674, + "grad_norm": 7.2918925285339355, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8673146963119507, + "num_tokens": 418769752.0, + "step": 10971 + }, + { + "epoch": 1.395751176695077, + "ewc_loss": 0.062500961124897, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029297833680175245, + "grad_norm": 7.265194416046143, + "learning_rate": 1e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8427720069885254, + "num_tokens": 418808196.0, + "step": 10972 + }, + { + "epoch": 1.3958783869736675, + "ewc_loss": 0.062468841671943665, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029265714692883193, + "grad_norm": 7.234801769256592, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8778723478317261, + "num_tokens": 418843335.0, + "step": 10973 + }, + { + "epoch": 1.396005597252258, + "ewc_loss": 0.06257262080907822, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002936949604190886, + "grad_norm": 7.283087253570557, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8708248734474182, + "num_tokens": 418876591.0, + "step": 10974 + }, + { + "epoch": 1.3961328075308486, + "ewc_loss": 0.06266868859529495, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002922142157331109, + "grad_norm": 7.243773460388184, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8498663306236267, + "num_tokens": 418910482.0, + "step": 10975 + }, + { + "epoch": 1.396260017809439, + "ewc_loss": 0.06274998933076859, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002930272603407502, + "grad_norm": 7.2780842781066895, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8532112836837769, + "num_tokens": 418942248.0, + "step": 10976 + }, + { + "epoch": 1.3963872280880296, + "ewc_loss": 0.06267891824245453, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002923165448009968, + "grad_norm": 7.27305269241333, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8667566180229187, + "num_tokens": 418977770.0, + "step": 10977 + }, + { + "epoch": 1.3965144383666201, + "ewc_loss": 0.06273537129163742, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029288107180036604, + "grad_norm": 7.260260105133057, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8548258543014526, + "num_tokens": 419016464.0, + "step": 10978 + }, + { + "epoch": 1.3966416486452105, + "ewc_loss": 0.06266028434038162, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029213016387075186, + "grad_norm": 7.233928203582764, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8536481261253357, + "num_tokens": 419053254.0, + "step": 10979 + }, + { + "epoch": 1.396768858923801, + "ewc_loss": 0.06274436414241791, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029297100263647735, + "grad_norm": 7.276405334472656, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8552867770195007, + "num_tokens": 419088940.0, + "step": 10980 + }, + { + "epoch": 1.3968960692023915, + "ewc_loss": 0.0627274215221405, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002928015892393887, + "grad_norm": 7.248201370239258, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8606474995613098, + "num_tokens": 419127897.0, + "step": 10981 + }, + { + "epoch": 1.397023279480982, + "ewc_loss": 0.06275902688503265, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029311765683814883, + "grad_norm": 7.257480621337891, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.863987922668457, + "num_tokens": 419170921.0, + "step": 10982 + }, + { + "epoch": 1.3971504897595726, + "ewc_loss": 0.06266048550605774, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029213220113888383, + "grad_norm": 7.247405529022217, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.868561863899231, + "num_tokens": 419206032.0, + "step": 10983 + }, + { + "epoch": 1.397277700038163, + "ewc_loss": 0.06265193969011307, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002920467231888324, + "grad_norm": 7.192821979522705, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8784117102622986, + "num_tokens": 419244240.0, + "step": 10984 + }, + { + "epoch": 1.3974049103167536, + "ewc_loss": 0.06283794343471527, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029390674899332225, + "grad_norm": 7.286935806274414, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.871161162853241, + "num_tokens": 419274902.0, + "step": 10985 + }, + { + "epoch": 1.3975321205953442, + "ewc_loss": 0.06260260939598083, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029155341326259077, + "grad_norm": 7.2357563972473145, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8734219670295715, + "num_tokens": 419312295.0, + "step": 10986 + }, + { + "epoch": 1.3976593308739347, + "ewc_loss": 0.06280077248811722, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002935350639745593, + "grad_norm": 7.292366027832031, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8446559906005859, + "num_tokens": 419350999.0, + "step": 10987 + }, + { + "epoch": 1.397786541152525, + "ewc_loss": 0.06271729618310928, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002927002788055688, + "grad_norm": 7.229207515716553, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8629717826843262, + "num_tokens": 419386739.0, + "step": 10988 + }, + { + "epoch": 1.3979137514311155, + "ewc_loss": 0.06289947032928467, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029452203307300806, + "grad_norm": 7.234017372131348, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8688735961914062, + "num_tokens": 419430353.0, + "step": 10989 + }, + { + "epoch": 1.398040961709706, + "ewc_loss": 0.06277285516262054, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029325587092898786, + "grad_norm": 7.30216121673584, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8583265542984009, + "num_tokens": 419462910.0, + "step": 10990 + }, + { + "epoch": 1.3981681719882966, + "ewc_loss": 0.06276977062225342, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002932250499725342, + "grad_norm": 7.263075351715088, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8714443445205688, + "num_tokens": 419498727.0, + "step": 10991 + }, + { + "epoch": 1.398295382266887, + "ewc_loss": 0.06281215697526932, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002936488890554756, + "grad_norm": 7.27701473236084, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.854825496673584, + "num_tokens": 419534496.0, + "step": 10992 + }, + { + "epoch": 1.3984225925454776, + "ewc_loss": 0.06272360682487488, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002927634341176599, + "grad_norm": 7.260270118713379, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8585659265518188, + "num_tokens": 419575760.0, + "step": 10993 + }, + { + "epoch": 1.3985498028240682, + "ewc_loss": 0.06262528896331787, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029178024851717055, + "grad_norm": 7.173288345336914, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8698742389678955, + "num_tokens": 419616822.0, + "step": 10994 + }, + { + "epoch": 1.3986770131026587, + "ewc_loss": 0.0627991333603859, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029351867851801217, + "grad_norm": 7.2761664390563965, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8687753677368164, + "num_tokens": 419650425.0, + "step": 10995 + }, + { + "epoch": 1.3988042233812492, + "ewc_loss": 0.06273423135280609, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000292869663098827, + "grad_norm": 7.203864097595215, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8685951232910156, + "num_tokens": 419685501.0, + "step": 10996 + }, + { + "epoch": 1.3989314336598397, + "ewc_loss": 0.06288092583417892, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029433658346533775, + "grad_norm": 7.255898952484131, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8655979633331299, + "num_tokens": 419724471.0, + "step": 10997 + }, + { + "epoch": 1.3990586439384303, + "ewc_loss": 0.06272157281637192, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029274309054017067, + "grad_norm": 7.241215705871582, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8698350787162781, + "num_tokens": 419763683.0, + "step": 10998 + }, + { + "epoch": 1.3991858542170208, + "ewc_loss": 0.06287315487861633, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002942589344456792, + "grad_norm": 7.230457782745361, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8657643795013428, + "num_tokens": 419801238.0, + "step": 10999 + }, + { + "epoch": 1.3993130644956113, + "ewc_loss": 0.06278637051582336, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029339102911762893, + "grad_norm": 7.288518905639648, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8590338230133057, + "num_tokens": 419832787.0, + "step": 11000 + }, + { + "epoch": 1.3994402747742019, + "ewc_loss": 0.06246544420719147, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002926231827586889, + "grad_norm": 7.203168869018555, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8521562814712524, + "num_tokens": 419864294.0, + "step": 11001 + }, + { + "epoch": 1.3995674850527924, + "ewc_loss": 0.06289266049861908, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002944539301097393, + "grad_norm": 7.286725044250488, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8656699657440186, + "num_tokens": 419900493.0, + "step": 11002 + }, + { + "epoch": 1.399694695331383, + "ewc_loss": 0.0627434104681015, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000292961485683918, + "grad_norm": 7.188534259796143, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8800151348114014, + "num_tokens": 419938917.0, + "step": 11003 + }, + { + "epoch": 1.3998219056099732, + "ewc_loss": 0.06290338933467865, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029456126503646374, + "grad_norm": 7.254188537597656, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8780763745307922, + "num_tokens": 419978771.0, + "step": 11004 + }, + { + "epoch": 1.3999491158885637, + "ewc_loss": 0.06276679039001465, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029319527675397694, + "grad_norm": 7.2398681640625, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8606201410293579, + "num_tokens": 420011836.0, + "step": 11005 + }, + { + "epoch": 1.4000763261671543, + "ewc_loss": 0.06289907544851303, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002945181040558964, + "grad_norm": 7.235141277313232, + "learning_rate": 1e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.8422080278396606, + "num_tokens": 420051618.0, + "step": 11006 + }, + { + "epoch": 1.4002035364457448, + "ewc_loss": 0.06279877573251724, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029351512785069644, + "grad_norm": 7.298886299133301, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8485742211341858, + "num_tokens": 420089119.0, + "step": 11007 + }, + { + "epoch": 1.4003307467243353, + "ewc_loss": 0.06282775104045868, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029380485648289323, + "grad_norm": 7.197540283203125, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8721262812614441, + "num_tokens": 420123706.0, + "step": 11008 + }, + { + "epoch": 1.4004579570029259, + "ewc_loss": 0.06278231739997864, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002933504874818027, + "grad_norm": 7.210231781005859, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8598280549049377, + "num_tokens": 420156723.0, + "step": 11009 + }, + { + "epoch": 1.4005851672815164, + "ewc_loss": 0.06276194006204605, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029314676066860557, + "grad_norm": 7.196186542510986, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8613799214363098, + "num_tokens": 420198519.0, + "step": 11010 + }, + { + "epoch": 1.400712377560107, + "ewc_loss": 0.06286998838186264, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029422721127048135, + "grad_norm": 7.21842098236084, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8751858472824097, + "num_tokens": 420231722.0, + "step": 11011 + }, + { + "epoch": 1.4008395878386974, + "ewc_loss": 0.06280647218227386, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029359207837842405, + "grad_norm": 7.212881565093994, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8656849265098572, + "num_tokens": 420272059.0, + "step": 11012 + }, + { + "epoch": 1.4009667981172877, + "ewc_loss": 0.0629163458943367, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002946907770819962, + "grad_norm": 7.28018045425415, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8703705072402954, + "num_tokens": 420307695.0, + "step": 11013 + }, + { + "epoch": 1.4010940083958783, + "ewc_loss": 0.06275773048400879, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002931046183221042, + "grad_norm": 7.212348461151123, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8726134300231934, + "num_tokens": 420345989.0, + "step": 11014 + }, + { + "epoch": 1.4012212186744688, + "ewc_loss": 0.06291691958904266, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002946965687442571, + "grad_norm": 7.24893045425415, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8687766194343567, + "num_tokens": 420387194.0, + "step": 11015 + }, + { + "epoch": 1.4013484289530593, + "ewc_loss": 0.06286486983299255, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029417601763270795, + "grad_norm": 7.270856857299805, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8782451748847961, + "num_tokens": 420427608.0, + "step": 11016 + }, + { + "epoch": 1.4014756392316499, + "ewc_loss": 0.062820665538311, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002937339886557311, + "grad_norm": 7.299188613891602, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8685011863708496, + "num_tokens": 420461363.0, + "step": 11017 + }, + { + "epoch": 1.4016028495102404, + "ewc_loss": 0.06277885288000107, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029331588302738965, + "grad_norm": 7.24240779876709, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8459459543228149, + "num_tokens": 420503577.0, + "step": 11018 + }, + { + "epoch": 1.401730059788831, + "ewc_loss": 0.06292886286973953, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029481598176062107, + "grad_norm": 7.3942952156066895, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8736252784729004, + "num_tokens": 420538852.0, + "step": 11019 + }, + { + "epoch": 1.4018572700674214, + "ewc_loss": 0.06255462765693665, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029107366572134197, + "grad_norm": 7.184326648712158, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8611565828323364, + "num_tokens": 420577495.0, + "step": 11020 + }, + { + "epoch": 1.401984480346012, + "ewc_loss": 0.06294354051351547, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002949627232737839, + "grad_norm": 7.382209777832031, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8633682727813721, + "num_tokens": 420616123.0, + "step": 11021 + }, + { + "epoch": 1.4021116906246025, + "ewc_loss": 0.062341563403606415, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029138437821529806, + "grad_norm": 7.2112956047058105, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8737523555755615, + "num_tokens": 420653904.0, + "step": 11022 + }, + { + "epoch": 1.402238900903193, + "ewc_loss": 0.06282167136669159, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029374402947723866, + "grad_norm": 7.285438537597656, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8869631290435791, + "num_tokens": 420693539.0, + "step": 11023 + }, + { + "epoch": 1.4023661111817836, + "ewc_loss": 0.06238536909222603, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002918224490713328, + "grad_norm": 7.406064510345459, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8667309880256653, + "num_tokens": 420729046.0, + "step": 11024 + }, + { + "epoch": 1.402493321460374, + "ewc_loss": 0.06250827014446259, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029061007080599666, + "grad_norm": 7.202545642852783, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.866170346736908, + "num_tokens": 420766143.0, + "step": 11025 + }, + { + "epoch": 1.4026205317389646, + "ewc_loss": 0.06284290552139282, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000293956371024251, + "grad_norm": 7.3987932205200195, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8594794273376465, + "num_tokens": 420806367.0, + "step": 11026 + }, + { + "epoch": 1.4027477420175551, + "ewc_loss": 0.062145452946424484, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028942327480763197, + "grad_norm": 7.155221462249756, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8702877163887024, + "num_tokens": 420845081.0, + "step": 11027 + }, + { + "epoch": 1.4028749522961454, + "ewc_loss": 0.06270299106836319, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000294998666504398, + "grad_norm": 7.485827922821045, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8688260912895203, + "num_tokens": 420885833.0, + "step": 11028 + }, + { + "epoch": 1.403002162574736, + "ewc_loss": 0.06204377859830856, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00028840656159445643, + "grad_norm": 7.264388561248779, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8683032989501953, + "num_tokens": 420916313.0, + "step": 11029 + }, + { + "epoch": 1.4031293728533265, + "ewc_loss": 0.06259793788194656, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029394813464023173, + "grad_norm": 7.571262836456299, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8451859354972839, + "num_tokens": 420957011.0, + "step": 11030 + }, + { + "epoch": 1.403256583131917, + "ewc_loss": 0.06196625158190727, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002876312646549195, + "grad_norm": 7.124687671661377, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8662563562393188, + "num_tokens": 420994815.0, + "step": 11031 + }, + { + "epoch": 1.4033837934105076, + "ewc_loss": 0.06286798417568207, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029664862086065114, + "grad_norm": 7.808358192443848, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.870845377445221, + "num_tokens": 421033977.0, + "step": 11032 + }, + { + "epoch": 1.403511003689098, + "ewc_loss": 0.06193903088569641, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002873590565286577, + "grad_norm": 7.079479694366455, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8668712377548218, + "num_tokens": 421068871.0, + "step": 11033 + }, + { + "epoch": 1.4036382139676886, + "ewc_loss": 0.06310094892978668, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029897820786572993, + "grad_norm": 7.6701459884643555, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8638346791267395, + "num_tokens": 421105680.0, + "step": 11034 + }, + { + "epoch": 1.4037654242462791, + "ewc_loss": 0.06202295422554016, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002881982654798776, + "grad_norm": 7.100619792938232, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8755443096160889, + "num_tokens": 421149519.0, + "step": 11035 + }, + { + "epoch": 1.4038926345248697, + "ewc_loss": 0.06305532157421112, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029852197621949017, + "grad_norm": 7.528832912445068, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8652815222740173, + "num_tokens": 421190195.0, + "step": 11036 + }, + { + "epoch": 1.40401984480346, + "ewc_loss": 0.06222974509000778, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029026620904915035, + "grad_norm": 7.350824356079102, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8585354685783386, + "num_tokens": 421218637.0, + "step": 11037 + }, + { + "epoch": 1.4041470550820505, + "ewc_loss": 0.06265412271022797, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002945099549833685, + "grad_norm": 7.363472938537598, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8681721687316895, + "num_tokens": 421261300.0, + "step": 11038 + }, + { + "epoch": 1.404274265360641, + "ewc_loss": 0.06235995888710022, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002915683144237846, + "grad_norm": 7.3003668785095215, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.860305666923523, + "num_tokens": 421297250.0, + "step": 11039 + }, + { + "epoch": 1.4044014756392316, + "ewc_loss": 0.06250153481960297, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002929840993601829, + "grad_norm": 7.30836820602417, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.856042206287384, + "num_tokens": 421334213.0, + "step": 11040 + }, + { + "epoch": 1.404528685917822, + "ewc_loss": 0.062444835901260376, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029241712763905525, + "grad_norm": 7.307806968688965, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8714083433151245, + "num_tokens": 421368341.0, + "step": 11041 + }, + { + "epoch": 1.4046558961964126, + "ewc_loss": 0.062466323375701904, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002926320012193173, + "grad_norm": 7.246060848236084, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8514162302017212, + "num_tokens": 421408278.0, + "step": 11042 + }, + { + "epoch": 1.4047831064750032, + "ewc_loss": 0.06282465904951096, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002937739191111177, + "grad_norm": 7.344531059265137, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8545955419540405, + "num_tokens": 421442559.0, + "step": 11043 + }, + { + "epoch": 1.4049103167535937, + "ewc_loss": 0.06268422305583954, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029236957198008895, + "grad_norm": 7.288640022277832, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8600726127624512, + "num_tokens": 421474014.0, + "step": 11044 + }, + { + "epoch": 1.4050375270321842, + "ewc_loss": 0.06283099204301834, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029383727815002203, + "grad_norm": 7.318000793457031, + "learning_rate": 1e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8389208912849426, + "num_tokens": 421511193.0, + "step": 11045 + }, + { + "epoch": 1.4051647373107747, + "ewc_loss": 0.06272368878126144, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029276421992108226, + "grad_norm": 7.243020057678223, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8642224669456482, + "num_tokens": 421547895.0, + "step": 11046 + }, + { + "epoch": 1.4052919475893653, + "ewc_loss": 0.06262297928333282, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029419854399748147, + "grad_norm": 7.292803764343262, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8615610003471375, + "num_tokens": 421582883.0, + "step": 11047 + }, + { + "epoch": 1.4054191578679558, + "ewc_loss": 0.06255998462438583, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029356859158724546, + "grad_norm": 7.3060832023620605, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8661229610443115, + "num_tokens": 421619838.0, + "step": 11048 + }, + { + "epoch": 1.4055463681465463, + "ewc_loss": 0.0625627264380455, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029359603649936616, + "grad_norm": 7.257877349853516, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8735182285308838, + "num_tokens": 421659451.0, + "step": 11049 + }, + { + "epoch": 1.4056735784251368, + "ewc_loss": 0.06286713480949402, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029419868951663375, + "grad_norm": 7.2747673988342285, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8599705696105957, + "num_tokens": 421704826.0, + "step": 11050 + }, + { + "epoch": 1.4058007887037274, + "ewc_loss": 0.06269372254610062, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002924645959865302, + "grad_norm": 7.218960762023926, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8596042990684509, + "num_tokens": 421748953.0, + "step": 11051 + }, + { + "epoch": 1.405927998982318, + "ewc_loss": 0.06292150914669037, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029474240727722645, + "grad_norm": 7.302825450897217, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8457517623901367, + "num_tokens": 421786614.0, + "step": 11052 + }, + { + "epoch": 1.4060552092609082, + "ewc_loss": 0.0625087171792984, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002930559276137501, + "grad_norm": 7.339500427246094, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8577033281326294, + "num_tokens": 421817291.0, + "step": 11053 + }, + { + "epoch": 1.4061824195394987, + "ewc_loss": 0.06280291080474854, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000293556455289945, + "grad_norm": 7.2864251136779785, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8613688349723816, + "num_tokens": 421858200.0, + "step": 11054 + }, + { + "epoch": 1.4063096298180893, + "ewc_loss": 0.06281064450740814, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000293633813271299, + "grad_norm": 7.246953964233398, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8629502654075623, + "num_tokens": 421903343.0, + "step": 11055 + }, + { + "epoch": 1.4064368400966798, + "ewc_loss": 0.06276053190231323, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029313264531083405, + "grad_norm": 7.293202877044678, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8797138929367065, + "num_tokens": 421941426.0, + "step": 11056 + }, + { + "epoch": 1.4065640503752703, + "ewc_loss": 0.06277981400489807, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002933255163952708, + "grad_norm": 7.2833733558654785, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8594486713409424, + "num_tokens": 421983992.0, + "step": 11057 + }, + { + "epoch": 1.4066912606538609, + "ewc_loss": 0.06272855401039124, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029281285242177546, + "grad_norm": 7.257785797119141, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8713216781616211, + "num_tokens": 422015066.0, + "step": 11058 + }, + { + "epoch": 1.4068184709324514, + "ewc_loss": 0.06281515955924988, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002936789533123374, + "grad_norm": 7.269815444946289, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8670362234115601, + "num_tokens": 422052392.0, + "step": 11059 + }, + { + "epoch": 1.406945681211042, + "ewc_loss": 0.06284831464290619, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029401047504507005, + "grad_norm": 7.303336143493652, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8603569269180298, + "num_tokens": 422091453.0, + "step": 11060 + }, + { + "epoch": 1.4070728914896324, + "ewc_loss": 0.06291215121746063, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029464889666996896, + "grad_norm": 7.287539005279541, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8478002548217773, + "num_tokens": 422131019.0, + "step": 11061 + }, + { + "epoch": 1.4072001017682227, + "ewc_loss": 0.06282484531402588, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002937758108600974, + "grad_norm": 7.250782012939453, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8564585447311401, + "num_tokens": 422172139.0, + "step": 11062 + }, + { + "epoch": 1.4073273120468133, + "ewc_loss": 0.06295146048069, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002950419730041176, + "grad_norm": 7.307758331298828, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8700800538063049, + "num_tokens": 422209708.0, + "step": 11063 + }, + { + "epoch": 1.4074545223254038, + "ewc_loss": 0.06257640570402145, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002937328245025128, + "grad_norm": 7.278039455413818, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8641021251678467, + "num_tokens": 422245091.0, + "step": 11064 + }, + { + "epoch": 1.4075817326039943, + "ewc_loss": 0.06277847290039062, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029575350345112383, + "grad_norm": 7.325865745544434, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.863129734992981, + "num_tokens": 422280054.0, + "step": 11065 + }, + { + "epoch": 1.4077089428825849, + "ewc_loss": 0.06256219744682312, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002935907104983926, + "grad_norm": 7.291551113128662, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8699747323989868, + "num_tokens": 422310036.0, + "step": 11066 + }, + { + "epoch": 1.4078361531611754, + "ewc_loss": 0.06290130317211151, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029454033938236535, + "grad_norm": 7.310328960418701, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8467953205108643, + "num_tokens": 422347914.0, + "step": 11067 + }, + { + "epoch": 1.407963363439766, + "ewc_loss": 0.06286273896694183, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029415477183647454, + "grad_norm": 7.286248207092285, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.864158034324646, + "num_tokens": 422383207.0, + "step": 11068 + }, + { + "epoch": 1.4080905737183564, + "ewc_loss": 0.06282661855220795, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002937935059890151, + "grad_norm": 7.279697895050049, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8662009239196777, + "num_tokens": 422414675.0, + "step": 11069 + }, + { + "epoch": 1.408217783996947, + "ewc_loss": 0.06284235417842865, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002939508995041251, + "grad_norm": 7.308725357055664, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8521926999092102, + "num_tokens": 422449746.0, + "step": 11070 + }, + { + "epoch": 1.4083449942755375, + "ewc_loss": 0.06272370368242264, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029276436544023454, + "grad_norm": 7.260893821716309, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8562756776809692, + "num_tokens": 422487789.0, + "step": 11071 + }, + { + "epoch": 1.408472204554128, + "ewc_loss": 0.06290684640407562, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002945958112832159, + "grad_norm": 7.280058860778809, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8689594268798828, + "num_tokens": 422521750.0, + "step": 11072 + }, + { + "epoch": 1.4085994148327186, + "ewc_loss": 0.06274278461933136, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002929551701527089, + "grad_norm": 7.198331356048584, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8718951940536499, + "num_tokens": 422562923.0, + "step": 11073 + }, + { + "epoch": 1.408726625111309, + "ewc_loss": 0.0630066841840744, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029559421818703413, + "grad_norm": 7.370006561279297, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8529627323150635, + "num_tokens": 422606832.0, + "step": 11074 + }, + { + "epoch": 1.4088538353898996, + "ewc_loss": 0.06266302615404129, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002921575796790421, + "grad_norm": 7.203964710235596, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8654333353042603, + "num_tokens": 422644465.0, + "step": 11075 + }, + { + "epoch": 1.4089810456684901, + "ewc_loss": 0.063079334795475, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029632067889906466, + "grad_norm": 7.319370746612549, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.870998740196228, + "num_tokens": 422681204.0, + "step": 11076 + }, + { + "epoch": 1.4091082559470804, + "ewc_loss": 0.06282759457826614, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029380328487604856, + "grad_norm": 7.299597263336182, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8527672290802002, + "num_tokens": 422716325.0, + "step": 11077 + }, + { + "epoch": 1.409235466225671, + "ewc_loss": 0.06287218630313873, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029424921376630664, + "grad_norm": 7.2482171058654785, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8528411388397217, + "num_tokens": 422753644.0, + "step": 11078 + }, + { + "epoch": 1.4093626765042615, + "ewc_loss": 0.06298049539327621, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000295332312816754, + "grad_norm": 7.362027645111084, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8629326820373535, + "num_tokens": 422790893.0, + "step": 11079 + }, + { + "epoch": 1.409489886782852, + "ewc_loss": 0.06269513070583344, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029247862403281033, + "grad_norm": 7.2307586669921875, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8549258708953857, + "num_tokens": 422829400.0, + "step": 11080 + }, + { + "epoch": 1.4096170970614426, + "ewc_loss": 0.06298575550317764, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029538487433455884, + "grad_norm": 7.315876007080078, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8722323775291443, + "num_tokens": 422867386.0, + "step": 11081 + }, + { + "epoch": 1.409744307340033, + "ewc_loss": 0.0627303421497345, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029283075127750635, + "grad_norm": 7.180107593536377, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8652350306510925, + "num_tokens": 422910736.0, + "step": 11082 + }, + { + "epoch": 1.4098715176186236, + "ewc_loss": 0.06304939091205597, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029602128779515624, + "grad_norm": 7.286604881286621, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8672544956207275, + "num_tokens": 422950302.0, + "step": 11083 + }, + { + "epoch": 1.4099987278972141, + "ewc_loss": 0.06290711462497711, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029459845973178744, + "grad_norm": 7.275522232055664, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8614451885223389, + "num_tokens": 422985892.0, + "step": 11084 + }, + { + "epoch": 1.4101259381758047, + "ewc_loss": 0.06291036307811737, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029463099781423807, + "grad_norm": 7.308788299560547, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8551632165908813, + "num_tokens": 423020075.0, + "step": 11085 + }, + { + "epoch": 1.410253148454395, + "ewc_loss": 0.06283297389745712, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029385709785856307, + "grad_norm": 7.286002159118652, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8659735321998596, + "num_tokens": 423058260.0, + "step": 11086 + }, + { + "epoch": 1.4103803587329855, + "ewc_loss": 0.0628022700548172, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029355002334341407, + "grad_norm": 7.261430263519287, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8869016170501709, + "num_tokens": 423092158.0, + "step": 11087 + }, + { + "epoch": 1.410507569011576, + "ewc_loss": 0.062864750623703, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029417482437565923, + "grad_norm": 7.338450908660889, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8598591089248657, + "num_tokens": 423123999.0, + "step": 11088 + }, + { + "epoch": 1.4106347792901666, + "ewc_loss": 0.06283561885356903, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002938834950327873, + "grad_norm": 7.24578332901001, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8592052459716797, + "num_tokens": 423167111.0, + "step": 11089 + }, + { + "epoch": 1.410761989568757, + "ewc_loss": 0.06289765983819962, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029450393049046397, + "grad_norm": 7.393430709838867, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8549359440803528, + "num_tokens": 423202060.0, + "step": 11090 + }, + { + "epoch": 1.4108891998473476, + "ewc_loss": 0.06293049454689026, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002923908759839833, + "grad_norm": 7.2727227210998535, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8555783033370972, + "num_tokens": 423241200.0, + "step": 11091 + }, + { + "epoch": 1.4110164101259381, + "ewc_loss": 0.06317374855279922, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029482340323738754, + "grad_norm": 7.292506694793701, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8726891279220581, + "num_tokens": 423284713.0, + "step": 11092 + }, + { + "epoch": 1.4111436204045287, + "ewc_loss": 0.06296034902334213, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002926894521806389, + "grad_norm": 7.278960704803467, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8652957677841187, + "num_tokens": 423322524.0, + "step": 11093 + }, + { + "epoch": 1.4112708306831192, + "ewc_loss": 0.06310196220874786, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002941055572591722, + "grad_norm": 7.315234661102295, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8638155460357666, + "num_tokens": 423360551.0, + "step": 11094 + }, + { + "epoch": 1.4113980409617097, + "ewc_loss": 0.06280913949012756, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002936187374871224, + "grad_norm": 7.294401168823242, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8738842010498047, + "num_tokens": 423395227.0, + "step": 11095 + }, + { + "epoch": 1.4115252512403003, + "ewc_loss": 0.0627366378903389, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002928937319666147, + "grad_norm": 7.363720893859863, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8500633835792542, + "num_tokens": 423434757.0, + "step": 11096 + }, + { + "epoch": 1.4116524615188908, + "ewc_loss": 0.06271235644817352, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002926509187091142, + "grad_norm": 7.330521106719971, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8806604743003845, + "num_tokens": 423464599.0, + "step": 11097 + }, + { + "epoch": 1.4117796717974813, + "ewc_loss": 0.0627727136015892, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002932545030489564, + "grad_norm": 7.3402323722839355, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8559082746505737, + "num_tokens": 423504327.0, + "step": 11098 + }, + { + "epoch": 1.4119068820760718, + "ewc_loss": 0.06286288797855377, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002941561979241669, + "grad_norm": 7.31942892074585, + "learning_rate": 1e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.8455507755279541, + "num_tokens": 423543114.0, + "step": 11099 + }, + { + "epoch": 1.4120340923546624, + "ewc_loss": 0.06267032027244568, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002922305138781667, + "grad_norm": 7.3116865158081055, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8506033420562744, + "num_tokens": 423581727.0, + "step": 11100 + }, + { + "epoch": 1.412161302633253, + "ewc_loss": 0.06272140145301819, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002927414025180042, + "grad_norm": 7.328802108764648, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.868370532989502, + "num_tokens": 423617073.0, + "step": 11101 + }, + { + "epoch": 1.4122885129118432, + "ewc_loss": 0.0626303181052208, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029183051083236933, + "grad_norm": 7.22024393081665, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8677574992179871, + "num_tokens": 423664922.0, + "step": 11102 + }, + { + "epoch": 1.4124157231904337, + "ewc_loss": 0.06288091093301773, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029433643794618547, + "grad_norm": 7.3361382484436035, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8551206588745117, + "num_tokens": 423706990.0, + "step": 11103 + }, + { + "epoch": 1.4125429334690243, + "ewc_loss": 0.06262721866369247, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029179954435676336, + "grad_norm": 7.279560089111328, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8459988236427307, + "num_tokens": 423745945.0, + "step": 11104 + }, + { + "epoch": 1.4126701437476148, + "ewc_loss": 0.06281674653291702, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029369478579610586, + "grad_norm": 7.343705177307129, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8696961402893066, + "num_tokens": 423780824.0, + "step": 11105 + }, + { + "epoch": 1.4127973540262053, + "ewc_loss": 0.06260822713375092, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002916096127592027, + "grad_norm": 7.201383590698242, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8762795329093933, + "num_tokens": 423818640.0, + "step": 11106 + }, + { + "epoch": 1.4129245643047958, + "ewc_loss": 0.06289219111204147, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002944492734968662, + "grad_norm": 7.317310333251953, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8718265295028687, + "num_tokens": 423856657.0, + "step": 11107 + }, + { + "epoch": 1.4130517745833864, + "ewc_loss": 0.06272262334823608, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002927535679191351, + "grad_norm": 7.247183322906494, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8750841617584229, + "num_tokens": 423900288.0, + "step": 11108 + }, + { + "epoch": 1.413178984861977, + "ewc_loss": 0.06294400990009308, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002949674380943179, + "grad_norm": 7.377877235412598, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8695882558822632, + "num_tokens": 423935929.0, + "step": 11109 + }, + { + "epoch": 1.4133061951405674, + "ewc_loss": 0.06271921098232269, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029271942912600935, + "grad_norm": 7.296345233917236, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8613227605819702, + "num_tokens": 423970719.0, + "step": 11110 + }, + { + "epoch": 1.4134334054191577, + "ewc_loss": 0.06296725571155548, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029519989038817585, + "grad_norm": 7.458314895629883, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8632621169090271, + "num_tokens": 424012338.0, + "step": 11111 + }, + { + "epoch": 1.4135606156977483, + "ewc_loss": 0.06259806454181671, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002915079821832478, + "grad_norm": 7.231817722320557, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8454769849777222, + "num_tokens": 424050112.0, + "step": 11112 + }, + { + "epoch": 1.4136878259763388, + "ewc_loss": 0.0629955381155014, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002954827214125544, + "grad_norm": 7.325995445251465, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8630399703979492, + "num_tokens": 424092688.0, + "step": 11113 + }, + { + "epoch": 1.4138150362549293, + "ewc_loss": 0.06261876225471497, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029171493952162564, + "grad_norm": 7.236511707305908, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.861746609210968, + "num_tokens": 424131355.0, + "step": 11114 + }, + { + "epoch": 1.4139422465335199, + "ewc_loss": 0.06281884014606476, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002937157405540347, + "grad_norm": 7.250124931335449, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8521164059638977, + "num_tokens": 424177768.0, + "step": 11115 + }, + { + "epoch": 1.4140694568121104, + "ewc_loss": 0.06284712255001068, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029399857157841325, + "grad_norm": 7.298393726348877, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8761221170425415, + "num_tokens": 424215834.0, + "step": 11116 + }, + { + "epoch": 1.414196667090701, + "ewc_loss": 0.06282196193933487, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029374693986028433, + "grad_norm": 7.225541114807129, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8677690029144287, + "num_tokens": 424260002.0, + "step": 11117 + }, + { + "epoch": 1.4143238773692914, + "ewc_loss": 0.06294183433055878, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002949456684291363, + "grad_norm": 7.307496547698975, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8612954616546631, + "num_tokens": 424299343.0, + "step": 11118 + }, + { + "epoch": 1.414451087647882, + "ewc_loss": 0.06293036788702011, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002948310284409672, + "grad_norm": 7.252046585083008, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8568567037582397, + "num_tokens": 424340312.0, + "step": 11119 + }, + { + "epoch": 1.4145782979264725, + "ewc_loss": 0.063024140894413, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029576875385828316, + "grad_norm": 7.35236930847168, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8725600838661194, + "num_tokens": 424376658.0, + "step": 11120 + }, + { + "epoch": 1.414705508205063, + "ewc_loss": 0.06287593394517899, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029428667039610445, + "grad_norm": 7.303027629852295, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8747112154960632, + "num_tokens": 424415499.0, + "step": 11121 + }, + { + "epoch": 1.4148327184836536, + "ewc_loss": 0.06294241547584534, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002949514891952276, + "grad_norm": 7.306711673736572, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8648243546485901, + "num_tokens": 424453401.0, + "step": 11122 + }, + { + "epoch": 1.414959928762244, + "ewc_loss": 0.06266232579946518, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029459199868142605, + "grad_norm": 7.306453227996826, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8669848442077637, + "num_tokens": 424490654.0, + "step": 11123 + }, + { + "epoch": 1.4150871390408346, + "ewc_loss": 0.06269463151693344, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002949150511994958, + "grad_norm": 7.216481685638428, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8732634782791138, + "num_tokens": 424534469.0, + "step": 11124 + }, + { + "epoch": 1.4152143493194251, + "ewc_loss": 0.0628834068775177, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029680278385058045, + "grad_norm": 7.394469261169434, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8541738986968994, + "num_tokens": 424570187.0, + "step": 11125 + }, + { + "epoch": 1.4153415595980154, + "ewc_loss": 0.06261876970529556, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029415645985864103, + "grad_norm": 7.255894660949707, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.842724621295929, + "num_tokens": 424607898.0, + "step": 11126 + }, + { + "epoch": 1.415468769876606, + "ewc_loss": 0.06323603540658951, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978876873385161, + "grad_norm": 7.362541198730469, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8663436770439148, + "num_tokens": 424652156.0, + "step": 11127 + }, + { + "epoch": 1.4155959801551965, + "ewc_loss": 0.06262801587581635, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029424886452034116, + "grad_norm": 7.3146138191223145, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8542557954788208, + "num_tokens": 424688735.0, + "step": 11128 + }, + { + "epoch": 1.415723190433787, + "ewc_loss": 0.0628339946269989, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002963086881209165, + "grad_norm": 7.305295944213867, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8646143674850464, + "num_tokens": 424732859.0, + "step": 11129 + }, + { + "epoch": 1.4158504007123776, + "ewc_loss": 0.0627075582742691, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029504435951821506, + "grad_norm": 7.305539608001709, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8557495474815369, + "num_tokens": 424769381.0, + "step": 11130 + }, + { + "epoch": 1.415977610990968, + "ewc_loss": 0.06282801926136017, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029624890885315835, + "grad_norm": 7.317021369934082, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8647266626358032, + "num_tokens": 424802362.0, + "step": 11131 + }, + { + "epoch": 1.4161048212695586, + "ewc_loss": 0.06284835934638977, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029645231552422047, + "grad_norm": 7.290480136871338, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8599027991294861, + "num_tokens": 424837897.0, + "step": 11132 + }, + { + "epoch": 1.4162320315481491, + "ewc_loss": 0.062879279255867, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002967615728266537, + "grad_norm": 7.28374719619751, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8742008209228516, + "num_tokens": 424878154.0, + "step": 11133 + }, + { + "epoch": 1.4163592418267397, + "ewc_loss": 0.06277485191822052, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002957172691822052, + "grad_norm": 7.274712562561035, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8687451481819153, + "num_tokens": 424915063.0, + "step": 11134 + }, + { + "epoch": 1.41648645210533, + "ewc_loss": 0.06291273236274719, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029709606315009296, + "grad_norm": 7.297811031341553, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8628135919570923, + "num_tokens": 424955159.0, + "step": 11135 + }, + { + "epoch": 1.4166136623839205, + "ewc_loss": 0.06289587169885635, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029692749376408756, + "grad_norm": 7.335232734680176, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8557372093200684, + "num_tokens": 424992573.0, + "step": 11136 + }, + { + "epoch": 1.416740872662511, + "ewc_loss": 0.06311564147472382, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029668377828784287, + "grad_norm": 7.338511943817139, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8677452206611633, + "num_tokens": 425030452.0, + "step": 11137 + }, + { + "epoch": 1.4168680829411016, + "ewc_loss": 0.06289882212877274, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029695694684050977, + "grad_norm": 7.307684898376465, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8763845562934875, + "num_tokens": 425067118.0, + "step": 11138 + }, + { + "epoch": 1.416995293219692, + "ewc_loss": 0.06310912221670151, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029661855660378933, + "grad_norm": 7.408502578735352, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.853519082069397, + "num_tokens": 425101684.0, + "step": 11139 + }, + { + "epoch": 1.4171225034982826, + "ewc_loss": 0.06280286610126495, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002959973644465208, + "grad_norm": 7.277112007141113, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8718819618225098, + "num_tokens": 425143127.0, + "step": 11140 + }, + { + "epoch": 1.4172497137768731, + "ewc_loss": 0.06301699578762054, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029813870787620544, + "grad_norm": 7.446938991546631, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8628222346305847, + "num_tokens": 425173912.0, + "step": 11141 + }, + { + "epoch": 1.4173769240554637, + "ewc_loss": 0.06268735229969025, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029484223341569304, + "grad_norm": 7.27456521987915, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8682746887207031, + "num_tokens": 425210146.0, + "step": 11142 + }, + { + "epoch": 1.4175041343340542, + "ewc_loss": 0.06297783553600311, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029774714494124055, + "grad_norm": 7.329061985015869, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8757058382034302, + "num_tokens": 425246082.0, + "step": 11143 + }, + { + "epoch": 1.4176313446126447, + "ewc_loss": 0.0629810094833374, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002953374059870839, + "grad_norm": 7.284614086151123, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8607068061828613, + "num_tokens": 425287470.0, + "step": 11144 + }, + { + "epoch": 1.4177585548912353, + "ewc_loss": 0.06313882023096085, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002969155611936003, + "grad_norm": 7.34812593460083, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8593577146530151, + "num_tokens": 425326553.0, + "step": 11145 + }, + { + "epoch": 1.4178857651698258, + "ewc_loss": 0.06311593949794769, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000296686717774719, + "grad_norm": 7.35488224029541, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8567607402801514, + "num_tokens": 425363105.0, + "step": 11146 + }, + { + "epoch": 1.4180129754484163, + "ewc_loss": 0.06295760720968246, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002951034111902118, + "grad_norm": 7.268989562988281, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8580300807952881, + "num_tokens": 425402513.0, + "step": 11147 + }, + { + "epoch": 1.4181401857270068, + "ewc_loss": 0.06322775036096573, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978048287332058, + "grad_norm": 7.354218482971191, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.863831639289856, + "num_tokens": 425440132.0, + "step": 11148 + }, + { + "epoch": 1.4182673960055974, + "ewc_loss": 0.06303329020738602, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002958602271974087, + "grad_norm": 7.265626907348633, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8618563413619995, + "num_tokens": 425481738.0, + "step": 11149 + }, + { + "epoch": 1.418394606284188, + "ewc_loss": 0.06349555402994156, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002980414719786495, + "grad_norm": 7.4974822998046875, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8614403009414673, + "num_tokens": 425516916.0, + "step": 11150 + }, + { + "epoch": 1.4185218165627782, + "ewc_loss": 0.06310686469078064, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002941545390058309, + "grad_norm": 7.249587535858154, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8627887964248657, + "num_tokens": 425555951.0, + "step": 11151 + }, + { + "epoch": 1.4186490268413687, + "ewc_loss": 0.06332501769065857, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002987775660585612, + "grad_norm": 7.319559574127197, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8634989261627197, + "num_tokens": 425595302.0, + "step": 11152 + }, + { + "epoch": 1.4187762371199593, + "ewc_loss": 0.06294508278369904, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002949782065115869, + "grad_norm": 7.308353900909424, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8670327663421631, + "num_tokens": 425623281.0, + "step": 11153 + }, + { + "epoch": 1.4189034473985498, + "ewc_loss": 0.06313510239124298, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002968783664982766, + "grad_norm": 7.369740009307861, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8727051019668579, + "num_tokens": 425659124.0, + "step": 11154 + }, + { + "epoch": 1.4190306576771403, + "ewc_loss": 0.06300872564315796, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029561459086835384, + "grad_norm": 7.333789348602295, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8499972820281982, + "num_tokens": 425695809.0, + "step": 11155 + }, + { + "epoch": 1.4191578679557308, + "ewc_loss": 0.06305176019668579, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029604494920931756, + "grad_norm": 7.356493949890137, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.853691577911377, + "num_tokens": 425731890.0, + "step": 11156 + }, + { + "epoch": 1.4192850782343214, + "ewc_loss": 0.06296880543231964, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029521537362597883, + "grad_norm": 7.312896728515625, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8717159628868103, + "num_tokens": 425770899.0, + "step": 11157 + }, + { + "epoch": 1.419412288512912, + "ewc_loss": 0.06299537420272827, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029548106249421835, + "grad_norm": 7.3174238204956055, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.874862551689148, + "num_tokens": 425807812.0, + "step": 11158 + }, + { + "epoch": 1.4195394987915024, + "ewc_loss": 0.06299136579036713, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029544095741584897, + "grad_norm": 7.298954486846924, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8586435317993164, + "num_tokens": 425848997.0, + "step": 11159 + }, + { + "epoch": 1.4196667090700927, + "ewc_loss": 0.06303048878908157, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002958322293125093, + "grad_norm": 7.327922821044922, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8728950023651123, + "num_tokens": 425887816.0, + "step": 11160 + }, + { + "epoch": 1.4197939193486833, + "ewc_loss": 0.0629749447107315, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002952767536044121, + "grad_norm": 7.318448066711426, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8541466593742371, + "num_tokens": 425925488.0, + "step": 11161 + }, + { + "epoch": 1.4199211296272738, + "ewc_loss": 0.06301361322402954, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002956634561996907, + "grad_norm": 7.247992515563965, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8702515959739685, + "num_tokens": 425965616.0, + "step": 11162 + }, + { + "epoch": 1.4200483399058643, + "ewc_loss": 0.06310765445232391, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002966038591694087, + "grad_norm": 7.3257222175598145, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8568540811538696, + "num_tokens": 426005308.0, + "step": 11163 + }, + { + "epoch": 1.4201755501844548, + "ewc_loss": 0.06304248422384262, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002959521661978215, + "grad_norm": 7.324591159820557, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8606273531913757, + "num_tokens": 426042339.0, + "step": 11164 + }, + { + "epoch": 1.4203027604630454, + "ewc_loss": 0.06307223439216614, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029624972376041114, + "grad_norm": 7.318440914154053, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8625932931900024, + "num_tokens": 426081925.0, + "step": 11165 + }, + { + "epoch": 1.420429970741636, + "ewc_loss": 0.06299922615289688, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002955196250695735, + "grad_norm": 7.2574028968811035, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8571406602859497, + "num_tokens": 426120918.0, + "step": 11166 + }, + { + "epoch": 1.4205571810202264, + "ewc_loss": 0.06309569627046585, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002964843297377229, + "grad_norm": 7.338063716888428, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.852652370929718, + "num_tokens": 426158711.0, + "step": 11167 + }, + { + "epoch": 1.420684391298817, + "ewc_loss": 0.06306028366088867, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029613019432872534, + "grad_norm": 9.731746673583984, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.858702540397644, + "num_tokens": 426197034.0, + "step": 11168 + }, + { + "epoch": 1.4208116015774075, + "ewc_loss": 0.06439145654439926, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003094419080298394, + "grad_norm": 7.318743705749512, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8560307025909424, + "num_tokens": 426239390.0, + "step": 11169 + }, + { + "epoch": 1.420938811855998, + "ewc_loss": 0.06419360637664795, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003074634005315602, + "grad_norm": 7.7054901123046875, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8520660996437073, + "num_tokens": 426285488.0, + "step": 11170 + }, + { + "epoch": 1.4210660221345885, + "ewc_loss": 0.0630406066775322, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002959333942271769, + "grad_norm": 7.244332790374756, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8507586717605591, + "num_tokens": 426323955.0, + "step": 11171 + }, + { + "epoch": 1.421193232413179, + "ewc_loss": 0.06430135667324066, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003085409407503903, + "grad_norm": 7.656270980834961, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8835819959640503, + "num_tokens": 426359460.0, + "step": 11172 + }, + { + "epoch": 1.4213204426917696, + "ewc_loss": 0.06308802217245102, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029640758293680847, + "grad_norm": 7.322477340698242, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8552470207214355, + "num_tokens": 426397104.0, + "step": 11173 + }, + { + "epoch": 1.4214476529703601, + "ewc_loss": 0.06378426402807236, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030336997588165104, + "grad_norm": 7.481097221374512, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8719559907913208, + "num_tokens": 426441349.0, + "step": 11174 + }, + { + "epoch": 1.4215748632489504, + "ewc_loss": 0.06314300745725632, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029695744160562754, + "grad_norm": 7.370035171508789, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8720799088478088, + "num_tokens": 426478410.0, + "step": 11175 + }, + { + "epoch": 1.421702073527541, + "ewc_loss": 0.06346655637025833, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030019288533367217, + "grad_norm": 7.4742279052734375, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8620051145553589, + "num_tokens": 426519874.0, + "step": 11176 + }, + { + "epoch": 1.4218292838061315, + "ewc_loss": 0.06335487961769104, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002966347092296928, + "grad_norm": 14.076513290405273, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8511925935745239, + "num_tokens": 426559970.0, + "step": 11177 + }, + { + "epoch": 1.421956494084722, + "ewc_loss": 0.07274002581834793, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00039292758447118104, + "grad_norm": 8.564026832580566, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8647011518478394, + "num_tokens": 426593897.0, + "step": 11178 + }, + { + "epoch": 1.4220837043633126, + "ewc_loss": 0.061464518308639526, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002801725349854678, + "grad_norm": 7.072617530822754, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.862835168838501, + "num_tokens": 426633712.0, + "step": 11179 + }, + { + "epoch": 1.422210914641903, + "ewc_loss": 0.06498706340789795, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00031539794872514904, + "grad_norm": 7.769362926483154, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8777150511741638, + "num_tokens": 426669085.0, + "step": 11180 + }, + { + "epoch": 1.4223381249204936, + "ewc_loss": 0.06341162323951721, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029964360874146223, + "grad_norm": 7.2715582847595215, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8772931098937988, + "num_tokens": 426709260.0, + "step": 11181 + }, + { + "epoch": 1.4224653351990841, + "ewc_loss": 0.06409650295972824, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030649235122837126, + "grad_norm": 7.585212230682373, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8521909117698669, + "num_tokens": 426747747.0, + "step": 11182 + }, + { + "epoch": 1.4225925454776747, + "ewc_loss": 0.06336909532546997, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029921834357082844, + "grad_norm": 7.311417579650879, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8504420518875122, + "num_tokens": 426788185.0, + "step": 11183 + }, + { + "epoch": 1.422719755756265, + "ewc_loss": 0.06390932947397232, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030462065478786826, + "grad_norm": 7.571687698364258, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8644371032714844, + "num_tokens": 426828663.0, + "step": 11184 + }, + { + "epoch": 1.4228469660348555, + "ewc_loss": 0.06345441937446594, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003000715805683285, + "grad_norm": 7.392497539520264, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8532427549362183, + "num_tokens": 426869205.0, + "step": 11185 + }, + { + "epoch": 1.422974176313446, + "ewc_loss": 0.06356945633888245, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003012219094671309, + "grad_norm": 7.477419853210449, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8782044649124146, + "num_tokens": 426900561.0, + "step": 11186 + }, + { + "epoch": 1.4231013865920366, + "ewc_loss": 0.0630040094256401, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029800881748087704, + "grad_norm": 7.350973606109619, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8782904148101807, + "num_tokens": 426940618.0, + "step": 11187 + }, + { + "epoch": 1.423228596870627, + "ewc_loss": 0.06343071162700653, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002998344716615975, + "grad_norm": 7.484537601470947, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8541892766952515, + "num_tokens": 426975509.0, + "step": 11188 + }, + { + "epoch": 1.4233558071492176, + "ewc_loss": 0.06288562715053558, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029682504828087986, + "grad_norm": 7.304165363311768, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8715919852256775, + "num_tokens": 427011647.0, + "step": 11189 + }, + { + "epoch": 1.4234830174278081, + "ewc_loss": 0.06353206932544708, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030084801255725324, + "grad_norm": 13.728241920471191, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8603911399841309, + "num_tokens": 427046631.0, + "step": 11190 + }, + { + "epoch": 1.4236102277063987, + "ewc_loss": 0.0735965371131897, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00040149272535927594, + "grad_norm": 8.665852546691895, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8602216839790344, + "num_tokens": 427081364.0, + "step": 11191 + }, + { + "epoch": 1.4237374379849892, + "ewc_loss": 0.06173699349164963, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002828972938004881, + "grad_norm": 7.061410427093506, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8652615547180176, + "num_tokens": 427118667.0, + "step": 11192 + }, + { + "epoch": 1.4238646482635797, + "ewc_loss": 0.06554263085126877, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00032095363712869585, + "grad_norm": 7.879552841186523, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8676770329475403, + "num_tokens": 427155017.0, + "step": 11193 + }, + { + "epoch": 1.4239918585421703, + "ewc_loss": 0.06381987780332565, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030372614855878055, + "grad_norm": 7.362781524658203, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8560344576835632, + "num_tokens": 427191741.0, + "step": 11194 + }, + { + "epoch": 1.4241190688207608, + "ewc_loss": 0.06428506970405579, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00031081942142918706, + "grad_norm": 7.718926906585693, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8668017983436584, + "num_tokens": 427226210.0, + "step": 11195 + }, + { + "epoch": 1.4242462790993513, + "ewc_loss": 0.06336446851491928, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00030161344329826534, + "grad_norm": 7.349049091339111, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.860529899597168, + "num_tokens": 427261699.0, + "step": 11196 + }, + { + "epoch": 1.4243734893779418, + "ewc_loss": 0.06387082487344742, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0003066769859287888, + "grad_norm": 7.588299751281738, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8540630340576172, + "num_tokens": 427297158.0, + "step": 11197 + }, + { + "epoch": 1.4245006996565324, + "ewc_loss": 0.06334659457206726, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00030143471667543054, + "grad_norm": 7.371488094329834, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8474293351173401, + "num_tokens": 427338234.0, + "step": 11198 + }, + { + "epoch": 1.424627909935123, + "ewc_loss": 0.06355854868888855, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0003035542613361031, + "grad_norm": 7.512182712554932, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8573526740074158, + "num_tokens": 427376727.0, + "step": 11199 + }, + { + "epoch": 1.4247551202137132, + "ewc_loss": 0.06316736340522766, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029964238638058305, + "grad_norm": 7.3350300788879395, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8756515979766846, + "num_tokens": 427417095.0, + "step": 11200 + }, + { + "epoch": 1.4248823304923037, + "ewc_loss": 0.06338411569595337, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0003018098941538483, + "grad_norm": 7.485397815704346, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8469588756561279, + "num_tokens": 427453243.0, + "step": 11201 + }, + { + "epoch": 1.4250095407708943, + "ewc_loss": 0.06303048133850098, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029827351681888103, + "grad_norm": 7.386750221252441, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8658499717712402, + "num_tokens": 427490961.0, + "step": 11202 + }, + { + "epoch": 1.4251367510494848, + "ewc_loss": 0.06350093334913254, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003005366597790271, + "grad_norm": 9.87087631225586, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8736321926116943, + "num_tokens": 427521487.0, + "step": 11203 + }, + { + "epoch": 1.4252639613280753, + "ewc_loss": 0.0650804191827774, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003163314831908792, + "grad_norm": 7.457651138305664, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8607351183891296, + "num_tokens": 427556607.0, + "step": 11204 + }, + { + "epoch": 1.4253911716066658, + "ewc_loss": 0.0642525851726532, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030805321875959635, + "grad_norm": 7.651515007019043, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8639649152755737, + "num_tokens": 427598940.0, + "step": 11205 + }, + { + "epoch": 1.4255183818852564, + "ewc_loss": 0.06340014189481735, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029952876502647996, + "grad_norm": 7.366942882537842, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8522047996520996, + "num_tokens": 427641337.0, + "step": 11206 + }, + { + "epoch": 1.425645592163847, + "ewc_loss": 0.06414498388767242, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030697719193995, + "grad_norm": 7.549829483032227, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.874970555305481, + "num_tokens": 427680235.0, + "step": 11207 + }, + { + "epoch": 1.4257728024424374, + "ewc_loss": 0.06334172934293747, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029894462204538286, + "grad_norm": 7.3538007736206055, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8643970489501953, + "num_tokens": 427719918.0, + "step": 11208 + }, + { + "epoch": 1.4259000127210277, + "ewc_loss": 0.06336739659309387, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00030164269264787436, + "grad_norm": 7.421750068664551, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.865809440612793, + "num_tokens": 427758042.0, + "step": 11209 + }, + { + "epoch": 1.4260272229996183, + "ewc_loss": 0.06326606869697571, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00030062944279052317, + "grad_norm": 7.40095329284668, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8647376298904419, + "num_tokens": 427794591.0, + "step": 11210 + }, + { + "epoch": 1.4261544332782088, + "ewc_loss": 0.06314339488744736, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002994026872329414, + "grad_norm": 7.439668655395508, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8700639009475708, + "num_tokens": 427828190.0, + "step": 11211 + }, + { + "epoch": 1.4262816435567993, + "ewc_loss": 0.063328817486763, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029881548834964633, + "grad_norm": 7.369169235229492, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8559266924858093, + "num_tokens": 427870363.0, + "step": 11212 + }, + { + "epoch": 1.4264088538353898, + "ewc_loss": 0.06339757144451141, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002995030954480171, + "grad_norm": 7.392339706420898, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8787502646446228, + "num_tokens": 427910037.0, + "step": 11213 + }, + { + "epoch": 1.4265360641139804, + "ewc_loss": 0.06305434554815292, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029851222643628716, + "grad_norm": 7.431246757507324, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8628121018409729, + "num_tokens": 427945782.0, + "step": 11214 + }, + { + "epoch": 1.426663274392571, + "ewc_loss": 0.06322789937257767, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978063130285591, + "grad_norm": 7.334232807159424, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8705108761787415, + "num_tokens": 427984635.0, + "step": 11215 + }, + { + "epoch": 1.4267904846711614, + "ewc_loss": 0.06316531449556351, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002996218972839415, + "grad_norm": 7.3469767570495605, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8781828284263611, + "num_tokens": 428024289.0, + "step": 11216 + }, + { + "epoch": 1.426917694949752, + "ewc_loss": 0.06300665438175201, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029803530196659267, + "grad_norm": 7.350829601287842, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8557295203208923, + "num_tokens": 428067119.0, + "step": 11217 + }, + { + "epoch": 1.4270449052283425, + "ewc_loss": 0.063130222260952, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029927099240012467, + "grad_norm": 7.382225036621094, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8614686131477356, + "num_tokens": 428108150.0, + "step": 11218 + }, + { + "epoch": 1.427172115506933, + "ewc_loss": 0.06297514587640762, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002977201947942376, + "grad_norm": 7.295146942138672, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8740628361701965, + "num_tokens": 428150839.0, + "step": 11219 + }, + { + "epoch": 1.4272993257855235, + "ewc_loss": 0.06342838704586029, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000299811246804893, + "grad_norm": 7.387234210968018, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.851284384727478, + "num_tokens": 428194325.0, + "step": 11220 + }, + { + "epoch": 1.427426536064114, + "ewc_loss": 0.06295669078826904, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002975356765091419, + "grad_norm": 7.412676811218262, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8615684509277344, + "num_tokens": 428230879.0, + "step": 11221 + }, + { + "epoch": 1.4275537463427046, + "ewc_loss": 0.06301053613424301, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002980740973725915, + "grad_norm": 7.379035949707031, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8762706518173218, + "num_tokens": 428265795.0, + "step": 11222 + }, + { + "epoch": 1.4276809566212951, + "ewc_loss": 0.06297396123409271, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002977083786390722, + "grad_norm": 7.350561618804932, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8690125942230225, + "num_tokens": 428300413.0, + "step": 11223 + }, + { + "epoch": 1.4278081668998854, + "ewc_loss": 0.06298694759607315, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002978382399305701, + "grad_norm": 7.343611240386963, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.884600043296814, + "num_tokens": 428339293.0, + "step": 11224 + }, + { + "epoch": 1.427935377178476, + "ewc_loss": 0.0630083829164505, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029805258964188397, + "grad_norm": 7.385507583618164, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8765153884887695, + "num_tokens": 428374158.0, + "step": 11225 + }, + { + "epoch": 1.4280625874570665, + "ewc_loss": 0.06296122074127197, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002975809620693326, + "grad_norm": 7.332232475280762, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8646115064620972, + "num_tokens": 428411715.0, + "step": 11226 + }, + { + "epoch": 1.428189797735657, + "ewc_loss": 0.06318758428096771, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000299844570690766, + "grad_norm": 7.36972188949585, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8515303134918213, + "num_tokens": 428450000.0, + "step": 11227 + }, + { + "epoch": 1.4283170080142475, + "ewc_loss": 0.06303174048662186, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002982861769851297, + "grad_norm": 7.354360580444336, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8703851103782654, + "num_tokens": 428484546.0, + "step": 11228 + }, + { + "epoch": 1.428444218292838, + "ewc_loss": 0.06306909769773483, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029865975375287235, + "grad_norm": 7.357091903686523, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8644766807556152, + "num_tokens": 428527519.0, + "step": 11229 + }, + { + "epoch": 1.4285714285714286, + "ewc_loss": 0.06312378495931625, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002992066147271544, + "grad_norm": 7.3357157707214355, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8594704270362854, + "num_tokens": 428568744.0, + "step": 11230 + }, + { + "epoch": 1.4286986388500191, + "ewc_loss": 0.06309744715690613, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002989431959576905, + "grad_norm": 7.331860542297363, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8596669435501099, + "num_tokens": 428613050.0, + "step": 11231 + }, + { + "epoch": 1.4288258491286097, + "ewc_loss": 0.06313921511173248, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002993608941324055, + "grad_norm": 7.365705966949463, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8611524105072021, + "num_tokens": 428647334.0, + "step": 11232 + }, + { + "epoch": 1.4289530594072, + "ewc_loss": 0.06310789287090302, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029904767870903015, + "grad_norm": 7.332043647766113, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8701597452163696, + "num_tokens": 428681015.0, + "step": 11233 + }, + { + "epoch": 1.4290802696857905, + "ewc_loss": 0.06313659250736237, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029933464247733355, + "grad_norm": 7.405879497528076, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8568728566169739, + "num_tokens": 428718903.0, + "step": 11234 + }, + { + "epoch": 1.429207479964381, + "ewc_loss": 0.0631040409207344, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029900914523750544, + "grad_norm": 7.322640895843506, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8732227683067322, + "num_tokens": 428756038.0, + "step": 11235 + }, + { + "epoch": 1.4293346902429716, + "ewc_loss": 0.06315211951732635, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029948996962048113, + "grad_norm": 7.342300891876221, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8806920051574707, + "num_tokens": 428793908.0, + "step": 11236 + }, + { + "epoch": 1.429461900521562, + "ewc_loss": 0.06310103088617325, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029897908098064363, + "grad_norm": 7.350865840911865, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8568406701087952, + "num_tokens": 428830924.0, + "step": 11237 + }, + { + "epoch": 1.4295891108001526, + "ewc_loss": 0.06313473731279373, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002993161033373326, + "grad_norm": 7.363962650299072, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.860765278339386, + "num_tokens": 428869083.0, + "step": 11238 + }, + { + "epoch": 1.4297163210787431, + "ewc_loss": 0.06308744847774506, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029884325340390205, + "grad_norm": 7.417577266693115, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8392380475997925, + "num_tokens": 428903970.0, + "step": 11239 + }, + { + "epoch": 1.4298435313573337, + "ewc_loss": 0.0630582869052887, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029855166212655604, + "grad_norm": 7.37794303894043, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8664363026618958, + "num_tokens": 428940710.0, + "step": 11240 + }, + { + "epoch": 1.4299707416359242, + "ewc_loss": 0.0630389153957367, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000298357947031036, + "grad_norm": 7.333183765411377, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8667464256286621, + "num_tokens": 428981160.0, + "step": 11241 + }, + { + "epoch": 1.4300979519145147, + "ewc_loss": 0.06303226947784424, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029829147388227284, + "grad_norm": 7.337625980377197, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8650625348091125, + "num_tokens": 429022464.0, + "step": 11242 + }, + { + "epoch": 1.4302251621931052, + "ewc_loss": 0.0631314367055893, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029928309959359467, + "grad_norm": 7.43140983581543, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8643506765365601, + "num_tokens": 429060964.0, + "step": 11243 + }, + { + "epoch": 1.4303523724716958, + "ewc_loss": 0.0632282942533493, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978102711495012, + "grad_norm": 7.315865993499756, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.864843487739563, + "num_tokens": 429101389.0, + "step": 11244 + }, + { + "epoch": 1.4304795827502863, + "ewc_loss": 0.06325024366378784, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.000300471147056669, + "grad_norm": 7.41532564163208, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8600489497184753, + "num_tokens": 429135547.0, + "step": 11245 + }, + { + "epoch": 1.4306067930288768, + "ewc_loss": 0.06301397830247879, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002981085272040218, + "grad_norm": 7.3491621017456055, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8662450909614563, + "num_tokens": 429177504.0, + "step": 11246 + }, + { + "epoch": 1.4307340033074674, + "ewc_loss": 0.06314300000667572, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002993987291119993, + "grad_norm": 7.361008167266846, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.868025541305542, + "num_tokens": 429217923.0, + "step": 11247 + }, + { + "epoch": 1.430861213586058, + "ewc_loss": 0.06315314769744873, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.0002995002141688019, + "grad_norm": 7.400251388549805, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8637592196464539, + "num_tokens": 429251514.0, + "step": 11248 + }, + { + "epoch": 1.4309884238646482, + "ewc_loss": 0.06299081444740295, + "ewc_loss_diag": 3.314018249511719e-05, + "ewc_loss_parallel": 0.00029787688981741667, + "grad_norm": 7.332716464996338, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.848120927810669, + "num_tokens": 429291747.0, + "step": 11249 + }, + { + "epoch": 1.4311156341432387, + "ewc_loss": 0.06356725096702576, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030119981965981424, + "grad_norm": 7.428622722625732, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8608368039131165, + "num_tokens": 429333480.0, + "step": 11250 + }, + { + "epoch": 1.4312428444218293, + "ewc_loss": 0.06329157948493958, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002984431048389524, + "grad_norm": 7.348002910614014, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8750550746917725, + "num_tokens": 429366482.0, + "step": 11251 + }, + { + "epoch": 1.4313700547004198, + "ewc_loss": 0.06346084177494049, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030013572541065514, + "grad_norm": 7.407711505889893, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8567216396331787, + "num_tokens": 429400704.0, + "step": 11252 + }, + { + "epoch": 1.4314972649790103, + "ewc_loss": 0.0632343664765358, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978710108436644, + "grad_norm": 7.513418674468994, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8513343334197998, + "num_tokens": 429437463.0, + "step": 11253 + }, + { + "epoch": 1.4316244752576008, + "ewc_loss": 0.063239686191082, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029792418354190886, + "grad_norm": 7.334331512451172, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.85237717628479, + "num_tokens": 429474496.0, + "step": 11254 + }, + { + "epoch": 1.4317516855361914, + "ewc_loss": 0.0634082704782486, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002996100520249456, + "grad_norm": 7.393250942230225, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8495640754699707, + "num_tokens": 429514641.0, + "step": 11255 + }, + { + "epoch": 1.431878895814782, + "ewc_loss": 0.06315860897302628, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029711340903304517, + "grad_norm": 7.341746807098389, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8634449243545532, + "num_tokens": 429554054.0, + "step": 11256 + }, + { + "epoch": 1.4320061060933724, + "ewc_loss": 0.06348199397325516, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030034728115424514, + "grad_norm": 7.366947650909424, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8611255884170532, + "num_tokens": 429594004.0, + "step": 11257 + }, + { + "epoch": 1.4321333163719627, + "ewc_loss": 0.06331703066825867, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029869767604395747, + "grad_norm": 7.352834701538086, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.870095431804657, + "num_tokens": 429633019.0, + "step": 11258 + }, + { + "epoch": 1.4322605266505533, + "ewc_loss": 0.063505619764328, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003005835460498929, + "grad_norm": 7.455150604248047, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8637667894363403, + "num_tokens": 429666869.0, + "step": 11259 + }, + { + "epoch": 1.4323877369291438, + "ewc_loss": 0.06334295123815536, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002989568456541747, + "grad_norm": 7.3703789710998535, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8737340569496155, + "num_tokens": 429700049.0, + "step": 11260 + }, + { + "epoch": 1.4325149472077343, + "ewc_loss": 0.06346626579761505, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003001899749506265, + "grad_norm": 7.422879219055176, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8613634705543518, + "num_tokens": 429737517.0, + "step": 11261 + }, + { + "epoch": 1.4326421574863248, + "ewc_loss": 0.06331545114517212, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029868187266401947, + "grad_norm": 7.346949577331543, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8765584230422974, + "num_tokens": 429778697.0, + "step": 11262 + }, + { + "epoch": 1.4327693677649154, + "ewc_loss": 0.06347055733203888, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003002328739967197, + "grad_norm": 7.429213047027588, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8643803000450134, + "num_tokens": 429811883.0, + "step": 11263 + }, + { + "epoch": 1.432896578043506, + "ewc_loss": 0.0632319301366806, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978466800414026, + "grad_norm": 7.428135395050049, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8689060211181641, + "num_tokens": 429851748.0, + "step": 11264 + }, + { + "epoch": 1.4330237883220964, + "ewc_loss": 0.06336383521556854, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002991656947415322, + "grad_norm": 7.475667476654053, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8532147407531738, + "num_tokens": 429891033.0, + "step": 11265 + }, + { + "epoch": 1.433150998600687, + "ewc_loss": 0.06316211074590683, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002971484500449151, + "grad_norm": 7.347568511962891, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8669594526290894, + "num_tokens": 429935980.0, + "step": 11266 + }, + { + "epoch": 1.4332782088792775, + "ewc_loss": 0.06331087648868561, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002986361214425415, + "grad_norm": 7.432052135467529, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8517298698425293, + "num_tokens": 429970874.0, + "step": 11267 + }, + { + "epoch": 1.433405419157868, + "ewc_loss": 0.06313376128673553, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029686494963243604, + "grad_norm": 7.284501552581787, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8490787744522095, + "num_tokens": 430017909.0, + "step": 11268 + }, + { + "epoch": 1.4335326294364585, + "ewc_loss": 0.06338734924793243, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002994007954839617, + "grad_norm": 7.471446990966797, + "learning_rate": 1e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8375536203384399, + "num_tokens": 430050641.0, + "step": 11269 + }, + { + "epoch": 1.433659839715049, + "ewc_loss": 0.06318596005439758, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000297386955935508, + "grad_norm": 7.333244323730469, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8679786920547485, + "num_tokens": 430082985.0, + "step": 11270 + }, + { + "epoch": 1.4337870499936396, + "ewc_loss": 0.06343723088502884, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002998996351379901, + "grad_norm": 7.436256408691406, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8624767065048218, + "num_tokens": 430122340.0, + "step": 11271 + }, + { + "epoch": 1.4339142602722301, + "ewc_loss": 0.0630909726023674, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029643706511706114, + "grad_norm": 7.293823719024658, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8497399091720581, + "num_tokens": 430162140.0, + "step": 11272 + }, + { + "epoch": 1.4340414705508204, + "ewc_loss": 0.06347382813692093, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030026561580598354, + "grad_norm": 7.379740238189697, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8754464387893677, + "num_tokens": 430201672.0, + "step": 11273 + }, + { + "epoch": 1.434168680829411, + "ewc_loss": 0.06320898979902267, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029761725454591215, + "grad_norm": 7.305029392242432, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8523575067520142, + "num_tokens": 430239760.0, + "step": 11274 + }, + { + "epoch": 1.4342958911080015, + "ewc_loss": 0.06345893442630768, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000300116662401706, + "grad_norm": 7.426207542419434, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8446943759918213, + "num_tokens": 430280527.0, + "step": 11275 + }, + { + "epoch": 1.434423101386592, + "ewc_loss": 0.06336275488138199, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002991548681166023, + "grad_norm": 7.38095235824585, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8607603907585144, + "num_tokens": 430312788.0, + "step": 11276 + }, + { + "epoch": 1.4345503116651825, + "ewc_loss": 0.06364685297012329, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029955446370877326, + "grad_norm": 7.354809761047363, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8716915249824524, + "num_tokens": 430348044.0, + "step": 11277 + }, + { + "epoch": 1.434677521943773, + "ewc_loss": 0.06330303847789764, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029855777393095195, + "grad_norm": 7.321140289306641, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8786773085594177, + "num_tokens": 430385676.0, + "step": 11278 + }, + { + "epoch": 1.4348047322223636, + "ewc_loss": 0.06337113678455353, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002992386871483177, + "grad_norm": 7.4118266105651855, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8593900203704834, + "num_tokens": 430416976.0, + "step": 11279 + }, + { + "epoch": 1.4349319425009541, + "ewc_loss": 0.06330940127372742, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029862139490433037, + "grad_norm": 7.277617454528809, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8742671608924866, + "num_tokens": 430460002.0, + "step": 11280 + }, + { + "epoch": 1.4350591527795447, + "ewc_loss": 0.06353439390659332, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003008713247254491, + "grad_norm": 7.361852645874023, + "learning_rate": 1e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8415235280990601, + "num_tokens": 430503172.0, + "step": 11281 + }, + { + "epoch": 1.435186363058135, + "ewc_loss": 0.0633457750082016, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029898504726588726, + "grad_norm": 7.342654228210449, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8508110046386719, + "num_tokens": 430541221.0, + "step": 11282 + }, + { + "epoch": 1.4353135733367255, + "ewc_loss": 0.06346432864665985, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00030017062090337276, + "grad_norm": 7.362565994262695, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8557776808738708, + "num_tokens": 430579527.0, + "step": 11283 + }, + { + "epoch": 1.435440783615316, + "ewc_loss": 0.06340111047029495, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029953845660202205, + "grad_norm": 7.321228504180908, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8655668497085571, + "num_tokens": 430617263.0, + "step": 11284 + }, + { + "epoch": 1.4355679938939065, + "ewc_loss": 0.06343185901641846, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002998459676746279, + "grad_norm": 7.3904523849487305, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8752970695495605, + "num_tokens": 430649961.0, + "step": 11285 + }, + { + "epoch": 1.435695204172497, + "ewc_loss": 0.0632149949669838, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002976772957481444, + "grad_norm": 7.369863510131836, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8567116856575012, + "num_tokens": 430682599.0, + "step": 11286 + }, + { + "epoch": 1.4358224144510876, + "ewc_loss": 0.06330392509698868, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029856659239158034, + "grad_norm": 7.381001949310303, + "learning_rate": 1e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8351551294326782, + "num_tokens": 430721014.0, + "step": 11287 + }, + { + "epoch": 1.4359496247296781, + "ewc_loss": 0.06345243752002716, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029761026962660253, + "grad_norm": 7.350331783294678, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8620391488075256, + "num_tokens": 430757014.0, + "step": 11288 + }, + { + "epoch": 1.4360768350082687, + "ewc_loss": 0.06347748637199402, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002978607953991741, + "grad_norm": 7.326369762420654, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.864366888999939, + "num_tokens": 430796169.0, + "step": 11289 + }, + { + "epoch": 1.4362040452868592, + "ewc_loss": 0.06327535212039948, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002982808218803257, + "grad_norm": 7.3456220626831055, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8616845011711121, + "num_tokens": 430832726.0, + "step": 11290 + }, + { + "epoch": 1.4363312555654497, + "ewc_loss": 0.06327924132347107, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029831973370164633, + "grad_norm": 7.3831658363342285, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8522728085517883, + "num_tokens": 430866343.0, + "step": 11291 + }, + { + "epoch": 1.4364584658440402, + "ewc_loss": 0.06327314674854279, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002982588193845004, + "grad_norm": 7.313344955444336, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8677946329116821, + "num_tokens": 430901622.0, + "step": 11292 + }, + { + "epoch": 1.4365856761226308, + "ewc_loss": 0.06335686147212982, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002990959328599274, + "grad_norm": 7.381965637207031, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8759359121322632, + "num_tokens": 430937499.0, + "step": 11293 + }, + { + "epoch": 1.4367128864012213, + "ewc_loss": 0.06320080906152725, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002975354145746678, + "grad_norm": 7.354186534881592, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8505692481994629, + "num_tokens": 430974048.0, + "step": 11294 + }, + { + "epoch": 1.4368400966798118, + "ewc_loss": 0.06330515444278717, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002985788742080331, + "grad_norm": 7.331901550292969, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8708573579788208, + "num_tokens": 431010927.0, + "step": 11295 + }, + { + "epoch": 1.4369673069584024, + "ewc_loss": 0.06330367922782898, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000298564147669822, + "grad_norm": 7.396032810211182, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8471870422363281, + "num_tokens": 431051731.0, + "step": 11296 + }, + { + "epoch": 1.4370945172369929, + "ewc_loss": 0.06332612782716751, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029634719248861074, + "grad_norm": 7.305357456207275, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8786063194274902, + "num_tokens": 431087876.0, + "step": 11297 + }, + { + "epoch": 1.4372217275155832, + "ewc_loss": 0.06334002315998077, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002989275672007352, + "grad_norm": 7.359265327453613, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8563857078552246, + "num_tokens": 431126217.0, + "step": 11298 + }, + { + "epoch": 1.4373489377941737, + "ewc_loss": 0.06311716139316559, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002966989704873413, + "grad_norm": 7.318666934967041, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.863792359828949, + "num_tokens": 431168049.0, + "step": 11299 + }, + { + "epoch": 1.4374761480727642, + "ewc_loss": 0.06332191824913025, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029874651227146387, + "grad_norm": 7.37322473526001, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8599239587783813, + "num_tokens": 431211143.0, + "step": 11300 + }, + { + "epoch": 1.4376033583513548, + "ewc_loss": 0.06321370601654053, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029766440275125206, + "grad_norm": 7.296806335449219, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8699467778205872, + "num_tokens": 431255747.0, + "step": 11301 + }, + { + "epoch": 1.4377305686299453, + "ewc_loss": 0.063570037484169, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029878635541535914, + "grad_norm": 7.340267658233643, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8661473989486694, + "num_tokens": 431296618.0, + "step": 11302 + }, + { + "epoch": 1.4378577789085358, + "ewc_loss": 0.06318605691194534, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029738788725808263, + "grad_norm": 7.305529594421387, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8639464378356934, + "num_tokens": 431336493.0, + "step": 11303 + }, + { + "epoch": 1.4379849891871264, + "ewc_loss": 0.06329169869422913, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029844435630366206, + "grad_norm": 7.377918720245361, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8596200942993164, + "num_tokens": 431376200.0, + "step": 11304 + }, + { + "epoch": 1.438112199465717, + "ewc_loss": 0.06328372657299042, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029836464091204107, + "grad_norm": 7.343578815460205, + "learning_rate": 1e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8425683379173279, + "num_tokens": 431414015.0, + "step": 11305 + }, + { + "epoch": 1.4382394097443074, + "ewc_loss": 0.06337879598140717, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002993153175339103, + "grad_norm": 7.420651912689209, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8574403524398804, + "num_tokens": 431453498.0, + "step": 11306 + }, + { + "epoch": 1.4383666200228977, + "ewc_loss": 0.06320269405841827, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002975542447529733, + "grad_norm": 7.3186163902282715, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8631525039672852, + "num_tokens": 431491126.0, + "step": 11307 + }, + { + "epoch": 1.4384938303014883, + "ewc_loss": 0.06341800093650818, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000299707317026332, + "grad_norm": 7.395938396453857, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8856040835380554, + "num_tokens": 431528549.0, + "step": 11308 + }, + { + "epoch": 1.4386210405800788, + "ewc_loss": 0.06323625147342682, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978898410219699, + "grad_norm": 7.379344940185547, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8694976568222046, + "num_tokens": 431569435.0, + "step": 11309 + }, + { + "epoch": 1.4387482508586693, + "ewc_loss": 0.06322811543941498, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978084667120129, + "grad_norm": 7.344126224517822, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8681686520576477, + "num_tokens": 431606125.0, + "step": 11310 + }, + { + "epoch": 1.4388754611372598, + "ewc_loss": 0.06328215450048447, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002983488666359335, + "grad_norm": 7.354307651519775, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8581493496894836, + "num_tokens": 431643517.0, + "step": 11311 + }, + { + "epoch": 1.4390026714158504, + "ewc_loss": 0.0632360577583313, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978879492729902, + "grad_norm": 7.336221218109131, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8719566464424133, + "num_tokens": 431682058.0, + "step": 11312 + }, + { + "epoch": 1.439129881694441, + "ewc_loss": 0.06334373354911804, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029896467458456755, + "grad_norm": 7.356636047363281, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8750982284545898, + "num_tokens": 431721241.0, + "step": 11313 + }, + { + "epoch": 1.4392570919730314, + "ewc_loss": 0.06328225880861282, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002983499434776604, + "grad_norm": 7.321091175079346, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8752706050872803, + "num_tokens": 431757809.0, + "step": 11314 + }, + { + "epoch": 1.439384302251622, + "ewc_loss": 0.06329222023487091, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002984495076816529, + "grad_norm": 7.354746341705322, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8513978123664856, + "num_tokens": 431795931.0, + "step": 11315 + }, + { + "epoch": 1.4395115125302125, + "ewc_loss": 0.06319889426231384, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029751629335805774, + "grad_norm": 7.341338157653809, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8729519248008728, + "num_tokens": 431830517.0, + "step": 11316 + }, + { + "epoch": 1.439638722808803, + "ewc_loss": 0.0633326917886734, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029885428375564516, + "grad_norm": 7.401538372039795, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8649518489837646, + "num_tokens": 431866047.0, + "step": 11317 + }, + { + "epoch": 1.4397659330873935, + "ewc_loss": 0.06317874044179916, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029731474933214486, + "grad_norm": 7.37677526473999, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8512750267982483, + "num_tokens": 431901602.0, + "step": 11318 + }, + { + "epoch": 1.439893143365984, + "ewc_loss": 0.06323250383138657, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002978523843921721, + "grad_norm": 7.361490249633789, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8587663173675537, + "num_tokens": 431934989.0, + "step": 11319 + }, + { + "epoch": 1.4400203536445746, + "ewc_loss": 0.06329254806041718, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029845282551832497, + "grad_norm": 7.368785381317139, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.867949366569519, + "num_tokens": 431968664.0, + "step": 11320 + }, + { + "epoch": 1.4401475639231651, + "ewc_loss": 0.06327390670776367, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029826638638041914, + "grad_norm": 7.3510541915893555, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8742741346359253, + "num_tokens": 432008025.0, + "step": 11321 + }, + { + "epoch": 1.4402747742017554, + "ewc_loss": 0.06332209706306458, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029874834581278265, + "grad_norm": 7.375000476837158, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8663383722305298, + "num_tokens": 432043526.0, + "step": 11322 + }, + { + "epoch": 1.440401984480346, + "ewc_loss": 0.06333421170711517, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002988694468513131, + "grad_norm": 7.362453937530518, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8633209466934204, + "num_tokens": 432085175.0, + "step": 11323 + }, + { + "epoch": 1.4405291947589365, + "ewc_loss": 0.06334865838289261, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002990139473695308, + "grad_norm": 7.409943580627441, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8529844284057617, + "num_tokens": 432122379.0, + "step": 11324 + }, + { + "epoch": 1.440656405037527, + "ewc_loss": 0.06313058733940125, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029683325556106865, + "grad_norm": 7.342706680297852, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8656976222991943, + "num_tokens": 432157690.0, + "step": 11325 + }, + { + "epoch": 1.4407836153161175, + "ewc_loss": 0.06327345967292786, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002982619625981897, + "grad_norm": 7.424238204956055, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8707910776138306, + "num_tokens": 432188885.0, + "step": 11326 + }, + { + "epoch": 1.440910825594708, + "ewc_loss": 0.06304729729890823, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029600030393339694, + "grad_norm": 7.30331563949585, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.864181637763977, + "num_tokens": 432228804.0, + "step": 11327 + }, + { + "epoch": 1.4410380358732986, + "ewc_loss": 0.06332115828990936, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029873891617171466, + "grad_norm": 7.399521827697754, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8457505702972412, + "num_tokens": 432264150.0, + "step": 11328 + }, + { + "epoch": 1.4411652461518891, + "ewc_loss": 0.06313340365886688, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002968613407574594, + "grad_norm": 7.299297332763672, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8567690849304199, + "num_tokens": 432306759.0, + "step": 11329 + }, + { + "epoch": 1.4412924564304797, + "ewc_loss": 0.0633011907339096, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029853926389478147, + "grad_norm": 7.38874626159668, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8729757070541382, + "num_tokens": 432346928.0, + "step": 11330 + }, + { + "epoch": 1.44141966670907, + "ewc_loss": 0.06311953067779541, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002967226319015026, + "grad_norm": 7.3349504470825195, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8705055713653564, + "num_tokens": 432386630.0, + "step": 11331 + }, + { + "epoch": 1.4415468769876605, + "ewc_loss": 0.06333652138710022, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029889255529269576, + "grad_norm": 7.410254001617432, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8506426215171814, + "num_tokens": 432427331.0, + "step": 11332 + }, + { + "epoch": 1.441674087266251, + "ewc_loss": 0.06298162788152695, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002953436051029712, + "grad_norm": 7.299384593963623, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8539798259735107, + "num_tokens": 432462901.0, + "step": 11333 + }, + { + "epoch": 1.4418012975448415, + "ewc_loss": 0.06335844099521637, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002991117653436959, + "grad_norm": 7.401207447052002, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.860458254814148, + "num_tokens": 432494232.0, + "step": 11334 + }, + { + "epoch": 1.441928507823432, + "ewc_loss": 0.06310710310935974, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002965983294416219, + "grad_norm": 7.512407302856445, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.87521892786026, + "num_tokens": 432534276.0, + "step": 11335 + }, + { + "epoch": 1.4420557181020226, + "ewc_loss": 0.06304194033145905, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029594675288535655, + "grad_norm": 7.278850555419922, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8544663786888123, + "num_tokens": 432577337.0, + "step": 11336 + }, + { + "epoch": 1.4421829283806131, + "ewc_loss": 0.06331896781921387, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002987170300912112, + "grad_norm": 7.4037885665893555, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8553979396820068, + "num_tokens": 432614290.0, + "step": 11337 + }, + { + "epoch": 1.4423101386592037, + "ewc_loss": 0.06309296190738678, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029645697213709354, + "grad_norm": 7.331790447235107, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8646008968353271, + "num_tokens": 432648558.0, + "step": 11338 + }, + { + "epoch": 1.4424373489377942, + "ewc_loss": 0.06325128674507141, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002980401914101094, + "grad_norm": 7.399389743804932, + "learning_rate": 1e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8360559940338135, + "num_tokens": 432684243.0, + "step": 11339 + }, + { + "epoch": 1.4425645592163847, + "ewc_loss": 0.06310249865055084, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029655228718183935, + "grad_norm": 7.339346408843994, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8621256947517395, + "num_tokens": 432719355.0, + "step": 11340 + }, + { + "epoch": 1.4426917694949752, + "ewc_loss": 0.0632597953081131, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002981252910103649, + "grad_norm": 7.4221110343933105, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8638318777084351, + "num_tokens": 432758675.0, + "step": 11341 + }, + { + "epoch": 1.4428189797735658, + "ewc_loss": 0.06301388144493103, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029566619195975363, + "grad_norm": 7.296646595001221, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8596717715263367, + "num_tokens": 432801677.0, + "step": 11342 + }, + { + "epoch": 1.4429461900521563, + "ewc_loss": 0.06328696012496948, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002983969752676785, + "grad_norm": 7.412981986999512, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.861539900302887, + "num_tokens": 432836299.0, + "step": 11343 + }, + { + "epoch": 1.4430734003307468, + "ewc_loss": 0.06308197975158691, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002963471633847803, + "grad_norm": 7.381521224975586, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8610621094703674, + "num_tokens": 432868838.0, + "step": 11344 + }, + { + "epoch": 1.4432006106093374, + "ewc_loss": 0.06323771178722382, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029790442204102874, + "grad_norm": 7.342433452606201, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8647705912590027, + "num_tokens": 432904895.0, + "step": 11345 + }, + { + "epoch": 1.4433278208879279, + "ewc_loss": 0.06319194287061691, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002974467643070966, + "grad_norm": 7.383245944976807, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8575045466423035, + "num_tokens": 432944785.0, + "step": 11346 + }, + { + "epoch": 1.4434550311665182, + "ewc_loss": 0.06310395151376724, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029656686820089817, + "grad_norm": 7.428967475891113, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8519498109817505, + "num_tokens": 432975645.0, + "step": 11347 + }, + { + "epoch": 1.4435822414451087, + "ewc_loss": 0.06310012936592102, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.000296528625767678, + "grad_norm": 7.342447280883789, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8571693897247314, + "num_tokens": 433016887.0, + "step": 11348 + }, + { + "epoch": 1.4437094517236992, + "ewc_loss": 0.06344062834978104, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029749219538643956, + "grad_norm": 7.360086441040039, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8643213510513306, + "num_tokens": 433053666.0, + "step": 11349 + }, + { + "epoch": 1.4438366620022898, + "ewc_loss": 0.06334029138088226, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002964888117276132, + "grad_norm": 7.2727837562561035, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8580853939056396, + "num_tokens": 433101131.0, + "step": 11350 + }, + { + "epoch": 1.4439638722808803, + "ewc_loss": 0.06351236999034882, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002982096339110285, + "grad_norm": 7.447140216827393, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8692669868469238, + "num_tokens": 433132535.0, + "step": 11351 + }, + { + "epoch": 1.4440910825594708, + "ewc_loss": 0.06331175565719604, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029620344866998494, + "grad_norm": 7.345449924468994, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8477152585983276, + "num_tokens": 433168229.0, + "step": 11352 + }, + { + "epoch": 1.4442182928380614, + "ewc_loss": 0.0635073333978653, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029815928428433836, + "grad_norm": 7.389773368835449, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8664058446884155, + "num_tokens": 433206094.0, + "step": 11353 + }, + { + "epoch": 1.4443455031166519, + "ewc_loss": 0.06327670067548752, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002958529512397945, + "grad_norm": 7.374602794647217, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8500964641571045, + "num_tokens": 433241038.0, + "step": 11354 + }, + { + "epoch": 1.4444727133952424, + "ewc_loss": 0.06333132088184357, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029884057585150003, + "grad_norm": 7.458046913146973, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8544793128967285, + "num_tokens": 433274674.0, + "step": 11355 + }, + { + "epoch": 1.4445999236738327, + "ewc_loss": 0.06301870942115784, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029571447521448135, + "grad_norm": 7.327137470245361, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8662523031234741, + "num_tokens": 433314511.0, + "step": 11356 + }, + { + "epoch": 1.4447271339524232, + "ewc_loss": 0.06330349296331406, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029856228502467275, + "grad_norm": 7.43813943862915, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8637198805809021, + "num_tokens": 433349338.0, + "step": 11357 + }, + { + "epoch": 1.4448543442310138, + "ewc_loss": 0.06294143199920654, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002949416812043637, + "grad_norm": 7.325934410095215, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8718075752258301, + "num_tokens": 433382609.0, + "step": 11358 + }, + { + "epoch": 1.4449815545096043, + "ewc_loss": 0.06344306468963623, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029751655529253185, + "grad_norm": 7.440106391906738, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8775321245193481, + "num_tokens": 433411553.0, + "step": 11359 + }, + { + "epoch": 1.4451087647881948, + "ewc_loss": 0.06328798085451126, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029596572858281434, + "grad_norm": 7.352441787719727, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8591611385345459, + "num_tokens": 433450571.0, + "step": 11360 + }, + { + "epoch": 1.4452359750667854, + "ewc_loss": 0.06347283720970154, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029781434568576515, + "grad_norm": 7.39133358001709, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8578123450279236, + "num_tokens": 433486994.0, + "step": 11361 + }, + { + "epoch": 1.445363185345376, + "ewc_loss": 0.06335552036762238, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002966411120723933, + "grad_norm": 7.353518962860107, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8743900060653687, + "num_tokens": 433521407.0, + "step": 11362 + }, + { + "epoch": 1.4454903956239664, + "ewc_loss": 0.0633767768740654, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029685371555387974, + "grad_norm": 7.357753276824951, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8747091293334961, + "num_tokens": 433561338.0, + "step": 11363 + }, + { + "epoch": 1.445617605902557, + "ewc_loss": 0.06343461573123932, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002974320959765464, + "grad_norm": 7.360781669616699, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8702478408813477, + "num_tokens": 433598892.0, + "step": 11364 + }, + { + "epoch": 1.4457448161811475, + "ewc_loss": 0.0634504184126854, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000297590100672096, + "grad_norm": 7.412216663360596, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8531175851821899, + "num_tokens": 433641423.0, + "step": 11365 + }, + { + "epoch": 1.445872026459738, + "ewc_loss": 0.06332993507385254, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002963852894026786, + "grad_norm": 7.299925327301025, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8659403324127197, + "num_tokens": 433677385.0, + "step": 11366 + }, + { + "epoch": 1.4459992367383285, + "ewc_loss": 0.06350390613079071, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029812497086822987, + "grad_norm": 7.419715404510498, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8607060313224792, + "num_tokens": 433716901.0, + "step": 11367 + }, + { + "epoch": 1.446126447016919, + "ewc_loss": 0.0632537379860878, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002956232929136604, + "grad_norm": 7.293433666229248, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8587623238563538, + "num_tokens": 433755099.0, + "step": 11368 + }, + { + "epoch": 1.4462536572955096, + "ewc_loss": 0.06345567107200623, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029764269129373133, + "grad_norm": 7.391688823699951, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8587633371353149, + "num_tokens": 433794012.0, + "step": 11369 + }, + { + "epoch": 1.4463808675741001, + "ewc_loss": 0.06328509747982025, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002959368866868317, + "grad_norm": 7.310329914093018, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8662265539169312, + "num_tokens": 433831965.0, + "step": 11370 + }, + { + "epoch": 1.4465080778526904, + "ewc_loss": 0.06348168104887009, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029790273401886225, + "grad_norm": 7.417682647705078, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8632290959358215, + "num_tokens": 433870819.0, + "step": 11371 + }, + { + "epoch": 1.446635288131281, + "ewc_loss": 0.06326388567686081, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002957248070742935, + "grad_norm": 7.322019100189209, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8564556837081909, + "num_tokens": 433909197.0, + "step": 11372 + }, + { + "epoch": 1.4467624984098715, + "ewc_loss": 0.06351570785045624, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002982430160045624, + "grad_norm": 7.422516822814941, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8692153692245483, + "num_tokens": 433946026.0, + "step": 11373 + }, + { + "epoch": 1.446889708688462, + "ewc_loss": 0.06332898885011673, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029637583065778017, + "grad_norm": 7.301022052764893, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8843948245048523, + "num_tokens": 433986239.0, + "step": 11374 + }, + { + "epoch": 1.4470169189670525, + "ewc_loss": 0.06348694860935211, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002979553828481585, + "grad_norm": 7.366058826446533, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.867383599281311, + "num_tokens": 434022557.0, + "step": 11375 + }, + { + "epoch": 1.447144129245643, + "ewc_loss": 0.06325820088386536, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002956679672934115, + "grad_norm": 7.319769382476807, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8677826523780823, + "num_tokens": 434059258.0, + "step": 11376 + }, + { + "epoch": 1.4472713395242336, + "ewc_loss": 0.06331919133663177, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029871927108615637, + "grad_norm": 7.438318729400635, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8483526706695557, + "num_tokens": 434094872.0, + "step": 11377 + }, + { + "epoch": 1.4473985498028241, + "ewc_loss": 0.06303185224533081, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029584584990516305, + "grad_norm": 7.326592922210693, + "learning_rate": 1e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8389439582824707, + "num_tokens": 434134842.0, + "step": 11378 + }, + { + "epoch": 1.4475257600814146, + "ewc_loss": 0.06331560015678406, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002986833860632032, + "grad_norm": 7.425675392150879, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8739752769470215, + "num_tokens": 434173152.0, + "step": 11379 + }, + { + "epoch": 1.447652970360005, + "ewc_loss": 0.06299494951963425, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029547681333497167, + "grad_norm": 7.3418378829956055, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8651150465011597, + "num_tokens": 434208318.0, + "step": 11380 + }, + { + "epoch": 1.4477801806385955, + "ewc_loss": 0.06327809393405914, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002983082376886159, + "grad_norm": 7.462403774261475, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8594179153442383, + "num_tokens": 434243693.0, + "step": 11381 + }, + { + "epoch": 1.447907390917186, + "ewc_loss": 0.06295675784349442, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029509494197554886, + "grad_norm": 7.3280534744262695, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8514254093170166, + "num_tokens": 434283172.0, + "step": 11382 + }, + { + "epoch": 1.4480346011957765, + "ewc_loss": 0.06321477144956589, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002976750547531992, + "grad_norm": 7.421009540557861, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8642356991767883, + "num_tokens": 434318789.0, + "step": 11383 + }, + { + "epoch": 1.448161811474367, + "ewc_loss": 0.06303974986076355, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029592480859719217, + "grad_norm": 7.3137311935424805, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8552069664001465, + "num_tokens": 434358943.0, + "step": 11384 + }, + { + "epoch": 1.4482890217529576, + "ewc_loss": 0.06316712498664856, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029719859594479203, + "grad_norm": 7.38831090927124, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8638458251953125, + "num_tokens": 434398264.0, + "step": 11385 + }, + { + "epoch": 1.4484162320315481, + "ewc_loss": 0.06310151517391205, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029654245008714497, + "grad_norm": 7.386407375335693, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8726906776428223, + "num_tokens": 434433602.0, + "step": 11386 + }, + { + "epoch": 1.4485434423101387, + "ewc_loss": 0.06313998997211456, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029692723182961345, + "grad_norm": 7.341980457305908, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8564167618751526, + "num_tokens": 434480001.0, + "step": 11387 + }, + { + "epoch": 1.4486706525887292, + "ewc_loss": 0.06318612396717072, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029738855664618313, + "grad_norm": 7.3290252685546875, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8780744075775146, + "num_tokens": 434519285.0, + "step": 11388 + }, + { + "epoch": 1.4487978628673197, + "ewc_loss": 0.06316155940294266, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029714294942095876, + "grad_norm": 7.480646133422852, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8619717359542847, + "num_tokens": 434556118.0, + "step": 11389 + }, + { + "epoch": 1.4489250731459102, + "ewc_loss": 0.06302950531244278, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002958223922178149, + "grad_norm": 7.316805839538574, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8474195003509521, + "num_tokens": 434595350.0, + "step": 11390 + }, + { + "epoch": 1.4490522834245008, + "ewc_loss": 0.06327299028635025, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002982572477776557, + "grad_norm": 7.501504421234131, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8597928285598755, + "num_tokens": 434632988.0, + "step": 11391 + }, + { + "epoch": 1.4491794937030913, + "ewc_loss": 0.06297807395458221, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029530812753364444, + "grad_norm": 7.257561683654785, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8702943921089172, + "num_tokens": 434673040.0, + "step": 11392 + }, + { + "epoch": 1.4493067039816818, + "ewc_loss": 0.06342515349388123, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029977888334542513, + "grad_norm": 7.512091159820557, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8794749975204468, + "num_tokens": 434712201.0, + "step": 11393 + }, + { + "epoch": 1.4494339142602723, + "ewc_loss": 0.06292757391929626, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029480308876372874, + "grad_norm": 7.245534896850586, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8764337301254272, + "num_tokens": 434756676.0, + "step": 11394 + }, + { + "epoch": 1.4495611245388629, + "ewc_loss": 0.06359443813562393, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003014717367477715, + "grad_norm": 7.422605991363525, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8660508990287781, + "num_tokens": 434797890.0, + "step": 11395 + }, + { + "epoch": 1.4496883348174532, + "ewc_loss": 0.06314854323863983, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029701279709115624, + "grad_norm": 7.370229721069336, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8753518462181091, + "num_tokens": 434836933.0, + "step": 11396 + }, + { + "epoch": 1.4498155450960437, + "ewc_loss": 0.06333315372467041, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002988589112646878, + "grad_norm": 7.399687767028809, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.874976396560669, + "num_tokens": 434868098.0, + "step": 11397 + }, + { + "epoch": 1.4499427553746342, + "ewc_loss": 0.06325848400592804, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002981122233904898, + "grad_norm": 7.388986587524414, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.856829047203064, + "num_tokens": 434907239.0, + "step": 11398 + }, + { + "epoch": 1.4500699656532248, + "ewc_loss": 0.06329330056905746, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002984603343065828, + "grad_norm": 7.333972454071045, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8698348999023438, + "num_tokens": 434945672.0, + "step": 11399 + }, + { + "epoch": 1.4501971759318153, + "ewc_loss": 0.06338039040565491, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029933120822533965, + "grad_norm": 7.421227931976318, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8521106839179993, + "num_tokens": 434981693.0, + "step": 11400 + }, + { + "epoch": 1.4503243862104058, + "ewc_loss": 0.06320961564779282, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029762351186946034, + "grad_norm": 7.381000518798828, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8847789168357849, + "num_tokens": 435012452.0, + "step": 11401 + }, + { + "epoch": 1.4504515964889964, + "ewc_loss": 0.06331411749124527, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002986685140058398, + "grad_norm": 7.393582820892334, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8571755886077881, + "num_tokens": 435052711.0, + "step": 11402 + }, + { + "epoch": 1.4505788067675869, + "ewc_loss": 0.06317666172981262, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002972939400933683, + "grad_norm": 7.33126974105835, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8618663549423218, + "num_tokens": 435092766.0, + "step": 11403 + }, + { + "epoch": 1.4507060170461774, + "ewc_loss": 0.06330004334449768, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029852776788175106, + "grad_norm": 7.427439212799072, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8468067049980164, + "num_tokens": 435132093.0, + "step": 11404 + }, + { + "epoch": 1.4508332273247677, + "ewc_loss": 0.06318250298500061, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029735235148109496, + "grad_norm": 7.390246391296387, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.858819842338562, + "num_tokens": 435169071.0, + "step": 11405 + }, + { + "epoch": 1.4509604376033582, + "ewc_loss": 0.06345000863075256, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029758605523966253, + "grad_norm": 7.36475944519043, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8724176287651062, + "num_tokens": 435202665.0, + "step": 11406 + }, + { + "epoch": 1.4510876478819488, + "ewc_loss": 0.06330610811710358, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029858844936825335, + "grad_norm": 7.381548881530762, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8775866031646729, + "num_tokens": 435234719.0, + "step": 11407 + }, + { + "epoch": 1.4512148581605393, + "ewc_loss": 0.06328783184289932, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0002984056482091546, + "grad_norm": 7.377498626708984, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8778541088104248, + "num_tokens": 435267090.0, + "step": 11408 + }, + { + "epoch": 1.4513420684391298, + "ewc_loss": 0.06342314183712006, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029975874349474907, + "grad_norm": 7.381035804748535, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8711436986923218, + "num_tokens": 435307243.0, + "step": 11409 + }, + { + "epoch": 1.4514692787177204, + "ewc_loss": 0.06345994770526886, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029768538661301136, + "grad_norm": 7.3336873054504395, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8714683055877686, + "num_tokens": 435347507.0, + "step": 11410 + }, + { + "epoch": 1.4515964889963109, + "ewc_loss": 0.06363185495138168, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029940446256659925, + "grad_norm": 7.4522624015808105, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8531556725502014, + "num_tokens": 435383499.0, + "step": 11411 + }, + { + "epoch": 1.4517236992749014, + "ewc_loss": 0.06316328048706055, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029716012068092823, + "grad_norm": 7.319623947143555, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8637024164199829, + "num_tokens": 435423500.0, + "step": 11412 + }, + { + "epoch": 1.451850909553492, + "ewc_loss": 0.06339079886674881, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.00029943534173071384, + "grad_norm": 7.434733867645264, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8662315607070923, + "num_tokens": 435457075.0, + "step": 11413 + }, + { + "epoch": 1.4519781198320825, + "ewc_loss": 0.06346890330314636, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029777499730698764, + "grad_norm": 7.40723180770874, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8620153069496155, + "num_tokens": 435498933.0, + "step": 11414 + }, + { + "epoch": 1.452105330110673, + "ewc_loss": 0.06345373392105103, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029762330814264715, + "grad_norm": 7.327645301818848, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8550832271575928, + "num_tokens": 435535949.0, + "step": 11415 + }, + { + "epoch": 1.4522325403892635, + "ewc_loss": 0.06359200924634933, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029900603112764657, + "grad_norm": 7.4832916259765625, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8558013439178467, + "num_tokens": 435565252.0, + "step": 11416 + }, + { + "epoch": 1.452359750667854, + "ewc_loss": 0.06343524903059006, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002974384115077555, + "grad_norm": 7.343410015106201, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8434268236160278, + "num_tokens": 435604663.0, + "step": 11417 + }, + { + "epoch": 1.4524869609464446, + "ewc_loss": 0.06366318464279175, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029971779440529644, + "grad_norm": 7.431484699249268, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8578158617019653, + "num_tokens": 435640883.0, + "step": 11418 + }, + { + "epoch": 1.4526141712250351, + "ewc_loss": 0.06340987980365753, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002971847716253251, + "grad_norm": 7.320940017700195, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8664647340774536, + "num_tokens": 435682787.0, + "step": 11419 + }, + { + "epoch": 1.4527413815036254, + "ewc_loss": 0.0637606829404831, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000300692772725597, + "grad_norm": 7.486785888671875, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8603801131248474, + "num_tokens": 435719982.0, + "step": 11420 + }, + { + "epoch": 1.452868591782216, + "ewc_loss": 0.06348618865013123, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002979478449560702, + "grad_norm": 7.388799667358398, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.846507728099823, + "num_tokens": 435755153.0, + "step": 11421 + }, + { + "epoch": 1.4529958020608065, + "ewc_loss": 0.06356372684240341, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029872320010326803, + "grad_norm": 7.390563011169434, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8721301555633545, + "num_tokens": 435797513.0, + "step": 11422 + }, + { + "epoch": 1.453123012339397, + "ewc_loss": 0.06345764547586441, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002976623654831201, + "grad_norm": 7.353679180145264, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8620843291282654, + "num_tokens": 435836835.0, + "step": 11423 + }, + { + "epoch": 1.4532502226179875, + "ewc_loss": 0.06350439786911011, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002981299185194075, + "grad_norm": 7.368897914886475, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8546364307403564, + "num_tokens": 435875124.0, + "step": 11424 + }, + { + "epoch": 1.453377432896578, + "ewc_loss": 0.06355860084295273, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029867191915400326, + "grad_norm": 7.389806270599365, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.875097393989563, + "num_tokens": 435914993.0, + "step": 11425 + }, + { + "epoch": 1.4535046431751686, + "ewc_loss": 0.06353219598531723, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002984078892040998, + "grad_norm": 7.454836368560791, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8718494176864624, + "num_tokens": 435950916.0, + "step": 11426 + }, + { + "epoch": 1.4536318534537591, + "ewc_loss": 0.06341153383255005, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002972013026010245, + "grad_norm": 7.355592727661133, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8564149737358093, + "num_tokens": 435991986.0, + "step": 11427 + }, + { + "epoch": 1.4537590637323496, + "ewc_loss": 0.0634184181690216, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029727016226388514, + "grad_norm": 7.409531593322754, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8706448674201965, + "num_tokens": 436024603.0, + "step": 11428 + }, + { + "epoch": 1.45388627401094, + "ewc_loss": 0.06345825642347336, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000297668477287516, + "grad_norm": 7.326775074005127, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8580948114395142, + "num_tokens": 436065540.0, + "step": 11429 + }, + { + "epoch": 1.4540134842895305, + "ewc_loss": 0.06352563202381134, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002983422891702503, + "grad_norm": 7.4269304275512695, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8498015403747559, + "num_tokens": 436104394.0, + "step": 11430 + }, + { + "epoch": 1.454140694568121, + "ewc_loss": 0.06336186826229095, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029670458752661943, + "grad_norm": 7.398966312408447, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8496745824813843, + "num_tokens": 436139977.0, + "step": 11431 + }, + { + "epoch": 1.4542679048467115, + "ewc_loss": 0.0635792464017868, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002988784108310938, + "grad_norm": 7.3605546951293945, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8781217336654663, + "num_tokens": 436178047.0, + "step": 11432 + }, + { + "epoch": 1.454395115125302, + "ewc_loss": 0.06339665502309799, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029705246561206877, + "grad_norm": 7.344563007354736, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8615983724594116, + "num_tokens": 436218331.0, + "step": 11433 + }, + { + "epoch": 1.4545223254038926, + "ewc_loss": 0.06351715326309204, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002982574515044689, + "grad_norm": 7.359875679016113, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8607987761497498, + "num_tokens": 436255663.0, + "step": 11434 + }, + { + "epoch": 1.4546495356824831, + "ewc_loss": 0.06348545104265213, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002979404234793037, + "grad_norm": 7.305035591125488, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8626998662948608, + "num_tokens": 436301328.0, + "step": 11435 + }, + { + "epoch": 1.4547767459610736, + "ewc_loss": 0.06362219154834747, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029930780874565244, + "grad_norm": 7.449491024017334, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8709368705749512, + "num_tokens": 436336487.0, + "step": 11436 + }, + { + "epoch": 1.4549039562396642, + "ewc_loss": 0.06350493431091309, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000298135302728042, + "grad_norm": 7.361548900604248, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8650228977203369, + "num_tokens": 436377629.0, + "step": 11437 + }, + { + "epoch": 1.4550311665182547, + "ewc_loss": 0.06361325830221176, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002992185181938112, + "grad_norm": 7.338539123535156, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8776513338088989, + "num_tokens": 436421590.0, + "step": 11438 + }, + { + "epoch": 1.4551583767968452, + "ewc_loss": 0.06360878795385361, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002991738438140601, + "grad_norm": 7.404051303863525, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8555785417556763, + "num_tokens": 436461172.0, + "step": 11439 + }, + { + "epoch": 1.4552855870754358, + "ewc_loss": 0.0636986792087555, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003000727156177163, + "grad_norm": 7.412561416625977, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8715038299560547, + "num_tokens": 436497536.0, + "step": 11440 + }, + { + "epoch": 1.4554127973540263, + "ewc_loss": 0.06367072463035583, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002997931733261794, + "grad_norm": 7.387686252593994, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8602176308631897, + "num_tokens": 436543906.0, + "step": 11441 + }, + { + "epoch": 1.4555400076326168, + "ewc_loss": 0.06362500041723251, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029933592304587364, + "grad_norm": 7.428734302520752, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.865654706954956, + "num_tokens": 436579462.0, + "step": 11442 + }, + { + "epoch": 1.4556672179112073, + "ewc_loss": 0.06365163624286652, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002996023395098746, + "grad_norm": 7.382863998413086, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8554088473320007, + "num_tokens": 436618840.0, + "step": 11443 + }, + { + "epoch": 1.4557944281897979, + "ewc_loss": 0.06366253644227982, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029971127514727414, + "grad_norm": 7.425882339477539, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8522894978523254, + "num_tokens": 436654176.0, + "step": 11444 + }, + { + "epoch": 1.4559216384683882, + "ewc_loss": 0.06358923017978668, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002989782369695604, + "grad_norm": 7.424076557159424, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8768664598464966, + "num_tokens": 436685507.0, + "step": 11445 + }, + { + "epoch": 1.4560488487469787, + "ewc_loss": 0.06366124004125595, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029969835304655135, + "grad_norm": 7.366144180297852, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8705061078071594, + "num_tokens": 436729881.0, + "step": 11446 + }, + { + "epoch": 1.4561760590255692, + "ewc_loss": 0.06357836723327637, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029886962147429585, + "grad_norm": 7.39103364944458, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8618168234825134, + "num_tokens": 436772355.0, + "step": 11447 + }, + { + "epoch": 1.4563032693041598, + "ewc_loss": 0.0636456161737442, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002995421236846596, + "grad_norm": 7.413712501525879, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8560965657234192, + "num_tokens": 436808701.0, + "step": 11448 + }, + { + "epoch": 1.4564304795827503, + "ewc_loss": 0.06372713297605515, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030035729287192225, + "grad_norm": 7.401525020599365, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.871667206287384, + "num_tokens": 436845441.0, + "step": 11449 + }, + { + "epoch": 1.4565576898613408, + "ewc_loss": 0.06361038982868195, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002991897927131504, + "grad_norm": 7.3817291259765625, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8668291568756104, + "num_tokens": 436885316.0, + "step": 11450 + }, + { + "epoch": 1.4566849001399313, + "ewc_loss": 0.06372956931591034, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030038165277801454, + "grad_norm": 7.465480804443359, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8562761545181274, + "num_tokens": 436918315.0, + "step": 11451 + }, + { + "epoch": 1.4568121104185219, + "ewc_loss": 0.06368277966976166, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002999136922881007, + "grad_norm": 7.4018402099609375, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8560790419578552, + "num_tokens": 436956039.0, + "step": 11452 + }, + { + "epoch": 1.4569393206971124, + "ewc_loss": 0.06373439729213715, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003004299069289118, + "grad_norm": 7.458695888519287, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8487153053283691, + "num_tokens": 436991274.0, + "step": 11453 + }, + { + "epoch": 1.4570665309757027, + "ewc_loss": 0.06360659748315811, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029915192862972617, + "grad_norm": 7.362948417663574, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8745681047439575, + "num_tokens": 437029963.0, + "step": 11454 + }, + { + "epoch": 1.4571937412542932, + "ewc_loss": 0.06390274316072464, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003021133888978511, + "grad_norm": 7.502621650695801, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8558365106582642, + "num_tokens": 437067030.0, + "step": 11455 + }, + { + "epoch": 1.4573209515328838, + "ewc_loss": 0.0634915828704834, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029800174525007606, + "grad_norm": 7.371140480041504, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8692295551300049, + "num_tokens": 437103363.0, + "step": 11456 + }, + { + "epoch": 1.4574481618114743, + "ewc_loss": 0.06387147307395935, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030180063913576305, + "grad_norm": 7.55781364440918, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8695404529571533, + "num_tokens": 437131574.0, + "step": 11457 + }, + { + "epoch": 1.4575753720900648, + "ewc_loss": 0.06346239149570465, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029770989203825593, + "grad_norm": 7.31699275970459, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8566896319389343, + "num_tokens": 437169537.0, + "step": 11458 + }, + { + "epoch": 1.4577025823686554, + "ewc_loss": 0.06383012235164642, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003013871319126338, + "grad_norm": 7.484833240509033, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8669511079788208, + "num_tokens": 437208025.0, + "step": 11459 + }, + { + "epoch": 1.4578297926472459, + "ewc_loss": 0.06352288275957108, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002983147569466382, + "grad_norm": 7.350037097930908, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8743034601211548, + "num_tokens": 437246642.0, + "step": 11460 + }, + { + "epoch": 1.4579570029258364, + "ewc_loss": 0.06380005180835724, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030108646024018526, + "grad_norm": 7.432233810424805, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.859192967414856, + "num_tokens": 437285274.0, + "step": 11461 + }, + { + "epoch": 1.458084213204427, + "ewc_loss": 0.0636000782251358, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00029908670694567263, + "grad_norm": 7.361180782318115, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8784697651863098, + "num_tokens": 437325865.0, + "step": 11462 + }, + { + "epoch": 1.4582114234830175, + "ewc_loss": 0.06375093013048172, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003005952457897365, + "grad_norm": 13.587571144104004, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8682317733764648, + "num_tokens": 437364093.0, + "step": 11463 + }, + { + "epoch": 1.458338633761608, + "ewc_loss": 0.07324180006980896, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000395503913750872, + "grad_norm": 8.605484008789062, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8507788181304932, + "num_tokens": 437402413.0, + "step": 11464 + }, + { + "epoch": 1.4584658440401985, + "ewc_loss": 0.06231391429901123, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002862250548787415, + "grad_norm": 7.162538528442383, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8497176766395569, + "num_tokens": 437442583.0, + "step": 11465 + }, + { + "epoch": 1.458593054318789, + "ewc_loss": 0.06563594937324524, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031944538932293653, + "grad_norm": 7.865898609161377, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8631608486175537, + "num_tokens": 437485682.0, + "step": 11466 + }, + { + "epoch": 1.4587202645973796, + "ewc_loss": 0.06414103507995605, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030449629412032664, + "grad_norm": 7.300265312194824, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8536569476127625, + "num_tokens": 437524859.0, + "step": 11467 + }, + { + "epoch": 1.45884747487597, + "ewc_loss": 0.06506019830703735, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003136879240628332, + "grad_norm": 7.729525089263916, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8552006483078003, + "num_tokens": 437563590.0, + "step": 11468 + }, + { + "epoch": 1.4589746851545604, + "ewc_loss": 0.06403334438800812, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030341933597810566, + "grad_norm": 7.351189613342285, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8793243169784546, + "num_tokens": 437601429.0, + "step": 11469 + }, + { + "epoch": 1.459101895433151, + "ewc_loss": 0.06483954191207886, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000311481358949095, + "grad_norm": 7.670482635498047, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8508179187774658, + "num_tokens": 437639144.0, + "step": 11470 + }, + { + "epoch": 1.4592291057117415, + "ewc_loss": 0.06411780416965485, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030426401644945145, + "grad_norm": 7.369545936584473, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8715571165084839, + "num_tokens": 437680846.0, + "step": 11471 + }, + { + "epoch": 1.459356315990332, + "ewc_loss": 0.06452091038227081, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003082950715906918, + "grad_norm": 7.539809703826904, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8645239472389221, + "num_tokens": 437722156.0, + "step": 11472 + }, + { + "epoch": 1.4594835262689225, + "ewc_loss": 0.06410078704357147, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030409378814511, + "grad_norm": 7.410585880279541, + "learning_rate": 1e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8405710458755493, + "num_tokens": 437766905.0, + "step": 11473 + }, + { + "epoch": 1.459610736547513, + "ewc_loss": 0.06435319036245346, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003066178469453007, + "grad_norm": 7.547945499420166, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.863286018371582, + "num_tokens": 437801425.0, + "step": 11474 + }, + { + "epoch": 1.4597379468261036, + "ewc_loss": 0.06394262611865997, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030251219868659973, + "grad_norm": 7.411527156829834, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8526096343994141, + "num_tokens": 437838126.0, + "step": 11475 + }, + { + "epoch": 1.4598651571046941, + "ewc_loss": 0.06423773616552353, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030546329799108207, + "grad_norm": 7.478614807128906, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8578775525093079, + "num_tokens": 437876597.0, + "step": 11476 + }, + { + "epoch": 1.4599923673832846, + "ewc_loss": 0.06366683542728424, + "ewc_loss_diag": 3.337860107421875e-05, + "ewc_loss_parallel": 0.0003021957236342132, + "grad_norm": 7.377806663513184, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8746029734611511, + "num_tokens": 437911852.0, + "step": 11477 + }, + { + "epoch": 1.460119577661875, + "ewc_loss": 0.06408847868442535, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003039707080461085, + "grad_norm": 7.451718807220459, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8637944459915161, + "num_tokens": 437953418.0, + "step": 11478 + }, + { + "epoch": 1.4602467879404655, + "ewc_loss": 0.06387335807085037, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000301819498417899, + "grad_norm": 7.40765380859375, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8689672946929932, + "num_tokens": 437989561.0, + "step": 11479 + }, + { + "epoch": 1.460373998219056, + "ewc_loss": 0.06397917866706848, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030287771369330585, + "grad_norm": 7.372497081756592, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8716638088226318, + "num_tokens": 438033464.0, + "step": 11480 + }, + { + "epoch": 1.4605012084976465, + "ewc_loss": 0.0639723464846611, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030280937789939344, + "grad_norm": 7.421249866485596, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8673932552337646, + "num_tokens": 438068676.0, + "step": 11481 + }, + { + "epoch": 1.460628418776237, + "ewc_loss": 0.06398305296897888, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003029164217878133, + "grad_norm": 7.463711261749268, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8446488380432129, + "num_tokens": 438107867.0, + "step": 11482 + }, + { + "epoch": 1.4607556290548276, + "ewc_loss": 0.06395108997821808, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030259686172939837, + "grad_norm": 7.372306823730469, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8796627521514893, + "num_tokens": 438144256.0, + "step": 11483 + }, + { + "epoch": 1.4608828393334181, + "ewc_loss": 0.06404908001422882, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030357675859704614, + "grad_norm": 13.572122573852539, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8721961975097656, + "num_tokens": 438183458.0, + "step": 11484 + }, + { + "epoch": 1.4610100496120086, + "ewc_loss": 0.07348057627677917, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00039789165020920336, + "grad_norm": 8.561888694763184, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8508186936378479, + "num_tokens": 438216025.0, + "step": 11485 + }, + { + "epoch": 1.4611372598905992, + "ewc_loss": 0.06284940242767334, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002915799559559673, + "grad_norm": 7.100366592407227, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8739153146743774, + "num_tokens": 438258185.0, + "step": 11486 + }, + { + "epoch": 1.4612644701691897, + "ewc_loss": 0.06579872965812683, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003210732829757035, + "grad_norm": 7.8074822425842285, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8585059642791748, + "num_tokens": 438298189.0, + "step": 11487 + }, + { + "epoch": 1.4613916804477802, + "ewc_loss": 0.06448377668857574, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003079237067140639, + "grad_norm": 7.352543830871582, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8522112965583801, + "num_tokens": 438336593.0, + "step": 11488 + }, + { + "epoch": 1.4615188907263708, + "ewc_loss": 0.06522609293460846, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000315346842398867, + "grad_norm": 7.704111099243164, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8695515394210815, + "num_tokens": 438374110.0, + "step": 11489 + }, + { + "epoch": 1.4616461010049613, + "ewc_loss": 0.06418095529079437, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003048954822588712, + "grad_norm": 7.316690921783447, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8460781574249268, + "num_tokens": 438413062.0, + "step": 11490 + }, + { + "epoch": 1.4617733112835518, + "ewc_loss": 0.06504964828491211, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000313582451781258, + "grad_norm": 7.636434078216553, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8657350540161133, + "num_tokens": 438452055.0, + "step": 11491 + }, + { + "epoch": 1.4619005215621423, + "ewc_loss": 0.0642957091331482, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003060429880861193, + "grad_norm": 7.362610340118408, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8658390641212463, + "num_tokens": 438494224.0, + "step": 11492 + }, + { + "epoch": 1.4620277318407329, + "ewc_loss": 0.06481106579303741, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031119657796807587, + "grad_norm": 7.553036689758301, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8595191836357117, + "num_tokens": 438529790.0, + "step": 11493 + }, + { + "epoch": 1.4621549421193232, + "ewc_loss": 0.06432554870843887, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030634141876362264, + "grad_norm": 7.393606185913086, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8657145500183105, + "num_tokens": 438571371.0, + "step": 11494 + }, + { + "epoch": 1.4622821523979137, + "ewc_loss": 0.06460751593112946, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003091611433774233, + "grad_norm": 7.479282379150391, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8491606712341309, + "num_tokens": 438612383.0, + "step": 11495 + }, + { + "epoch": 1.4624093626765042, + "ewc_loss": 0.06434355676174164, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030652154237031937, + "grad_norm": 7.467196464538574, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8531445264816284, + "num_tokens": 438653443.0, + "step": 11496 + }, + { + "epoch": 1.4625365729550948, + "ewc_loss": 0.06434430181980133, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003065289929509163, + "grad_norm": 7.477667331695557, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8514519929885864, + "num_tokens": 438689565.0, + "step": 11497 + }, + { + "epoch": 1.4626637832336853, + "ewc_loss": 0.06417824327945709, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000304868386592716, + "grad_norm": 7.408156871795654, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8629904985427856, + "num_tokens": 438727753.0, + "step": 11498 + }, + { + "epoch": 1.4627909935122758, + "ewc_loss": 0.06435856223106384, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030667157261632383, + "grad_norm": 7.482241630554199, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8606481552124023, + "num_tokens": 438764667.0, + "step": 11499 + }, + { + "epoch": 1.4629182037908663, + "ewc_loss": 0.06414742767810822, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003045602061320096, + "grad_norm": 7.394872188568115, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8684767484664917, + "num_tokens": 438801739.0, + "step": 11500 + }, + { + "epoch": 1.4630454140694569, + "ewc_loss": 0.0643177404999733, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003062633331865072, + "grad_norm": 7.467855930328369, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8640512824058533, + "num_tokens": 438840028.0, + "step": 11501 + }, + { + "epoch": 1.4631726243480474, + "ewc_loss": 0.06405039131641388, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003035898844245821, + "grad_norm": 7.395323753356934, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8613941669464111, + "num_tokens": 438880632.0, + "step": 11502 + }, + { + "epoch": 1.4632998346266377, + "ewc_loss": 0.06419635564088821, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003050494706258178, + "grad_norm": 7.417938709259033, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.859807014465332, + "num_tokens": 438923040.0, + "step": 11503 + }, + { + "epoch": 1.4634270449052282, + "ewc_loss": 0.0641830787062645, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030491669895127416, + "grad_norm": 7.424942970275879, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.863054096698761, + "num_tokens": 438958597.0, + "step": 11504 + }, + { + "epoch": 1.4635542551838188, + "ewc_loss": 0.06408274173736572, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003039133735001087, + "grad_norm": 7.406052112579346, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8507716059684753, + "num_tokens": 438997909.0, + "step": 11505 + }, + { + "epoch": 1.4636814654624093, + "ewc_loss": 0.06420989334583282, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030518489074893296, + "grad_norm": 7.45565128326416, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8707194924354553, + "num_tokens": 439030084.0, + "step": 11506 + }, + { + "epoch": 1.4638086757409998, + "ewc_loss": 0.06416383385658264, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003047242935281247, + "grad_norm": 7.4262776374816895, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8655063509941101, + "num_tokens": 439067770.0, + "step": 11507 + }, + { + "epoch": 1.4639358860195903, + "ewc_loss": 0.06406852602958679, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003037711721844971, + "grad_norm": 7.450572967529297, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8651891350746155, + "num_tokens": 439102476.0, + "step": 11508 + }, + { + "epoch": 1.4640630962981809, + "ewc_loss": 0.06412535160779953, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003043394535779953, + "grad_norm": 7.390153408050537, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.873700737953186, + "num_tokens": 439138660.0, + "step": 11509 + }, + { + "epoch": 1.4641903065767714, + "ewc_loss": 0.06422285735607147, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030531451920978725, + "grad_norm": 7.394106864929199, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.871522068977356, + "num_tokens": 439181690.0, + "step": 11510 + }, + { + "epoch": 1.464317516855362, + "ewc_loss": 0.06414280086755753, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003045139601454139, + "grad_norm": 7.447348117828369, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8687489628791809, + "num_tokens": 439218825.0, + "step": 11511 + }, + { + "epoch": 1.4644447271339525, + "ewc_loss": 0.06414837390184402, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003045696939807385, + "grad_norm": 7.391870498657227, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8681501746177673, + "num_tokens": 439253505.0, + "step": 11512 + }, + { + "epoch": 1.464571937412543, + "ewc_loss": 0.06418025493621826, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003048884682357311, + "grad_norm": 7.441227912902832, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8581569194793701, + "num_tokens": 439292764.0, + "step": 11513 + }, + { + "epoch": 1.4646991476911335, + "ewc_loss": 0.06406538188457489, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003037397109437734, + "grad_norm": 7.355926036834717, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8727461099624634, + "num_tokens": 439335269.0, + "step": 11514 + }, + { + "epoch": 1.464826357969724, + "ewc_loss": 0.06432357430458069, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003063216572627425, + "grad_norm": 7.48816442489624, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8651512861251831, + "num_tokens": 439371977.0, + "step": 11515 + }, + { + "epoch": 1.4649535682483146, + "ewc_loss": 0.06406491249799728, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030373508343473077, + "grad_norm": 7.379456996917725, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.863247811794281, + "num_tokens": 439412710.0, + "step": 11516 + }, + { + "epoch": 1.465080778526905, + "ewc_loss": 0.06428978592157364, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003059837908949703, + "grad_norm": 7.492739677429199, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8615785837173462, + "num_tokens": 439449637.0, + "step": 11517 + }, + { + "epoch": 1.4652079888054954, + "ewc_loss": 0.06402543187141418, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003033402608707547, + "grad_norm": 7.350694179534912, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8574417233467102, + "num_tokens": 439495143.0, + "step": 11518 + }, + { + "epoch": 1.465335199084086, + "ewc_loss": 0.06415650993585587, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003046510391868651, + "grad_norm": 7.4504194259643555, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8510812520980835, + "num_tokens": 439532721.0, + "step": 11519 + }, + { + "epoch": 1.4654624093626765, + "ewc_loss": 0.0640038400888443, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030312436865642667, + "grad_norm": 7.420166969299316, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8727346062660217, + "num_tokens": 439567938.0, + "step": 11520 + }, + { + "epoch": 1.465589619641267, + "ewc_loss": 0.06421376764774323, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030522356973960996, + "grad_norm": 7.479997634887695, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8726003170013428, + "num_tokens": 439595720.0, + "step": 11521 + }, + { + "epoch": 1.4657168299198575, + "ewc_loss": 0.06391912698745728, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030227721435949206, + "grad_norm": 7.366080284118652, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8639504313468933, + "num_tokens": 439634091.0, + "step": 11522 + }, + { + "epoch": 1.465844040198448, + "ewc_loss": 0.06415525078773499, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003046384372282773, + "grad_norm": 7.506247043609619, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8516141176223755, + "num_tokens": 439671978.0, + "step": 11523 + }, + { + "epoch": 1.4659712504770386, + "ewc_loss": 0.06378211081027985, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003009070351254195, + "grad_norm": 7.335558891296387, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8582597374916077, + "num_tokens": 439707748.0, + "step": 11524 + }, + { + "epoch": 1.466098460755629, + "ewc_loss": 0.06412310153245926, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003043169272132218, + "grad_norm": 7.4492926597595215, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8708001971244812, + "num_tokens": 439744060.0, + "step": 11525 + }, + { + "epoch": 1.4662256710342196, + "ewc_loss": 0.06398279964923859, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030291397706605494, + "grad_norm": 7.343252658843994, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8563492298126221, + "num_tokens": 439789277.0, + "step": 11526 + }, + { + "epoch": 1.46635288131281, + "ewc_loss": 0.06415875256061554, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030467347824014723, + "grad_norm": 7.399134159088135, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8806909322738647, + "num_tokens": 439827524.0, + "step": 11527 + }, + { + "epoch": 1.4664800915914005, + "ewc_loss": 0.06403624266386032, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003034483816009015, + "grad_norm": 7.416092872619629, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8592054843902588, + "num_tokens": 439865181.0, + "step": 11528 + }, + { + "epoch": 1.466607301869991, + "ewc_loss": 0.06399766355752945, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000303062581224367, + "grad_norm": 7.485100269317627, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.855355978012085, + "num_tokens": 439900071.0, + "step": 11529 + }, + { + "epoch": 1.4667345121485815, + "ewc_loss": 0.06398738920688629, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030295984470285475, + "grad_norm": 7.380316734313965, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8614897727966309, + "num_tokens": 439940839.0, + "step": 11530 + }, + { + "epoch": 1.466861722427172, + "ewc_loss": 0.06412677466869354, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003043536562472582, + "grad_norm": 7.4215779304504395, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8592784404754639, + "num_tokens": 439978908.0, + "step": 11531 + }, + { + "epoch": 1.4669889327057626, + "ewc_loss": 0.06391000747680664, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030218600295484066, + "grad_norm": 7.378759384155273, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8614442348480225, + "num_tokens": 440013448.0, + "step": 11532 + }, + { + "epoch": 1.467116142984353, + "ewc_loss": 0.06405900418758392, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003036760026589036, + "grad_norm": 7.414154529571533, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8661710023880005, + "num_tokens": 440048377.0, + "step": 11533 + }, + { + "epoch": 1.4672433532629436, + "ewc_loss": 0.06405316293239594, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000303617533063516, + "grad_norm": 7.379918575286865, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8568698763847351, + "num_tokens": 440089806.0, + "step": 11534 + }, + { + "epoch": 1.4673705635415342, + "ewc_loss": 0.0640001967549324, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003030879015568644, + "grad_norm": 7.413455009460449, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8689918518066406, + "num_tokens": 440124209.0, + "step": 11535 + }, + { + "epoch": 1.4674977738201247, + "ewc_loss": 0.06404073536396027, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003034932888112962, + "grad_norm": 7.462254524230957, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8648815751075745, + "num_tokens": 440158349.0, + "step": 11536 + }, + { + "epoch": 1.4676249840987152, + "ewc_loss": 0.0639905110001564, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030299104400910437, + "grad_norm": 7.403932571411133, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8640649318695068, + "num_tokens": 440195220.0, + "step": 11537 + }, + { + "epoch": 1.4677521943773058, + "ewc_loss": 0.06399250775575638, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003030110092367977, + "grad_norm": 7.346038818359375, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8629266619682312, + "num_tokens": 440235158.0, + "step": 11538 + }, + { + "epoch": 1.4678794046558963, + "ewc_loss": 0.06411653757095337, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030425129807554185, + "grad_norm": 7.45042610168457, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8565616011619568, + "num_tokens": 440271879.0, + "step": 11539 + }, + { + "epoch": 1.4680066149344868, + "ewc_loss": 0.06392907351255417, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003023766621481627, + "grad_norm": 7.4106926918029785, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8619986772537231, + "num_tokens": 440306454.0, + "step": 11540 + }, + { + "epoch": 1.4681338252130773, + "ewc_loss": 0.06405376642942429, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003036236157640815, + "grad_norm": 7.3968095779418945, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.851564347743988, + "num_tokens": 440341185.0, + "step": 11541 + }, + { + "epoch": 1.4682610354916679, + "ewc_loss": 0.06401944160461426, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003032803360838443, + "grad_norm": 7.394135475158691, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8614721298217773, + "num_tokens": 440379168.0, + "step": 11542 + }, + { + "epoch": 1.4683882457702582, + "ewc_loss": 0.06410335749387741, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030411951593123376, + "grad_norm": 7.379642963409424, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8654716610908508, + "num_tokens": 440417945.0, + "step": 11543 + }, + { + "epoch": 1.4685154560488487, + "ewc_loss": 0.06399878114461899, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030307372799143195, + "grad_norm": 7.353019714355469, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8474768400192261, + "num_tokens": 440457864.0, + "step": 11544 + }, + { + "epoch": 1.4686426663274392, + "ewc_loss": 0.0642162412405014, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030524833709932864, + "grad_norm": 7.490757942199707, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8578194379806519, + "num_tokens": 440491709.0, + "step": 11545 + }, + { + "epoch": 1.4687698766060298, + "ewc_loss": 0.0638863667845726, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030194962164387107, + "grad_norm": 7.326543807983398, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8681638836860657, + "num_tokens": 440533570.0, + "step": 11546 + }, + { + "epoch": 1.4688970868846203, + "ewc_loss": 0.06436396390199661, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003067255893256515, + "grad_norm": 7.519439220428467, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8686791658401489, + "num_tokens": 440568360.0, + "step": 11547 + }, + { + "epoch": 1.4690242971632108, + "ewc_loss": 0.063885897397995, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030194493592716753, + "grad_norm": 7.312323093414307, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8727637529373169, + "num_tokens": 440606848.0, + "step": 11548 + }, + { + "epoch": 1.4691515074418013, + "ewc_loss": 0.0642469972372055, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030555593548342586, + "grad_norm": 7.466684818267822, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8749459385871887, + "num_tokens": 440638254.0, + "step": 11549 + }, + { + "epoch": 1.4692787177203919, + "ewc_loss": 0.06389233469963074, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030200931360013783, + "grad_norm": 7.393805027008057, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.869431734085083, + "num_tokens": 440674122.0, + "step": 11550 + }, + { + "epoch": 1.4694059279989824, + "ewc_loss": 0.06407780945301056, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003038640134036541, + "grad_norm": 7.341200828552246, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8770161867141724, + "num_tokens": 440718362.0, + "step": 11551 + }, + { + "epoch": 1.4695331382775727, + "ewc_loss": 0.06412900239229202, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003043759788852185, + "grad_norm": 7.408219337463379, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8696684837341309, + "num_tokens": 440755971.0, + "step": 11552 + }, + { + "epoch": 1.4696603485561632, + "ewc_loss": 0.06394708156585693, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000302556756651029, + "grad_norm": 7.386014938354492, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8606317639350891, + "num_tokens": 440794328.0, + "step": 11553 + }, + { + "epoch": 1.4697875588347538, + "ewc_loss": 0.0641355812549591, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030444172443822026, + "grad_norm": 7.393311977386475, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8499501943588257, + "num_tokens": 440833383.0, + "step": 11554 + }, + { + "epoch": 1.4699147691133443, + "ewc_loss": 0.06406356394290924, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030372157925739884, + "grad_norm": 7.378891468048096, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8702013492584229, + "num_tokens": 440875502.0, + "step": 11555 + }, + { + "epoch": 1.4700419793919348, + "ewc_loss": 0.06400557607412338, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003031416854355484, + "grad_norm": 7.416975498199463, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8664574027061462, + "num_tokens": 440915259.0, + "step": 11556 + }, + { + "epoch": 1.4701691896705253, + "ewc_loss": 0.06407349556684494, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003038208815269172, + "grad_norm": 7.408404350280762, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8597480058670044, + "num_tokens": 440951660.0, + "step": 11557 + }, + { + "epoch": 1.4702963999491159, + "ewc_loss": 0.06405849009752274, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003036708221770823, + "grad_norm": 7.361673831939697, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8626198172569275, + "num_tokens": 440992533.0, + "step": 11558 + }, + { + "epoch": 1.4704236102277064, + "ewc_loss": 0.06414829194545746, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003045688499696553, + "grad_norm": 7.481865406036377, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8524079918861389, + "num_tokens": 441027360.0, + "step": 11559 + }, + { + "epoch": 1.470550820506297, + "ewc_loss": 0.06399074196815491, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030299331410788, + "grad_norm": 7.329686164855957, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8699014186859131, + "num_tokens": 441071746.0, + "step": 11560 + }, + { + "epoch": 1.4706780307848875, + "ewc_loss": 0.06418876349925995, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030497353873215616, + "grad_norm": 7.476879119873047, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8616715669631958, + "num_tokens": 441105378.0, + "step": 11561 + }, + { + "epoch": 1.470805241063478, + "ewc_loss": 0.06396548449993134, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030274075106717646, + "grad_norm": 7.3798322677612305, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8638624548912048, + "num_tokens": 441150000.0, + "step": 11562 + }, + { + "epoch": 1.4709324513420685, + "ewc_loss": 0.06418762356042862, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003049621591344476, + "grad_norm": 7.410763740539551, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.856479287147522, + "num_tokens": 441188426.0, + "step": 11563 + }, + { + "epoch": 1.471059661620659, + "ewc_loss": 0.06407449394464493, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030383089324459434, + "grad_norm": 7.431994438171387, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8696724772453308, + "num_tokens": 441226028.0, + "step": 11564 + }, + { + "epoch": 1.4711868718992496, + "ewc_loss": 0.06404893100261688, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003035752451978624, + "grad_norm": 7.452309608459473, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8646431565284729, + "num_tokens": 441265788.0, + "step": 11565 + }, + { + "epoch": 1.47131408217784, + "ewc_loss": 0.06403104960918427, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030339646036736667, + "grad_norm": 7.430629730224609, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8711495399475098, + "num_tokens": 441297280.0, + "step": 11566 + }, + { + "epoch": 1.4714412924564304, + "ewc_loss": 0.06414653360843658, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030455124215222895, + "grad_norm": 7.493858814239502, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8631922006607056, + "num_tokens": 441331229.0, + "step": 11567 + }, + { + "epoch": 1.471568502735021, + "ewc_loss": 0.06389481574296951, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003020340809598565, + "grad_norm": 7.351524353027344, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.867603600025177, + "num_tokens": 441371064.0, + "step": 11568 + }, + { + "epoch": 1.4716957130136115, + "ewc_loss": 0.06405997276306152, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030368566513061523, + "grad_norm": 7.497189521789551, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8439925909042358, + "num_tokens": 441405706.0, + "step": 11569 + }, + { + "epoch": 1.471822923292202, + "ewc_loss": 0.06393962353467941, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003024821635335684, + "grad_norm": 7.41084623336792, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8547940254211426, + "num_tokens": 441445966.0, + "step": 11570 + }, + { + "epoch": 1.4719501335707925, + "ewc_loss": 0.06410524249076843, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030413837521336973, + "grad_norm": 7.489799976348877, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8470290899276733, + "num_tokens": 441478916.0, + "step": 11571 + }, + { + "epoch": 1.472077343849383, + "ewc_loss": 0.06389106065034866, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003019965661223978, + "grad_norm": 7.361949443817139, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8598935604095459, + "num_tokens": 441519245.0, + "step": 11572 + }, + { + "epoch": 1.4722045541279736, + "ewc_loss": 0.06414379179477692, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003045238845515996, + "grad_norm": 7.462252616882324, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8561973571777344, + "num_tokens": 441557485.0, + "step": 11573 + }, + { + "epoch": 1.472331764406564, + "ewc_loss": 0.06387278437614441, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003018137940671295, + "grad_norm": 7.406523704528809, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8735088109970093, + "num_tokens": 441596042.0, + "step": 11574 + }, + { + "epoch": 1.4724589746851546, + "ewc_loss": 0.06410102546215057, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000304096145555377, + "grad_norm": 7.490410804748535, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8735904693603516, + "num_tokens": 441634391.0, + "step": 11575 + }, + { + "epoch": 1.472586184963745, + "ewc_loss": 0.0639139860868454, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030222575878724456, + "grad_norm": 7.443438529968262, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8553749322891235, + "num_tokens": 441672201.0, + "step": 11576 + }, + { + "epoch": 1.4727133952423355, + "ewc_loss": 0.06404684484004974, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030355434864759445, + "grad_norm": 7.434764385223389, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8618670105934143, + "num_tokens": 441713350.0, + "step": 11577 + }, + { + "epoch": 1.472840605520926, + "ewc_loss": 0.06400871276855469, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003031730302609503, + "grad_norm": 7.4309282302856445, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.861105740070343, + "num_tokens": 441754677.0, + "step": 11578 + }, + { + "epoch": 1.4729678157995165, + "ewc_loss": 0.06394428014755249, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003025287005584687, + "grad_norm": 7.446584224700928, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8518379330635071, + "num_tokens": 441797037.0, + "step": 11579 + }, + { + "epoch": 1.473095026078107, + "ewc_loss": 0.06408888846635818, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003039748116862029, + "grad_norm": 7.505154609680176, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8606411218643188, + "num_tokens": 441831661.0, + "step": 11580 + }, + { + "epoch": 1.4732222363566976, + "ewc_loss": 0.06379760801792145, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003010620130226016, + "grad_norm": 7.348486423492432, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8668065071105957, + "num_tokens": 441876933.0, + "step": 11581 + }, + { + "epoch": 1.473349446635288, + "ewc_loss": 0.06408292800188065, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003039152070414275, + "grad_norm": 7.399938583374023, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8836492300033569, + "num_tokens": 441918015.0, + "step": 11582 + }, + { + "epoch": 1.4734766569138786, + "ewc_loss": 0.06400132924318314, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003030992520507425, + "grad_norm": 7.460001468658447, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8553173542022705, + "num_tokens": 441956573.0, + "step": 11583 + }, + { + "epoch": 1.4736038671924692, + "ewc_loss": 0.06398805975914001, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030296656768769026, + "grad_norm": 7.387228488922119, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8765501976013184, + "num_tokens": 442002064.0, + "step": 11584 + }, + { + "epoch": 1.4737310774710597, + "ewc_loss": 0.06412005424499512, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003042865137103945, + "grad_norm": 7.40467643737793, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8695474863052368, + "num_tokens": 442040122.0, + "step": 11585 + }, + { + "epoch": 1.4738582877496502, + "ewc_loss": 0.0640709400177002, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003037953283637762, + "grad_norm": 7.420121192932129, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8657538890838623, + "num_tokens": 442079698.0, + "step": 11586 + }, + { + "epoch": 1.4739854980282407, + "ewc_loss": 0.06420005857944489, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003050865198019892, + "grad_norm": 7.438636779785156, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8760271668434143, + "num_tokens": 442118252.0, + "step": 11587 + }, + { + "epoch": 1.4741127083068313, + "ewc_loss": 0.06407398730516434, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003038258000742644, + "grad_norm": 7.45702600479126, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8667049407958984, + "num_tokens": 442154675.0, + "step": 11588 + }, + { + "epoch": 1.4742399185854218, + "ewc_loss": 0.06418496370315552, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030493561644107103, + "grad_norm": 7.438055038452148, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8693732023239136, + "num_tokens": 442189449.0, + "step": 11589 + }, + { + "epoch": 1.4743671288640123, + "ewc_loss": 0.06417129933834076, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030479897395707667, + "grad_norm": 7.443310737609863, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8660417795181274, + "num_tokens": 442222844.0, + "step": 11590 + }, + { + "epoch": 1.4744943391426026, + "ewc_loss": 0.0640956237912178, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003040421870537102, + "grad_norm": 7.473505020141602, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8701404929161072, + "num_tokens": 442258599.0, + "step": 11591 + }, + { + "epoch": 1.4746215494211932, + "ewc_loss": 0.0641230046749115, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030431602499447763, + "grad_norm": 7.464414119720459, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8711627125740051, + "num_tokens": 442294775.0, + "step": 11592 + }, + { + "epoch": 1.4747487596997837, + "ewc_loss": 0.06404102593660355, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030349617009051144, + "grad_norm": 7.455103874206543, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8740907311439514, + "num_tokens": 442330313.0, + "step": 11593 + }, + { + "epoch": 1.4748759699783742, + "ewc_loss": 0.06396995484828949, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003027855127584189, + "grad_norm": 7.488544464111328, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8632543683052063, + "num_tokens": 442364988.0, + "step": 11594 + }, + { + "epoch": 1.4750031802569648, + "ewc_loss": 0.06393194943666458, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000302405416732654, + "grad_norm": 7.349071979522705, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.864649772644043, + "num_tokens": 442406927.0, + "step": 11595 + }, + { + "epoch": 1.4751303905355553, + "ewc_loss": 0.0642395094037056, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003054810222238302, + "grad_norm": 7.489597320556641, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8674913644790649, + "num_tokens": 442442092.0, + "step": 11596 + }, + { + "epoch": 1.4752576008141458, + "ewc_loss": 0.06389036029577255, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003019895520992577, + "grad_norm": 7.349883079528809, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8711753487586975, + "num_tokens": 442483013.0, + "step": 11597 + }, + { + "epoch": 1.4753848110927363, + "ewc_loss": 0.06425788253545761, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030566478380933404, + "grad_norm": 7.431605815887451, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8684951066970825, + "num_tokens": 442525052.0, + "step": 11598 + }, + { + "epoch": 1.4755120213713269, + "ewc_loss": 0.06401613354682922, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000303247245028615, + "grad_norm": 7.451920509338379, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8549505472183228, + "num_tokens": 442562543.0, + "step": 11599 + }, + { + "epoch": 1.4756392316499174, + "ewc_loss": 0.0641179233789444, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003042651806026697, + "grad_norm": 7.396225452423096, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.868753969669342, + "num_tokens": 442604976.0, + "step": 11600 + }, + { + "epoch": 1.4757664419285077, + "ewc_loss": 0.0640769675374031, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003038556023966521, + "grad_norm": 7.4274139404296875, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.875533938407898, + "num_tokens": 442642271.0, + "step": 11601 + }, + { + "epoch": 1.4758936522070982, + "ewc_loss": 0.0641368106007576, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003044540644623339, + "grad_norm": 7.411741256713867, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8596925735473633, + "num_tokens": 442680508.0, + "step": 11602 + }, + { + "epoch": 1.4760208624856888, + "ewc_loss": 0.06425880640745163, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003056740097235888, + "grad_norm": 7.443985462188721, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8538845777511597, + "num_tokens": 442718391.0, + "step": 11603 + }, + { + "epoch": 1.4761480727642793, + "ewc_loss": 0.06416234374046326, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003047094214707613, + "grad_norm": 7.385406494140625, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8729097843170166, + "num_tokens": 442757688.0, + "step": 11604 + }, + { + "epoch": 1.4762752830428698, + "ewc_loss": 0.06428871303796768, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030597305158153176, + "grad_norm": 7.422085762023926, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8772697448730469, + "num_tokens": 442796704.0, + "step": 11605 + }, + { + "epoch": 1.4764024933214603, + "ewc_loss": 0.06417261809110641, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030481209978461266, + "grad_norm": 7.378553867340088, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8740422129631042, + "num_tokens": 442838791.0, + "step": 11606 + }, + { + "epoch": 1.4765297036000509, + "ewc_loss": 0.06433829665184021, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003064689226448536, + "grad_norm": 7.439642429351807, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8648899793624878, + "num_tokens": 442882368.0, + "step": 11607 + }, + { + "epoch": 1.4766569138786414, + "ewc_loss": 0.06420731544494629, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030515913385897875, + "grad_norm": 7.419285297393799, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.864909291267395, + "num_tokens": 442923768.0, + "step": 11608 + }, + { + "epoch": 1.476784124157232, + "ewc_loss": 0.06428774446249008, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030596336000598967, + "grad_norm": 7.460898399353027, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8781664967536926, + "num_tokens": 442955656.0, + "step": 11609 + }, + { + "epoch": 1.4769113344358225, + "ewc_loss": 0.064085453748703, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003039404400624335, + "grad_norm": 7.393549919128418, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8552706837654114, + "num_tokens": 442997323.0, + "step": 11610 + }, + { + "epoch": 1.477038544714413, + "ewc_loss": 0.06439562886953354, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030704220989719033, + "grad_norm": 7.526909828186035, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8504918813705444, + "num_tokens": 443033454.0, + "step": 11611 + }, + { + "epoch": 1.4771657549930035, + "ewc_loss": 0.06414005160331726, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003044864279218018, + "grad_norm": 7.3275933265686035, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8623613715171814, + "num_tokens": 443077902.0, + "step": 11612 + }, + { + "epoch": 1.477292965271594, + "ewc_loss": 0.06456831097602844, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030876905657351017, + "grad_norm": 7.532793998718262, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8553733825683594, + "num_tokens": 443116515.0, + "step": 11613 + }, + { + "epoch": 1.4774201755501846, + "ewc_loss": 0.06413369625806808, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003044229233637452, + "grad_norm": 7.348392009735107, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8897755146026611, + "num_tokens": 443154124.0, + "step": 11614 + }, + { + "epoch": 1.477547385828775, + "ewc_loss": 0.0645049512386322, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003081354661844671, + "grad_norm": 7.454212188720703, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.872465968132019, + "num_tokens": 443195852.0, + "step": 11615 + }, + { + "epoch": 1.4776745961073654, + "ewc_loss": 0.06414195895195007, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003045055200345814, + "grad_norm": 7.447691440582275, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.856648862361908, + "num_tokens": 443229850.0, + "step": 11616 + }, + { + "epoch": 1.477801806385956, + "ewc_loss": 0.0643143355846405, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003062293108087033, + "grad_norm": 7.46535587310791, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8566350936889648, + "num_tokens": 443274481.0, + "step": 11617 + }, + { + "epoch": 1.4779290166645465, + "ewc_loss": 0.06429542601108551, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000306040165014565, + "grad_norm": 7.459066867828369, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8701595067977905, + "num_tokens": 443311737.0, + "step": 11618 + }, + { + "epoch": 1.478056226943137, + "ewc_loss": 0.06435415893793106, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030662750941701233, + "grad_norm": 7.526067733764648, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8567267656326294, + "num_tokens": 443345310.0, + "step": 11619 + }, + { + "epoch": 1.4781834372217275, + "ewc_loss": 0.06421427428722382, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003052286629099399, + "grad_norm": 7.395465850830078, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8619417548179626, + "num_tokens": 443392869.0, + "step": 11620 + }, + { + "epoch": 1.478310647500318, + "ewc_loss": 0.06438452005386353, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000306931120576337, + "grad_norm": 7.4767560958862305, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8618649244308472, + "num_tokens": 443432842.0, + "step": 11621 + }, + { + "epoch": 1.4784378577789086, + "ewc_loss": 0.06419534981250763, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030503940070047975, + "grad_norm": 7.450343608856201, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8661198616027832, + "num_tokens": 443465954.0, + "step": 11622 + }, + { + "epoch": 1.478565068057499, + "ewc_loss": 0.06435105204582214, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030659648473374546, + "grad_norm": 7.444988250732422, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8543784022331238, + "num_tokens": 443508506.0, + "step": 11623 + }, + { + "epoch": 1.4786922783360896, + "ewc_loss": 0.06430288404226303, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003061147581320256, + "grad_norm": 7.477590560913086, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8621813654899597, + "num_tokens": 443543575.0, + "step": 11624 + }, + { + "epoch": 1.47881948861468, + "ewc_loss": 0.06411856412887573, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030427161254920065, + "grad_norm": 7.42799711227417, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8670708537101746, + "num_tokens": 443578504.0, + "step": 11625 + }, + { + "epoch": 1.4789466988932705, + "ewc_loss": 0.06428907066583633, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030597663135267794, + "grad_norm": 7.432806015014648, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8764544129371643, + "num_tokens": 443614647.0, + "step": 11626 + }, + { + "epoch": 1.479073909171861, + "ewc_loss": 0.06427270919084549, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003058130096178502, + "grad_norm": 7.485108375549316, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8602732419967651, + "num_tokens": 443649244.0, + "step": 11627 + }, + { + "epoch": 1.4792011194504515, + "ewc_loss": 0.06416399776935577, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030472592334263027, + "grad_norm": 7.503326416015625, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8600086569786072, + "num_tokens": 443688374.0, + "step": 11628 + }, + { + "epoch": 1.479328329729042, + "ewc_loss": 0.06417594105005264, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030484533635899425, + "grad_norm": 7.424494743347168, + "learning_rate": 1e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8336621522903442, + "num_tokens": 443730929.0, + "step": 11629 + }, + { + "epoch": 1.4794555400076326, + "ewc_loss": 0.06435507535934448, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030663664801977575, + "grad_norm": 7.538493633270264, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8569210767745972, + "num_tokens": 443763067.0, + "step": 11630 + }, + { + "epoch": 1.479582750286223, + "ewc_loss": 0.06404809653759003, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030356686329469085, + "grad_norm": 7.398253917694092, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8509021401405334, + "num_tokens": 443801273.0, + "step": 11631 + }, + { + "epoch": 1.4797099605648136, + "ewc_loss": 0.06433123350143433, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030639831675216556, + "grad_norm": 7.514237880706787, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8515816926956177, + "num_tokens": 443840394.0, + "step": 11632 + }, + { + "epoch": 1.4798371708434042, + "ewc_loss": 0.06400827318429947, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003031686937902123, + "grad_norm": 7.426289081573486, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8592064380645752, + "num_tokens": 443877486.0, + "step": 11633 + }, + { + "epoch": 1.4799643811219947, + "ewc_loss": 0.06428488343954086, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003059347509406507, + "grad_norm": 7.458352088928223, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8625417947769165, + "num_tokens": 443913763.0, + "step": 11634 + }, + { + "epoch": 1.4800915914005852, + "ewc_loss": 0.0640496164560318, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003035821137018502, + "grad_norm": 7.443525314331055, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.871002197265625, + "num_tokens": 443946840.0, + "step": 11635 + }, + { + "epoch": 1.4802188016791757, + "ewc_loss": 0.06426366418600082, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030572255491279066, + "grad_norm": 7.465948581695557, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8491173982620239, + "num_tokens": 443992439.0, + "step": 11636 + }, + { + "epoch": 1.4803460119577663, + "ewc_loss": 0.06411298364400864, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003042157622985542, + "grad_norm": 7.460813999176025, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8728774785995483, + "num_tokens": 444029348.0, + "step": 11637 + }, + { + "epoch": 1.4804732222363568, + "ewc_loss": 0.06416245549917221, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003047104983124882, + "grad_norm": 7.4318528175354, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8727686405181885, + "num_tokens": 444068386.0, + "step": 11638 + }, + { + "epoch": 1.4806004325149473, + "ewc_loss": 0.06420321762561798, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003051181265618652, + "grad_norm": 7.453031539916992, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.851620078086853, + "num_tokens": 444110307.0, + "step": 11639 + }, + { + "epoch": 1.4807276427935376, + "ewc_loss": 0.06414112448692322, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030449716723524034, + "grad_norm": 7.423044681549072, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8638228178024292, + "num_tokens": 444153919.0, + "step": 11640 + }, + { + "epoch": 1.4808548530721282, + "ewc_loss": 0.06419605016708374, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003050464147236198, + "grad_norm": 7.452581405639648, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8652117252349854, + "num_tokens": 444190575.0, + "step": 11641 + }, + { + "epoch": 1.4809820633507187, + "ewc_loss": 0.0641183853149414, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003042697790078819, + "grad_norm": 7.455392360687256, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8573023080825806, + "num_tokens": 444226108.0, + "step": 11642 + }, + { + "epoch": 1.4811092736293092, + "ewc_loss": 0.0640915185213089, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030400112154893577, + "grad_norm": 7.390682220458984, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.860196590423584, + "num_tokens": 444268162.0, + "step": 11643 + }, + { + "epoch": 1.4812364839078997, + "ewc_loss": 0.06424365937709808, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030552249518223107, + "grad_norm": 7.453001499176025, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8497036695480347, + "num_tokens": 444309273.0, + "step": 11644 + }, + { + "epoch": 1.4813636941864903, + "ewc_loss": 0.06408080458641052, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003038939612451941, + "grad_norm": 7.486263751983643, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8672525882720947, + "num_tokens": 444338697.0, + "step": 11645 + }, + { + "epoch": 1.4814909044650808, + "ewc_loss": 0.06409436464309692, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030402958509512246, + "grad_norm": 7.447445392608643, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8732333779335022, + "num_tokens": 444377954.0, + "step": 11646 + }, + { + "epoch": 1.4816181147436713, + "ewc_loss": 0.06411173939704895, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030420333496294916, + "grad_norm": 7.440768241882324, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8647553324699402, + "num_tokens": 444416996.0, + "step": 11647 + }, + { + "epoch": 1.4817453250222619, + "ewc_loss": 0.06415657699108124, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030465173767879605, + "grad_norm": 7.464804172515869, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8692946434020996, + "num_tokens": 444455550.0, + "step": 11648 + }, + { + "epoch": 1.4818725353008524, + "ewc_loss": 0.06416118890047073, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030469780904240906, + "grad_norm": 7.36911678314209, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8585876226425171, + "num_tokens": 444493979.0, + "step": 11649 + }, + { + "epoch": 1.4819997455794427, + "ewc_loss": 0.0642818808555603, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030590471578761935, + "grad_norm": 7.42609977722168, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8617860674858093, + "num_tokens": 444540622.0, + "step": 11650 + }, + { + "epoch": 1.4821269558580332, + "ewc_loss": 0.06424407660961151, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030552674434147775, + "grad_norm": 7.459381103515625, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8599966168403625, + "num_tokens": 444577772.0, + "step": 11651 + }, + { + "epoch": 1.4822541661366238, + "ewc_loss": 0.06427490711212158, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030583504121750593, + "grad_norm": 7.464709281921387, + "learning_rate": 1e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8500726222991943, + "num_tokens": 444615523.0, + "step": 11652 + }, + { + "epoch": 1.4823813764152143, + "ewc_loss": 0.06426453590393066, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030573131516575813, + "grad_norm": 7.47200345993042, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8618596792221069, + "num_tokens": 444659785.0, + "step": 11653 + }, + { + "epoch": 1.4825085866938048, + "ewc_loss": 0.06427875906229019, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030587351648136973, + "grad_norm": 7.449586868286133, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8585875034332275, + "num_tokens": 444700806.0, + "step": 11654 + }, + { + "epoch": 1.4826357969723953, + "ewc_loss": 0.06429200619459152, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003060059971176088, + "grad_norm": 13.590734481811523, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8645444512367249, + "num_tokens": 444734320.0, + "step": 11655 + }, + { + "epoch": 1.4827630072509859, + "ewc_loss": 0.07351532578468323, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003982391790486872, + "grad_norm": 8.569188117980957, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.857973039150238, + "num_tokens": 444770215.0, + "step": 11656 + }, + { + "epoch": 1.4828902175295764, + "ewc_loss": 0.06326213479042053, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002957072574645281, + "grad_norm": 7.218703269958496, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8620421886444092, + "num_tokens": 444805432.0, + "step": 11657 + }, + { + "epoch": 1.483017427808167, + "ewc_loss": 0.06604281812906265, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003235141048207879, + "grad_norm": 7.918240070343018, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8612003326416016, + "num_tokens": 444832398.0, + "step": 11658 + }, + { + "epoch": 1.4831446380867574, + "ewc_loss": 0.06477149575948715, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003108008822891861, + "grad_norm": 7.367869853973389, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8675839900970459, + "num_tokens": 444876128.0, + "step": 11659 + }, + { + "epoch": 1.483271848365348, + "ewc_loss": 0.06559328734874725, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031901884358376265, + "grad_norm": 7.838702201843262, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8576357364654541, + "num_tokens": 444915188.0, + "step": 11660 + }, + { + "epoch": 1.4833990586439385, + "ewc_loss": 0.0645752027630806, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003088379744440317, + "grad_norm": 7.398956298828125, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8675430417060852, + "num_tokens": 444953528.0, + "step": 11661 + }, + { + "epoch": 1.483526268922529, + "ewc_loss": 0.06533794105052948, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031646533170714974, + "grad_norm": 7.665192604064941, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8567386865615845, + "num_tokens": 444996469.0, + "step": 11662 + }, + { + "epoch": 1.4836534792011196, + "ewc_loss": 0.06465353071689606, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030962127493694425, + "grad_norm": 7.471138954162598, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8793352246284485, + "num_tokens": 445034298.0, + "step": 11663 + }, + { + "epoch": 1.48378068947971, + "ewc_loss": 0.06493998318910599, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031248576124198735, + "grad_norm": 7.579777717590332, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8619597554206848, + "num_tokens": 445079010.0, + "step": 11664 + }, + { + "epoch": 1.4839078997583004, + "ewc_loss": 0.06456947326660156, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003087806690018624, + "grad_norm": 7.518558025360107, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8763311505317688, + "num_tokens": 445116009.0, + "step": 11665 + }, + { + "epoch": 1.484035110036891, + "ewc_loss": 0.064552903175354, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003086149808950722, + "grad_norm": 7.4614763259887695, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8770138025283813, + "num_tokens": 445156313.0, + "step": 11666 + }, + { + "epoch": 1.4841623203154815, + "ewc_loss": 0.06458809971809387, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003089669335167855, + "grad_norm": 7.562097549438477, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.863444983959198, + "num_tokens": 445187424.0, + "step": 11667 + }, + { + "epoch": 1.484289530594072, + "ewc_loss": 0.06428514420986176, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003059373702853918, + "grad_norm": 7.4277167320251465, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8635990619659424, + "num_tokens": 445233448.0, + "step": 11668 + }, + { + "epoch": 1.4844167408726625, + "ewc_loss": 0.06465719640254974, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003096578875556588, + "grad_norm": 7.565685749053955, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.873088002204895, + "num_tokens": 445273663.0, + "step": 11669 + }, + { + "epoch": 1.484543951151253, + "ewc_loss": 0.06416389346122742, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030472484650090337, + "grad_norm": 7.397278308868408, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8685121536254883, + "num_tokens": 445313790.0, + "step": 11670 + }, + { + "epoch": 1.4846711614298436, + "ewc_loss": 0.06456626206636429, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030874856747686863, + "grad_norm": 7.551301956176758, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8736958503723145, + "num_tokens": 445351818.0, + "step": 11671 + }, + { + "epoch": 1.484798371708434, + "ewc_loss": 0.06432414799928665, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030632741982117295, + "grad_norm": 7.523025035858154, + "learning_rate": 1e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8388475775718689, + "num_tokens": 445391763.0, + "step": 11672 + }, + { + "epoch": 1.4849255819870246, + "ewc_loss": 0.06439617276191711, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003070476232096553, + "grad_norm": 7.538804054260254, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8570736646652222, + "num_tokens": 445426723.0, + "step": 11673 + }, + { + "epoch": 1.485052792265615, + "ewc_loss": 0.06420768052339554, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003051627427339554, + "grad_norm": 7.419549465179443, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8680188059806824, + "num_tokens": 445465635.0, + "step": 11674 + }, + { + "epoch": 1.4851800025442055, + "ewc_loss": 0.06446381658315659, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030772408354096115, + "grad_norm": 7.543592929840088, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8633636236190796, + "num_tokens": 445502819.0, + "step": 11675 + }, + { + "epoch": 1.485307212822796, + "ewc_loss": 0.06433096528053284, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003063956100959331, + "grad_norm": 7.474105358123779, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8728742599487305, + "num_tokens": 445541861.0, + "step": 11676 + }, + { + "epoch": 1.4854344231013865, + "ewc_loss": 0.06447215378284454, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030780749511905015, + "grad_norm": 7.564014911651611, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8778296709060669, + "num_tokens": 445578058.0, + "step": 11677 + }, + { + "epoch": 1.485561633379977, + "ewc_loss": 0.0641234964132309, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003043208853341639, + "grad_norm": 7.519034385681152, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8491125106811523, + "num_tokens": 445612896.0, + "step": 11678 + }, + { + "epoch": 1.4856888436585676, + "ewc_loss": 0.06431050598621368, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030619101016782224, + "grad_norm": 7.516653537750244, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8566162586212158, + "num_tokens": 445647849.0, + "step": 11679 + }, + { + "epoch": 1.485816053937158, + "ewc_loss": 0.06433454900979996, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030643140780739486, + "grad_norm": 7.509227275848389, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8486055135726929, + "num_tokens": 445689199.0, + "step": 11680 + }, + { + "epoch": 1.4859432642157486, + "ewc_loss": 0.06429782509803772, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003060641756746918, + "grad_norm": 7.4690775871276855, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8620364665985107, + "num_tokens": 445727869.0, + "step": 11681 + }, + { + "epoch": 1.4860704744943392, + "ewc_loss": 0.06436251848936081, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030671112472191453, + "grad_norm": 7.490063667297363, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8676937818527222, + "num_tokens": 445770105.0, + "step": 11682 + }, + { + "epoch": 1.4861976847729297, + "ewc_loss": 0.06434774398803711, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003065634227823466, + "grad_norm": 7.486754417419434, + "learning_rate": 1e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8421509265899658, + "num_tokens": 445812950.0, + "step": 11683 + }, + { + "epoch": 1.4863248950515202, + "ewc_loss": 0.06440696120262146, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003071555111091584, + "grad_norm": 7.476124286651611, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8618210554122925, + "num_tokens": 445850811.0, + "step": 11684 + }, + { + "epoch": 1.4864521053301107, + "ewc_loss": 0.06436116993427277, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030669764964841306, + "grad_norm": 7.47484827041626, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8658854961395264, + "num_tokens": 445891967.0, + "step": 11685 + }, + { + "epoch": 1.4865793156087013, + "ewc_loss": 0.06429651379585266, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030605107895098627, + "grad_norm": 7.435512542724609, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8645920157432556, + "num_tokens": 445927936.0, + "step": 11686 + }, + { + "epoch": 1.4867065258872918, + "ewc_loss": 0.06442078948020935, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030729384161531925, + "grad_norm": 7.51920223236084, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8765341639518738, + "num_tokens": 445962481.0, + "step": 11687 + }, + { + "epoch": 1.4868337361658823, + "ewc_loss": 0.06437395513057709, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030682553187943995, + "grad_norm": 7.470667362213135, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8769817352294922, + "num_tokens": 446001445.0, + "step": 11688 + }, + { + "epoch": 1.4869609464444726, + "ewc_loss": 0.06441879272460938, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003072738472837955, + "grad_norm": 7.569092750549316, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8833501935005188, + "num_tokens": 446037904.0, + "step": 11689 + }, + { + "epoch": 1.4870881567230632, + "ewc_loss": 0.0642903745174408, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030598972807638347, + "grad_norm": 7.4768900871276855, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8720218539237976, + "num_tokens": 446077583.0, + "step": 11690 + }, + { + "epoch": 1.4872153670016537, + "ewc_loss": 0.06434927880764008, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003065787022933364, + "grad_norm": 7.5644001960754395, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8615759611129761, + "num_tokens": 446110123.0, + "step": 11691 + }, + { + "epoch": 1.4873425772802442, + "ewc_loss": 0.06417715549468994, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003048574726562947, + "grad_norm": 7.426579475402832, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8619657158851624, + "num_tokens": 446149832.0, + "step": 11692 + }, + { + "epoch": 1.4874697875588347, + "ewc_loss": 0.06447993963956833, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030788531876169145, + "grad_norm": 7.6038923263549805, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.870343804359436, + "num_tokens": 446184211.0, + "step": 11693 + }, + { + "epoch": 1.4875969978374253, + "ewc_loss": 0.06410252302885056, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030411119223572314, + "grad_norm": 7.430154800415039, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8571021556854248, + "num_tokens": 446229620.0, + "step": 11694 + }, + { + "epoch": 1.4877242081160158, + "ewc_loss": 0.06447779387235641, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030786386923864484, + "grad_norm": 7.59934663772583, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.865484893321991, + "num_tokens": 446267134.0, + "step": 11695 + }, + { + "epoch": 1.4878514183946063, + "ewc_loss": 0.06406792998313904, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003037652641069144, + "grad_norm": 7.420995235443115, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8496658802032471, + "num_tokens": 446303001.0, + "step": 11696 + }, + { + "epoch": 1.4879786286731969, + "ewc_loss": 0.06448830664157867, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000307969021378085, + "grad_norm": 7.610105991363525, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8622220754623413, + "num_tokens": 446334286.0, + "step": 11697 + }, + { + "epoch": 1.4881058389517874, + "ewc_loss": 0.06417745351791382, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030486047035083175, + "grad_norm": 7.4448561668396, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8823144435882568, + "num_tokens": 446369547.0, + "step": 11698 + }, + { + "epoch": 1.4882330492303777, + "ewc_loss": 0.06448373198509216, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003079232119489461, + "grad_norm": 7.601223468780518, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8485638499259949, + "num_tokens": 446408179.0, + "step": 11699 + }, + { + "epoch": 1.4883602595089682, + "ewc_loss": 0.06417344510555267, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003048204234801233, + "grad_norm": 7.422682285308838, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8654654026031494, + "num_tokens": 446448533.0, + "step": 11700 + }, + { + "epoch": 1.4884874697875587, + "ewc_loss": 0.06443887203931808, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030747466371394694, + "grad_norm": 7.515337944030762, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8611131310462952, + "num_tokens": 446489445.0, + "step": 11701 + }, + { + "epoch": 1.4886146800661493, + "ewc_loss": 0.06425105035305023, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003055964771192521, + "grad_norm": 7.517306804656982, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8772758841514587, + "num_tokens": 446526530.0, + "step": 11702 + }, + { + "epoch": 1.4887418903447398, + "ewc_loss": 0.06436370313167572, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030672294087707996, + "grad_norm": 7.618658542633057, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8641225099563599, + "num_tokens": 446558315.0, + "step": 11703 + }, + { + "epoch": 1.4888691006233303, + "ewc_loss": 0.06416983157396317, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000304784276522696, + "grad_norm": 7.470456600189209, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8660472631454468, + "num_tokens": 446598548.0, + "step": 11704 + }, + { + "epoch": 1.4889963109019209, + "ewc_loss": 0.06431373953819275, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030622328631579876, + "grad_norm": 7.501898288726807, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8593675494194031, + "num_tokens": 446632111.0, + "step": 11705 + }, + { + "epoch": 1.4891235211805114, + "ewc_loss": 0.06427902728319168, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030587619403377175, + "grad_norm": 7.503636360168457, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8714120388031006, + "num_tokens": 446669296.0, + "step": 11706 + }, + { + "epoch": 1.489250731459102, + "ewc_loss": 0.06422312557697296, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003053172258660197, + "grad_norm": 7.509068965911865, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.865912675857544, + "num_tokens": 446707891.0, + "step": 11707 + }, + { + "epoch": 1.4893779417376924, + "ewc_loss": 0.06429532915353775, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003060392336919904, + "grad_norm": 7.5062971115112305, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8516314625740051, + "num_tokens": 446749455.0, + "step": 11708 + }, + { + "epoch": 1.489505152016283, + "ewc_loss": 0.06416021287441254, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003046880301553756, + "grad_norm": 7.48369026184082, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8763483762741089, + "num_tokens": 446787001.0, + "step": 11709 + }, + { + "epoch": 1.4896323622948735, + "ewc_loss": 0.0641232505440712, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030431844061240554, + "grad_norm": 7.492515563964844, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8616260886192322, + "num_tokens": 446827262.0, + "step": 11710 + }, + { + "epoch": 1.489759572573464, + "ewc_loss": 0.06427706778049469, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030585660715587437, + "grad_norm": 7.558750152587891, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8785596489906311, + "num_tokens": 446860514.0, + "step": 11711 + }, + { + "epoch": 1.4898867828520546, + "ewc_loss": 0.0640668272972107, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003037542337551713, + "grad_norm": 7.484179973602295, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8545857667922974, + "num_tokens": 446902605.0, + "step": 11712 + }, + { + "epoch": 1.490013993130645, + "ewc_loss": 0.06424366682767868, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003055226115975529, + "grad_norm": 7.54585599899292, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8695589303970337, + "num_tokens": 446938638.0, + "step": 11713 + }, + { + "epoch": 1.4901412034092354, + "ewc_loss": 0.06405683606863022, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030365432030521333, + "grad_norm": 7.461092948913574, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8688604831695557, + "num_tokens": 446979286.0, + "step": 11714 + }, + { + "epoch": 1.490268413687826, + "ewc_loss": 0.06431408226490021, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003062267496716231, + "grad_norm": 7.610098838806152, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8474853038787842, + "num_tokens": 447017405.0, + "step": 11715 + }, + { + "epoch": 1.4903956239664164, + "ewc_loss": 0.06398503482341766, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003029363288078457, + "grad_norm": 7.384559631347656, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8666126728057861, + "num_tokens": 447056733.0, + "step": 11716 + }, + { + "epoch": 1.490522834245007, + "ewc_loss": 0.06439166516065598, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030700257048010826, + "grad_norm": 7.524792671203613, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8572250604629517, + "num_tokens": 447094493.0, + "step": 11717 + }, + { + "epoch": 1.4906500445235975, + "ewc_loss": 0.06401512026786804, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030323717510327697, + "grad_norm": 7.383023738861084, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8790204524993896, + "num_tokens": 447134039.0, + "step": 11718 + }, + { + "epoch": 1.490777254802188, + "ewc_loss": 0.06454277038574219, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030851367046125233, + "grad_norm": 7.597083568572998, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8710319995880127, + "num_tokens": 447168320.0, + "step": 11719 + }, + { + "epoch": 1.4909044650807786, + "ewc_loss": 0.06396954506635666, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030278138001449406, + "grad_norm": 7.450235366821289, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8480187058448792, + "num_tokens": 447205640.0, + "step": 11720 + }, + { + "epoch": 1.491031675359369, + "ewc_loss": 0.06448866426944733, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003079725429415703, + "grad_norm": 7.560320854187012, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8762097358703613, + "num_tokens": 447239831.0, + "step": 11721 + }, + { + "epoch": 1.4911588856379596, + "ewc_loss": 0.06412136554718018, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003042995813302696, + "grad_norm": 7.427628517150879, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.848940372467041, + "num_tokens": 447281100.0, + "step": 11722 + }, + { + "epoch": 1.49128609591655, + "ewc_loss": 0.06436746567487717, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000306760601233691, + "grad_norm": 7.5673346519470215, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.844565212726593, + "num_tokens": 447316137.0, + "step": 11723 + }, + { + "epoch": 1.4914133061951405, + "ewc_loss": 0.06412571668624878, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030434306245297194, + "grad_norm": 7.48764705657959, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8568621277809143, + "num_tokens": 447357922.0, + "step": 11724 + }, + { + "epoch": 1.491540516473731, + "ewc_loss": 0.06427174061536789, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003058033180423081, + "grad_norm": 7.454814434051514, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8597599267959595, + "num_tokens": 447394905.0, + "step": 11725 + }, + { + "epoch": 1.4916677267523215, + "ewc_loss": 0.0643388107419014, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000306474044919014, + "grad_norm": 7.565749168395996, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8753005266189575, + "num_tokens": 447435379.0, + "step": 11726 + }, + { + "epoch": 1.491794937030912, + "ewc_loss": 0.06408916413784027, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030397760565392673, + "grad_norm": 7.520479202270508, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8543778657913208, + "num_tokens": 447473058.0, + "step": 11727 + }, + { + "epoch": 1.4919221473095026, + "ewc_loss": 0.06420853734016418, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003051713574677706, + "grad_norm": 7.488845348358154, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8730729818344116, + "num_tokens": 447510966.0, + "step": 11728 + }, + { + "epoch": 1.492049357588093, + "ewc_loss": 0.06419771164655685, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003050630330108106, + "grad_norm": 7.722265720367432, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8667060732841492, + "num_tokens": 447554747.0, + "step": 11729 + }, + { + "epoch": 1.4921765678666836, + "ewc_loss": 0.06406654417514801, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000303751410683617, + "grad_norm": 7.407375812530518, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8663719892501831, + "num_tokens": 447594426.0, + "step": 11730 + }, + { + "epoch": 1.4923037781452742, + "ewc_loss": 0.0644756630063057, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003078425652347505, + "grad_norm": 7.569219589233398, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8566845059394836, + "num_tokens": 447634786.0, + "step": 11731 + }, + { + "epoch": 1.4924309884238647, + "ewc_loss": 0.06400080025196075, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030309389694593847, + "grad_norm": 7.441869735717773, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.860103964805603, + "num_tokens": 447668682.0, + "step": 11732 + }, + { + "epoch": 1.4925581987024552, + "ewc_loss": 0.06450890749692917, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003081749891862273, + "grad_norm": 7.609477519989014, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8536947965621948, + "num_tokens": 447705566.0, + "step": 11733 + }, + { + "epoch": 1.4926854089810457, + "ewc_loss": 0.06402865797281265, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003033725079149008, + "grad_norm": 7.460002899169922, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8602443933486938, + "num_tokens": 447746385.0, + "step": 11734 + }, + { + "epoch": 1.4928126192596363, + "ewc_loss": 0.06442715227603912, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030735749169252813, + "grad_norm": 7.5425004959106445, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8676310777664185, + "num_tokens": 447790483.0, + "step": 11735 + }, + { + "epoch": 1.4929398295382268, + "ewc_loss": 0.06409905850887299, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003040765586774796, + "grad_norm": 7.49125862121582, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8615098595619202, + "num_tokens": 447826872.0, + "step": 11736 + }, + { + "epoch": 1.4930670398168173, + "ewc_loss": 0.06425438821315765, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003056298301089555, + "grad_norm": 7.485456943511963, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.869594931602478, + "num_tokens": 447864071.0, + "step": 11737 + }, + { + "epoch": 1.4931942500954076, + "ewc_loss": 0.06469589471817017, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003051620733458549, + "grad_norm": 7.499019622802734, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8465813994407654, + "num_tokens": 447903320.0, + "step": 11738 + }, + { + "epoch": 1.4933214603739982, + "ewc_loss": 0.06414595991373062, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030454553780145943, + "grad_norm": 7.449389457702637, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8775650262832642, + "num_tokens": 447940253.0, + "step": 11739 + }, + { + "epoch": 1.4934486706525887, + "ewc_loss": 0.06439727544784546, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003070587117690593, + "grad_norm": 7.503756999969482, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8684867024421692, + "num_tokens": 447978277.0, + "step": 11740 + }, + { + "epoch": 1.4935758809311792, + "ewc_loss": 0.06474132090806961, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003056163259316236, + "grad_norm": 7.471584320068359, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8529230356216431, + "num_tokens": 448018798.0, + "step": 11741 + }, + { + "epoch": 1.4937030912097697, + "ewc_loss": 0.06428778171539307, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030596370925195515, + "grad_norm": 7.472772598266602, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.851852536201477, + "num_tokens": 448057469.0, + "step": 11742 + }, + { + "epoch": 1.4938303014883603, + "ewc_loss": 0.06482859700918198, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003064890915993601, + "grad_norm": 7.4803571701049805, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.850119948387146, + "num_tokens": 448089032.0, + "step": 11743 + }, + { + "epoch": 1.4939575117669508, + "ewc_loss": 0.06428306549787521, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030591661925427616, + "grad_norm": 7.503457546234131, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8587936758995056, + "num_tokens": 448122438.0, + "step": 11744 + }, + { + "epoch": 1.4940847220455413, + "ewc_loss": 0.0642264112830162, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030535002588294446, + "grad_norm": 7.475359916687012, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8630263209342957, + "num_tokens": 448166844.0, + "step": 11745 + }, + { + "epoch": 1.4942119323241319, + "ewc_loss": 0.06437325477600098, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030681845964863896, + "grad_norm": 7.5108561515808105, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.854495644569397, + "num_tokens": 448197766.0, + "step": 11746 + }, + { + "epoch": 1.4943391426027224, + "ewc_loss": 0.06428447365760803, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003059306473005563, + "grad_norm": 7.450869083404541, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8510739803314209, + "num_tokens": 448240850.0, + "step": 11747 + }, + { + "epoch": 1.4944663528813127, + "ewc_loss": 0.06438747048377991, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030696060275658965, + "grad_norm": 7.523408889770508, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8722628355026245, + "num_tokens": 448274187.0, + "step": 11748 + }, + { + "epoch": 1.4945935631599032, + "ewc_loss": 0.06414151191711426, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030450106714852154, + "grad_norm": 7.427772521972656, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8690462708473206, + "num_tokens": 448311069.0, + "step": 11749 + }, + { + "epoch": 1.4947207734384937, + "ewc_loss": 0.06440244615077972, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030711037106812, + "grad_norm": 7.49580192565918, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8625693321228027, + "num_tokens": 448351536.0, + "step": 11750 + }, + { + "epoch": 1.4948479837170843, + "ewc_loss": 0.06422193348407745, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030530529329553246, + "grad_norm": 7.486697673797607, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8608317375183105, + "num_tokens": 448385108.0, + "step": 11751 + }, + { + "epoch": 1.4949751939956748, + "ewc_loss": 0.06440156698226929, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030710158171132207, + "grad_norm": 7.484540939331055, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8741660118103027, + "num_tokens": 448423272.0, + "step": 11752 + }, + { + "epoch": 1.4951024042742653, + "ewc_loss": 0.0642341747879982, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030542767490260303, + "grad_norm": 7.465124130249023, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8673133850097656, + "num_tokens": 448462904.0, + "step": 11753 + }, + { + "epoch": 1.4952296145528559, + "ewc_loss": 0.06432554870843887, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030634141876362264, + "grad_norm": 7.47992467880249, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8598354458808899, + "num_tokens": 448503257.0, + "step": 11754 + }, + { + "epoch": 1.4953568248314464, + "ewc_loss": 0.06430695950984955, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000306155503494665, + "grad_norm": 7.504186153411865, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8677510023117065, + "num_tokens": 448535299.0, + "step": 11755 + }, + { + "epoch": 1.495484035110037, + "ewc_loss": 0.06419491767883301, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030503509333357215, + "grad_norm": 7.4467620849609375, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8699817657470703, + "num_tokens": 448568878.0, + "step": 11756 + }, + { + "epoch": 1.4956112453886274, + "ewc_loss": 0.06427109986543655, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030579694430343807, + "grad_norm": 7.463394641876221, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8810900449752808, + "num_tokens": 448604959.0, + "step": 11757 + }, + { + "epoch": 1.495738455667218, + "ewc_loss": 0.06465837359428406, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00030478686676360667, + "grad_norm": 7.437348365783691, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.875582218170166, + "num_tokens": 448638487.0, + "step": 11758 + }, + { + "epoch": 1.4958656659458085, + "ewc_loss": 0.0642881914973259, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030596784199588, + "grad_norm": 7.436023235321045, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8691625595092773, + "num_tokens": 448677212.0, + "step": 11759 + }, + { + "epoch": 1.495992876224399, + "ewc_loss": 0.06447093188762665, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030535386758856475, + "grad_norm": 7.4545063972473145, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8580817580223083, + "num_tokens": 448719654.0, + "step": 11760 + }, + { + "epoch": 1.4961200865029896, + "ewc_loss": 0.06433165818452835, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003064025077037513, + "grad_norm": 7.493783473968506, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8673035502433777, + "num_tokens": 448760783.0, + "step": 11761 + }, + { + "epoch": 1.49624729678158, + "ewc_loss": 0.06445159763097763, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003051605017390102, + "grad_norm": 7.498896598815918, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8529885411262512, + "num_tokens": 448802843.0, + "step": 11762 + }, + { + "epoch": 1.4963745070601704, + "ewc_loss": 0.06418899446725845, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003049758670385927, + "grad_norm": 7.469247341156006, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8606933355331421, + "num_tokens": 448838430.0, + "step": 11763 + }, + { + "epoch": 1.496501717338761, + "ewc_loss": 0.06426314264535904, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030571737443096936, + "grad_norm": 7.4423065185546875, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8550459742546082, + "num_tokens": 448880450.0, + "step": 11764 + }, + { + "epoch": 1.4966289276173514, + "ewc_loss": 0.06414802372455597, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003045662015210837, + "grad_norm": 7.443230152130127, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8643522262573242, + "num_tokens": 448919352.0, + "step": 11765 + }, + { + "epoch": 1.496756137895942, + "ewc_loss": 0.06421251595020294, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030521105509251356, + "grad_norm": 7.462029933929443, + "learning_rate": 1e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8387023210525513, + "num_tokens": 448965402.0, + "step": 11766 + }, + { + "epoch": 1.4968833481745325, + "ewc_loss": 0.06423051655292511, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030539106228388846, + "grad_norm": 7.52898645401001, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8753376007080078, + "num_tokens": 448996983.0, + "step": 11767 + }, + { + "epoch": 1.497010558453123, + "ewc_loss": 0.06399880349636078, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030307398992590606, + "grad_norm": 7.428604602813721, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8499688506126404, + "num_tokens": 449036555.0, + "step": 11768 + }, + { + "epoch": 1.4971377687317136, + "ewc_loss": 0.06422996520996094, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030538556165993214, + "grad_norm": 7.477609634399414, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8747061491012573, + "num_tokens": 449070545.0, + "step": 11769 + }, + { + "epoch": 1.497264979010304, + "ewc_loss": 0.06399752199649811, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003030611842405051, + "grad_norm": 7.488076210021973, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8480359315872192, + "num_tokens": 449108859.0, + "step": 11770 + }, + { + "epoch": 1.4973921892888946, + "ewc_loss": 0.06415525823831558, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003046385245397687, + "grad_norm": 7.439522743225098, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8614802360534668, + "num_tokens": 449145333.0, + "step": 11771 + }, + { + "epoch": 1.497519399567485, + "ewc_loss": 0.06618183106184006, + "ewc_loss_diag": 3.5762786865234375e-05, + "ewc_loss_parallel": 0.00030537298880517483, + "grad_norm": 53.763648986816406, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8567928671836853, + "num_tokens": 449181292.0, + "step": 11772 + }, + { + "epoch": 1.4976466098460754, + "ewc_loss": 0.10345953702926636, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0006927985232323408, + "grad_norm": 11.982598304748535, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8655176162719727, + "num_tokens": 449213940.0, + "step": 11773 + }, + { + "epoch": 1.497773820124666, + "ewc_loss": 0.06555366516113281, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031373981619253755, + "grad_norm": 6.579954147338867, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8756304979324341, + "num_tokens": 449252487.0, + "step": 11774 + }, + { + "epoch": 1.4979010304032565, + "ewc_loss": 0.08685274422168732, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000529172015376389, + "grad_norm": 10.9874849319458, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8553967475891113, + "num_tokens": 449287268.0, + "step": 11775 + }, + { + "epoch": 1.498028240681847, + "ewc_loss": 0.09290103614330292, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000589654897339642, + "grad_norm": 10.866804122924805, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8591697812080383, + "num_tokens": 449326975.0, + "step": 11776 + }, + { + "epoch": 1.4981554509604376, + "ewc_loss": 0.07293280214071274, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00038997255614958704, + "grad_norm": 7.927844524383545, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8625505566596985, + "num_tokens": 449366061.0, + "step": 11777 + }, + { + "epoch": 1.498282661239028, + "ewc_loss": 0.07591050863265991, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00041974964551627636, + "grad_norm": 9.483935356140137, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8718080520629883, + "num_tokens": 449401763.0, + "step": 11778 + }, + { + "epoch": 1.4984098715176186, + "ewc_loss": 0.07839791476726532, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00044462367077358067, + "grad_norm": 9.014232635498047, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8827142119407654, + "num_tokens": 449436605.0, + "step": 11779 + }, + { + "epoch": 1.4985370817962091, + "ewc_loss": 0.07078403234481812, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000368484907085076, + "grad_norm": 8.229778289794922, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8589392900466919, + "num_tokens": 449474523.0, + "step": 11780 + }, + { + "epoch": 1.4986642920747997, + "ewc_loss": 0.07138201594352722, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00037690604222007096, + "grad_norm": 8.57394790649414, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8751348257064819, + "num_tokens": 449513301.0, + "step": 11781 + }, + { + "epoch": 1.4987915023533902, + "ewc_loss": 0.07046794891357422, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00036776537308469415, + "grad_norm": 8.154703140258789, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8595806360244751, + "num_tokens": 449547826.0, + "step": 11782 + }, + { + "epoch": 1.4989187126319807, + "ewc_loss": 0.06860776245594025, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003491635143291205, + "grad_norm": 8.200998306274414, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8632775545120239, + "num_tokens": 449584818.0, + "step": 11783 + }, + { + "epoch": 1.4990459229105713, + "ewc_loss": 0.06835348159074783, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003466207708697766, + "grad_norm": 8.028292655944824, + "learning_rate": 1e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8448143005371094, + "num_tokens": 449625368.0, + "step": 11784 + }, + { + "epoch": 1.4991731331891618, + "ewc_loss": 0.06747332215309143, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003378191904630512, + "grad_norm": 7.901802062988281, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8498398065567017, + "num_tokens": 449666320.0, + "step": 11785 + }, + { + "epoch": 1.4993003434677523, + "ewc_loss": 0.0668555349111557, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003316412621643394, + "grad_norm": 7.865877151489258, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8559095859527588, + "num_tokens": 449705993.0, + "step": 11786 + }, + { + "epoch": 1.4994275537463426, + "ewc_loss": 0.06661766022443771, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00032926254789344966, + "grad_norm": 7.823390007019043, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.857385516166687, + "num_tokens": 449747088.0, + "step": 11787 + }, + { + "epoch": 1.4995547640249332, + "ewc_loss": 0.06603986024856567, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003234845935367048, + "grad_norm": 7.716947555541992, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8648093938827515, + "num_tokens": 449789840.0, + "step": 11788 + }, + { + "epoch": 1.4996819743035237, + "ewc_loss": 0.06592310965061188, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00032231700606644154, + "grad_norm": 7.683957576751709, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8621530532836914, + "num_tokens": 449830599.0, + "step": 11789 + }, + { + "epoch": 1.4998091845821142, + "ewc_loss": 0.06560114026069641, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000319097307510674, + "grad_norm": 7.760722637176514, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8682273626327515, + "num_tokens": 449864476.0, + "step": 11790 + }, + { + "epoch": 1.4999363948607047, + "ewc_loss": 0.0654253214597702, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031733911600895226, + "grad_norm": 7.671748638153076, + "learning_rate": 1e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8463101387023926, + "num_tokens": 449903791.0, + "step": 11791 + }, + { + "epoch": 1.5000636051392953, + "ewc_loss": 0.06521233916282654, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003152092976961285, + "grad_norm": 7.65053129196167, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8629820346832275, + "num_tokens": 449945478.0, + "step": 11792 + }, + { + "epoch": 1.5001908154178858, + "ewc_loss": 0.06516893208026886, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003147752140648663, + "grad_norm": 7.61509895324707, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8603803515434265, + "num_tokens": 449986175.0, + "step": 11793 + }, + { + "epoch": 1.5003180256964763, + "ewc_loss": 0.06490752846002579, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031216119532473385, + "grad_norm": 7.624800682067871, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8591530323028564, + "num_tokens": 450025537.0, + "step": 11794 + }, + { + "epoch": 1.5004452359750666, + "ewc_loss": 0.06487014889717102, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031178747303783894, + "grad_norm": 7.589945316314697, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8493199944496155, + "num_tokens": 450068269.0, + "step": 11795 + }, + { + "epoch": 1.5005724462536572, + "ewc_loss": 0.06467380374670029, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030982395401224494, + "grad_norm": 7.769924163818359, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8790817856788635, + "num_tokens": 450106234.0, + "step": 11796 + }, + { + "epoch": 1.5006996565322477, + "ewc_loss": 0.0644739419221878, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030782539397478104, + "grad_norm": 7.496916770935059, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8727291822433472, + "num_tokens": 450141916.0, + "step": 11797 + }, + { + "epoch": 1.5008268668108382, + "ewc_loss": 0.06484527885913849, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003115387517027557, + "grad_norm": 7.746747016906738, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8476513624191284, + "num_tokens": 450175427.0, + "step": 11798 + }, + { + "epoch": 1.5009540770894287, + "ewc_loss": 0.06427952647209167, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030588117078877985, + "grad_norm": 7.497658729553223, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8656471371650696, + "num_tokens": 450211104.0, + "step": 11799 + }, + { + "epoch": 1.5010812873680193, + "ewc_loss": 0.06483089923858643, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003113949205726385, + "grad_norm": 7.684060096740723, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.855800449848175, + "num_tokens": 450244844.0, + "step": 11800 + }, + { + "epoch": 1.5012084976466098, + "ewc_loss": 0.06426800787448883, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030576600693166256, + "grad_norm": 7.442007064819336, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8614937663078308, + "num_tokens": 450286094.0, + "step": 11801 + }, + { + "epoch": 1.5013357079252003, + "ewc_loss": 0.0647115558385849, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003102014889009297, + "grad_norm": 7.682439804077148, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8541542291641235, + "num_tokens": 450324938.0, + "step": 11802 + }, + { + "epoch": 1.5014629182037909, + "ewc_loss": 0.0642685666680336, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003057716239709407, + "grad_norm": 7.403956413269043, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8625693321228027, + "num_tokens": 450368572.0, + "step": 11803 + }, + { + "epoch": 1.5015901284823814, + "ewc_loss": 0.0647110715508461, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031019665766507387, + "grad_norm": 7.593907833099365, + "learning_rate": 1e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8409765958786011, + "num_tokens": 450410926.0, + "step": 11804 + }, + { + "epoch": 1.501717338760972, + "ewc_loss": 0.06428743153810501, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003059602458961308, + "grad_norm": 7.421601295471191, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8669604063034058, + "num_tokens": 450450206.0, + "step": 11805 + }, + { + "epoch": 1.5018445490395624, + "ewc_loss": 0.06477805972099304, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031086651142686605, + "grad_norm": 7.603346347808838, + "learning_rate": 1e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8430532217025757, + "num_tokens": 450488998.0, + "step": 11806 + }, + { + "epoch": 1.501971759318153, + "ewc_loss": 0.06440284848213196, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030711444560438395, + "grad_norm": 7.453608989715576, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8735979795455933, + "num_tokens": 450530476.0, + "step": 11807 + }, + { + "epoch": 1.5020989695967435, + "ewc_loss": 0.06473544239997864, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000310440402245149, + "grad_norm": 7.570539474487305, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8698681592941284, + "num_tokens": 450560655.0, + "step": 11808 + }, + { + "epoch": 1.502226179875334, + "ewc_loss": 0.06451261788606644, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030821209657005966, + "grad_norm": 7.519524097442627, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8652666211128235, + "num_tokens": 450601816.0, + "step": 11809 + }, + { + "epoch": 1.5023533901539246, + "ewc_loss": 0.0646124929189682, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003092109109275043, + "grad_norm": 7.557845115661621, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8651033639907837, + "num_tokens": 450634182.0, + "step": 11810 + }, + { + "epoch": 1.502480600432515, + "ewc_loss": 0.06442327797412872, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003073187544941902, + "grad_norm": 7.460049629211426, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.858550488948822, + "num_tokens": 450672705.0, + "step": 11811 + }, + { + "epoch": 1.5026078107111056, + "ewc_loss": 0.06463129818439484, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003093989216722548, + "grad_norm": 7.587958335876465, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8512530326843262, + "num_tokens": 450705636.0, + "step": 11812 + }, + { + "epoch": 1.502735020989696, + "ewc_loss": 0.06440205872058868, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030710650025866926, + "grad_norm": 7.561068534851074, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8536760807037354, + "num_tokens": 450736165.0, + "step": 11813 + }, + { + "epoch": 1.5028622312682864, + "ewc_loss": 0.06448328495025635, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030791875906288624, + "grad_norm": 7.499696731567383, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.864054799079895, + "num_tokens": 450776939.0, + "step": 11814 + }, + { + "epoch": 1.502989441546877, + "ewc_loss": 0.0651608407497406, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003073701518587768, + "grad_norm": 15.228286743164062, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8677072525024414, + "num_tokens": 450811994.0, + "step": 11815 + }, + { + "epoch": 1.5031166518254675, + "ewc_loss": 0.07724601030349731, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0004306631744839251, + "grad_norm": 8.946741104125977, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8643617033958435, + "num_tokens": 450853688.0, + "step": 11816 + }, + { + "epoch": 1.503243862104058, + "ewc_loss": 0.0625462606549263, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0002885485300794244, + "grad_norm": 7.063812732696533, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.862369179725647, + "num_tokens": 450892100.0, + "step": 11817 + }, + { + "epoch": 1.5033710723826486, + "ewc_loss": 0.06718344986438751, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003349204489495605, + "grad_norm": 8.0845308303833, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8789322376251221, + "num_tokens": 450936383.0, + "step": 11818 + }, + { + "epoch": 1.503498282661239, + "ewc_loss": 0.06520310044288635, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003151169221382588, + "grad_norm": 7.4137115478515625, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8695266246795654, + "num_tokens": 450965525.0, + "step": 11819 + }, + { + "epoch": 1.5036254929398294, + "ewc_loss": 0.06608113646507263, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00032389728585258126, + "grad_norm": 7.824890613555908, + "learning_rate": 1e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8506133556365967, + "num_tokens": 451005874.0, + "step": 11820 + }, + { + "epoch": 1.50375270321842, + "ewc_loss": 0.06502579897642136, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003133439167868346, + "grad_norm": 7.475424289703369, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.845245361328125, + "num_tokens": 451040668.0, + "step": 11821 + }, + { + "epoch": 1.5038799134970104, + "ewc_loss": 0.06616184115409851, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003198215563315898, + "grad_norm": 7.6724066734313965, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8601142764091492, + "num_tokens": 451082747.0, + "step": 11822 + }, + { + "epoch": 1.504007123775601, + "ewc_loss": 0.06513567268848419, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003144427028018981, + "grad_norm": 7.5508623123168945, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.870097815990448, + "num_tokens": 451120344.0, + "step": 11823 + }, + { + "epoch": 1.5041343340541915, + "ewc_loss": 0.06513404846191406, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031442646286450326, + "grad_norm": 7.615170478820801, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8547732830047607, + "num_tokens": 451156564.0, + "step": 11824 + }, + { + "epoch": 1.504261544332782, + "ewc_loss": 0.06486831605434418, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003117690794169903, + "grad_norm": 7.517490863800049, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8577742576599121, + "num_tokens": 451197252.0, + "step": 11825 + }, + { + "epoch": 1.5043887546113726, + "ewc_loss": 0.06501133739948273, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031319927074946463, + "grad_norm": 7.566706657409668, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8511574268341064, + "num_tokens": 451236150.0, + "step": 11826 + }, + { + "epoch": 1.504515964889963, + "ewc_loss": 0.06471454352140427, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003102313494309783, + "grad_norm": 7.4894185066223145, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8580722808837891, + "num_tokens": 451275751.0, + "step": 11827 + }, + { + "epoch": 1.5046431751685536, + "ewc_loss": 0.06485515832901001, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003116375009994954, + "grad_norm": 7.496118545532227, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8829163908958435, + "num_tokens": 451316778.0, + "step": 11828 + }, + { + "epoch": 1.5047703854471441, + "ewc_loss": 0.0647381916642189, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003104678762611002, + "grad_norm": 7.466626167297363, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8620079159736633, + "num_tokens": 451359921.0, + "step": 11829 + }, + { + "epoch": 1.5048975957257347, + "ewc_loss": 0.06474022567272186, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000310488190734759, + "grad_norm": 7.535976409912109, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8682445883750916, + "num_tokens": 451395616.0, + "step": 11830 + }, + { + "epoch": 1.5050248060043252, + "ewc_loss": 0.0647096112370491, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003101820475421846, + "grad_norm": 7.527592658996582, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8634405732154846, + "num_tokens": 451427766.0, + "step": 11831 + }, + { + "epoch": 1.5051520162829157, + "ewc_loss": 0.06474044919013977, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003104904026258737, + "grad_norm": 7.529235363006592, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8715333938598633, + "num_tokens": 451463073.0, + "step": 11832 + }, + { + "epoch": 1.5052792265615063, + "ewc_loss": 0.06470349431037903, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003101208421867341, + "grad_norm": 7.500823020935059, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.881953239440918, + "num_tokens": 451503039.0, + "step": 11833 + }, + { + "epoch": 1.5054064368400968, + "ewc_loss": 0.06474025547504425, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000310488510876894, + "grad_norm": 7.517323017120361, + "learning_rate": 1e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.845818281173706, + "num_tokens": 451541599.0, + "step": 11834 + }, + { + "epoch": 1.5055336471186873, + "ewc_loss": 0.06468673050403595, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003099532623309642, + "grad_norm": 7.524302959442139, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8492036461830139, + "num_tokens": 451579278.0, + "step": 11835 + }, + { + "epoch": 1.5056608573972778, + "ewc_loss": 0.06459825485944748, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003090685058850795, + "grad_norm": 7.483232021331787, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8659969568252563, + "num_tokens": 451618591.0, + "step": 11836 + }, + { + "epoch": 1.5057880676758684, + "ewc_loss": 0.06471773982048035, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031026333454065025, + "grad_norm": 7.562900066375732, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8618090152740479, + "num_tokens": 451654453.0, + "step": 11837 + }, + { + "epoch": 1.5059152779544587, + "ewc_loss": 0.06448361277580261, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003079220186918974, + "grad_norm": 7.515071392059326, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8615199327468872, + "num_tokens": 451688302.0, + "step": 11838 + }, + { + "epoch": 1.5060424882330492, + "ewc_loss": 0.06463156640529633, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003094016283284873, + "grad_norm": 7.496796607971191, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8679773807525635, + "num_tokens": 451727301.0, + "step": 11839 + }, + { + "epoch": 1.5061696985116397, + "ewc_loss": 0.06452282518148422, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003083141637034714, + "grad_norm": 7.491028785705566, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.849061131477356, + "num_tokens": 451770471.0, + "step": 11840 + }, + { + "epoch": 1.5062969087902303, + "ewc_loss": 0.06452416628599167, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000308327580569312, + "grad_norm": 7.502475738525391, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8506603837013245, + "num_tokens": 451809413.0, + "step": 11841 + }, + { + "epoch": 1.5064241190688208, + "ewc_loss": 0.06455066800117493, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030859257094562054, + "grad_norm": 7.480655670166016, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8597584962844849, + "num_tokens": 451850177.0, + "step": 11842 + }, + { + "epoch": 1.5065513293474113, + "ewc_loss": 0.06480064988136292, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030865101143717766, + "grad_norm": 7.527777671813965, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8506408929824829, + "num_tokens": 451887977.0, + "step": 11843 + }, + { + "epoch": 1.5066785396260016, + "ewc_loss": 0.06477506458759308, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030839513055980206, + "grad_norm": 7.48801326751709, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8727105259895325, + "num_tokens": 451927403.0, + "step": 11844 + }, + { + "epoch": 1.5068057499045922, + "ewc_loss": 0.06477567553520203, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030840124236419797, + "grad_norm": 7.472640514373779, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8629490733146667, + "num_tokens": 451967350.0, + "step": 11845 + }, + { + "epoch": 1.5069329601831827, + "ewc_loss": 0.06476274132728577, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030827196314930916, + "grad_norm": 7.464842319488525, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8626930713653564, + "num_tokens": 452008664.0, + "step": 11846 + }, + { + "epoch": 1.5070601704617732, + "ewc_loss": 0.06461557000875473, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030924161546863616, + "grad_norm": 7.601657390594482, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8669508099555969, + "num_tokens": 452047303.0, + "step": 11847 + }, + { + "epoch": 1.5071873807403637, + "ewc_loss": 0.06470401585102081, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030768473516218364, + "grad_norm": 7.512350559234619, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8632950782775879, + "num_tokens": 452087359.0, + "step": 11848 + }, + { + "epoch": 1.5073145910189543, + "ewc_loss": 0.06455680727958679, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003086540091317147, + "grad_norm": 7.49507474899292, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8490193486213684, + "num_tokens": 452129426.0, + "step": 11849 + }, + { + "epoch": 1.5074418012975448, + "ewc_loss": 0.06450428813695908, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003081288014072925, + "grad_norm": 7.48326301574707, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8675487637519836, + "num_tokens": 452169134.0, + "step": 11850 + }, + { + "epoch": 1.5075690115761353, + "ewc_loss": 0.06452500820159912, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003083360497839749, + "grad_norm": 7.510120391845703, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.852697491645813, + "num_tokens": 452209962.0, + "step": 11851 + }, + { + "epoch": 1.5076962218547258, + "ewc_loss": 0.06462148576974869, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030930081265978515, + "grad_norm": 7.531319618225098, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8655111789703369, + "num_tokens": 452245978.0, + "step": 11852 + }, + { + "epoch": 1.5078234321333164, + "ewc_loss": 0.06446397304534912, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003077257133554667, + "grad_norm": 7.53219747543335, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.871025025844574, + "num_tokens": 452281013.0, + "step": 11853 + }, + { + "epoch": 1.507950642411907, + "ewc_loss": 0.06453225761651993, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030840851832181215, + "grad_norm": 7.454998970031738, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8766511678695679, + "num_tokens": 452317174.0, + "step": 11854 + }, + { + "epoch": 1.5080778526904974, + "ewc_loss": 0.06461752206087112, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003092611732427031, + "grad_norm": 7.50421667098999, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8587906956672668, + "num_tokens": 452362974.0, + "step": 11855 + }, + { + "epoch": 1.508205062969088, + "ewc_loss": 0.06447498500347137, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030783575493842363, + "grad_norm": 7.496120929718018, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8738949298858643, + "num_tokens": 452400701.0, + "step": 11856 + }, + { + "epoch": 1.5083322732476785, + "ewc_loss": 0.06477127969264984, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003107987286057323, + "grad_norm": 7.568493843078613, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.871675968170166, + "num_tokens": 452438541.0, + "step": 11857 + }, + { + "epoch": 1.508459483526269, + "ewc_loss": 0.0644158124923706, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030724407406523824, + "grad_norm": 7.481897354125977, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8575312495231628, + "num_tokens": 452476918.0, + "step": 11858 + }, + { + "epoch": 1.5085866938048595, + "ewc_loss": 0.06466607004404068, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030974665423855186, + "grad_norm": 7.569447040557861, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8668512105941772, + "num_tokens": 452509231.0, + "step": 11859 + }, + { + "epoch": 1.50871390408345, + "ewc_loss": 0.06451988220214844, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003082847688347101, + "grad_norm": 7.505547523498535, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8579513430595398, + "num_tokens": 452547079.0, + "step": 11860 + }, + { + "epoch": 1.5088411143620406, + "ewc_loss": 0.06451119482517242, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003081978647969663, + "grad_norm": 7.5215020179748535, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8740329742431641, + "num_tokens": 452580215.0, + "step": 11861 + }, + { + "epoch": 1.508968324640631, + "ewc_loss": 0.06449172645807266, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003080031892750412, + "grad_norm": 7.575930118560791, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8682329058647156, + "num_tokens": 452615778.0, + "step": 11862 + }, + { + "epoch": 1.5090955349192214, + "ewc_loss": 0.06449265778064728, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030801253160461783, + "grad_norm": 7.47128438949585, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8653021454811096, + "num_tokens": 452652184.0, + "step": 11863 + }, + { + "epoch": 1.509222745197812, + "ewc_loss": 0.06461924314498901, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000309278373606503, + "grad_norm": 7.5190863609313965, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8489443063735962, + "num_tokens": 452691345.0, + "step": 11864 + }, + { + "epoch": 1.5093499554764025, + "ewc_loss": 0.06445072591304779, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030759323271922767, + "grad_norm": 7.483410358428955, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8641954660415649, + "num_tokens": 452729982.0, + "step": 11865 + }, + { + "epoch": 1.509477165754993, + "ewc_loss": 0.0645887702703476, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030897362739779055, + "grad_norm": 7.515446186065674, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8597673773765564, + "num_tokens": 452771272.0, + "step": 11866 + }, + { + "epoch": 1.5096043760335836, + "ewc_loss": 0.06446134299039841, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003076993743889034, + "grad_norm": 7.506950378417969, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8586940169334412, + "num_tokens": 452812231.0, + "step": 11867 + }, + { + "epoch": 1.509731586312174, + "ewc_loss": 0.06457370519638062, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030882301507517695, + "grad_norm": 7.5010480880737305, + "learning_rate": 1e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8520241975784302, + "num_tokens": 452858406.0, + "step": 11868 + }, + { + "epoch": 1.5098587965907644, + "ewc_loss": 0.0645281970500946, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030836788937449455, + "grad_norm": 7.481357097625732, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8537206649780273, + "num_tokens": 452898793.0, + "step": 11869 + }, + { + "epoch": 1.509986006869355, + "ewc_loss": 0.0646219477057457, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030930544016882777, + "grad_norm": 7.540213584899902, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8683703541755676, + "num_tokens": 452937929.0, + "step": 11870 + }, + { + "epoch": 1.5101132171479454, + "ewc_loss": 0.06461326032876968, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003092185652349144, + "grad_norm": 7.513983249664307, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8666191697120667, + "num_tokens": 452975436.0, + "step": 11871 + }, + { + "epoch": 1.510240427426536, + "ewc_loss": 0.06455634534358978, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030864941072650254, + "grad_norm": 7.506204128265381, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.862700343132019, + "num_tokens": 453016334.0, + "step": 11872 + }, + { + "epoch": 1.5103676377051265, + "ewc_loss": 0.0646272823214531, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030935878749005497, + "grad_norm": 7.499261379241943, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8666156530380249, + "num_tokens": 453053630.0, + "step": 11873 + }, + { + "epoch": 1.510494847983717, + "ewc_loss": 0.06460796296596527, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003091655671596527, + "grad_norm": 7.517784595489502, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8436195850372314, + "num_tokens": 453092341.0, + "step": 11874 + }, + { + "epoch": 1.5106220582623076, + "ewc_loss": 0.06456126272678375, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003086985961999744, + "grad_norm": 7.540330410003662, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8710124492645264, + "num_tokens": 453126859.0, + "step": 11875 + }, + { + "epoch": 1.510749268540898, + "ewc_loss": 0.0646609365940094, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030969525687396526, + "grad_norm": 7.4773969650268555, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8597792387008667, + "num_tokens": 453167327.0, + "step": 11876 + }, + { + "epoch": 1.5108764788194886, + "ewc_loss": 0.06464256346225739, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030951155349612236, + "grad_norm": 7.537656307220459, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8684638738632202, + "num_tokens": 453205901.0, + "step": 11877 + }, + { + "epoch": 1.5110036890980791, + "ewc_loss": 0.06454723328351974, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003085582866333425, + "grad_norm": 7.459771633148193, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8605285882949829, + "num_tokens": 453241856.0, + "step": 11878 + }, + { + "epoch": 1.5111308993766697, + "ewc_loss": 0.06475132703781128, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003105992218479514, + "grad_norm": 7.530634880065918, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8707789182662964, + "num_tokens": 453282528.0, + "step": 11879 + }, + { + "epoch": 1.5112581096552602, + "ewc_loss": 0.06455733627080917, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030865930602885783, + "grad_norm": 7.59678316116333, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8526526689529419, + "num_tokens": 453318762.0, + "step": 11880 + }, + { + "epoch": 1.5113853199338507, + "ewc_loss": 0.06458261609077454, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030891207279637456, + "grad_norm": 7.501643657684326, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8770524263381958, + "num_tokens": 453352482.0, + "step": 11881 + }, + { + "epoch": 1.5115125302124413, + "ewc_loss": 0.06457890570163727, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003088749945163727, + "grad_norm": 7.454848289489746, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8630188703536987, + "num_tokens": 453393692.0, + "step": 11882 + }, + { + "epoch": 1.5116397404910318, + "ewc_loss": 0.0646560862660408, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003096467989962548, + "grad_norm": 7.5449066162109375, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8653303980827332, + "num_tokens": 453429217.0, + "step": 11883 + }, + { + "epoch": 1.5117669507696223, + "ewc_loss": 0.06457895040512085, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030887543107382953, + "grad_norm": 7.4877543449401855, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8477268218994141, + "num_tokens": 453468190.0, + "step": 11884 + }, + { + "epoch": 1.5118941610482128, + "ewc_loss": 0.0647655725479126, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003107416850980371, + "grad_norm": 7.558923244476318, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8566493988037109, + "num_tokens": 453509893.0, + "step": 11885 + }, + { + "epoch": 1.5120213713268034, + "ewc_loss": 0.06457007676362991, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003087867225985974, + "grad_norm": 7.489859104156494, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8667330741882324, + "num_tokens": 453546972.0, + "step": 11886 + }, + { + "epoch": 1.5121485816053937, + "ewc_loss": 0.06467685103416443, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003098544548265636, + "grad_norm": 7.561501502990723, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8746801018714905, + "num_tokens": 453578480.0, + "step": 11887 + }, + { + "epoch": 1.5122757918839842, + "ewc_loss": 0.06445485353469849, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003076344437431544, + "grad_norm": 7.449103355407715, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8633397817611694, + "num_tokens": 453623248.0, + "step": 11888 + }, + { + "epoch": 1.5124030021625747, + "ewc_loss": 0.06467558443546295, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003098417946603149, + "grad_norm": 7.548238754272461, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8832452297210693, + "num_tokens": 453660088.0, + "step": 11889 + }, + { + "epoch": 1.5125302124411653, + "ewc_loss": 0.06444422900676727, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003075282438658178, + "grad_norm": 7.476673126220703, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8660634756088257, + "num_tokens": 453698931.0, + "step": 11890 + }, + { + "epoch": 1.5126574227197558, + "ewc_loss": 0.06462527811527252, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030933876405470073, + "grad_norm": 7.543119430541992, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8656550645828247, + "num_tokens": 453734220.0, + "step": 11891 + }, + { + "epoch": 1.5127846329983463, + "ewc_loss": 0.06432020664215088, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003062879841309041, + "grad_norm": 7.471840858459473, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8626489043235779, + "num_tokens": 453774056.0, + "step": 11892 + }, + { + "epoch": 1.5129118432769366, + "ewc_loss": 0.0646832063794136, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030991798848845065, + "grad_norm": 7.594840049743652, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8685383796691895, + "num_tokens": 453806616.0, + "step": 11893 + }, + { + "epoch": 1.5130390535555271, + "ewc_loss": 0.06439770013093948, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000307062960928306, + "grad_norm": 7.4446868896484375, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8714092969894409, + "num_tokens": 453843838.0, + "step": 11894 + }, + { + "epoch": 1.5131662638341177, + "ewc_loss": 0.0646834522485733, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030992046231403947, + "grad_norm": 7.631624221801758, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8709137439727783, + "num_tokens": 453876888.0, + "step": 11895 + }, + { + "epoch": 1.5132934741127082, + "ewc_loss": 0.06431286036968231, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003062145260628313, + "grad_norm": 7.3936848640441895, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8773320913314819, + "num_tokens": 453915334.0, + "step": 11896 + }, + { + "epoch": 1.5134206843912987, + "ewc_loss": 0.06487470865249634, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003118330205325037, + "grad_norm": 7.593436241149902, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8726493120193481, + "num_tokens": 453950590.0, + "step": 11897 + }, + { + "epoch": 1.5135478946698893, + "ewc_loss": 0.0642894059419632, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030597997829318047, + "grad_norm": 7.400509357452393, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8663987517356873, + "num_tokens": 453992590.0, + "step": 11898 + }, + { + "epoch": 1.5136751049484798, + "ewc_loss": 0.06489528715610504, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003120388137176633, + "grad_norm": 7.56315803527832, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8676584959030151, + "num_tokens": 454038619.0, + "step": 11899 + }, + { + "epoch": 1.5138023152270703, + "ewc_loss": 0.06439678370952606, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003070537350140512, + "grad_norm": 7.455824375152588, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8534413576126099, + "num_tokens": 454081564.0, + "step": 11900 + }, + { + "epoch": 1.5139295255056608, + "ewc_loss": 0.06468029320240021, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030988891376182437, + "grad_norm": 7.566339015960693, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8513908386230469, + "num_tokens": 454125581.0, + "step": 11901 + }, + { + "epoch": 1.5140567357842514, + "ewc_loss": 0.06442829966545105, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003073689294978976, + "grad_norm": 7.436793327331543, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8719152212142944, + "num_tokens": 454163961.0, + "step": 11902 + }, + { + "epoch": 1.514183946062842, + "ewc_loss": 0.06473925709724426, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003104784700553864, + "grad_norm": 7.587926864624023, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8556422591209412, + "num_tokens": 454200410.0, + "step": 11903 + }, + { + "epoch": 1.5143111563414324, + "ewc_loss": 0.06435652077198029, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030665111262351274, + "grad_norm": 7.446667194366455, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.849048912525177, + "num_tokens": 454237143.0, + "step": 11904 + }, + { + "epoch": 1.514438366620023, + "ewc_loss": 0.06476974487304688, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003107834199909121, + "grad_norm": 7.5722761154174805, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8699647188186646, + "num_tokens": 454274022.0, + "step": 11905 + }, + { + "epoch": 1.5145655768986135, + "ewc_loss": 0.06435536593198776, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003066395875066519, + "grad_norm": 7.487784385681152, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8570506572723389, + "num_tokens": 454307955.0, + "step": 11906 + }, + { + "epoch": 1.514692787177204, + "ewc_loss": 0.06467870622873306, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000309873023070395, + "grad_norm": 7.580904483795166, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8705122470855713, + "num_tokens": 454345449.0, + "step": 11907 + }, + { + "epoch": 1.5148199974557945, + "ewc_loss": 0.06441401690244675, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030722611700184643, + "grad_norm": 7.463620662689209, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8509247899055481, + "num_tokens": 454383096.0, + "step": 11908 + }, + { + "epoch": 1.514947207734385, + "ewc_loss": 0.06464250385761261, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003095109714195132, + "grad_norm": 7.53481912612915, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8512292504310608, + "num_tokens": 454420142.0, + "step": 11909 + }, + { + "epoch": 1.5150744180129756, + "ewc_loss": 0.06448427587747574, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030792871257290244, + "grad_norm": 7.467207431793213, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8619816899299622, + "num_tokens": 454454991.0, + "step": 11910 + }, + { + "epoch": 1.515201628291566, + "ewc_loss": 0.06466352939605713, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003097211883869022, + "grad_norm": 7.527090072631836, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8594160676002502, + "num_tokens": 454490161.0, + "step": 11911 + }, + { + "epoch": 1.5153288385701564, + "ewc_loss": 0.06479019671678543, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030854649958200753, + "grad_norm": 7.4379777908325195, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8546249866485596, + "num_tokens": 454534767.0, + "step": 11912 + }, + { + "epoch": 1.515456048848747, + "ewc_loss": 0.06494192779064178, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003100637695752084, + "grad_norm": 7.516973972320557, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8723344802856445, + "num_tokens": 454573246.0, + "step": 11913 + }, + { + "epoch": 1.5155832591273375, + "ewc_loss": 0.06511703133583069, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00030937345582060516, + "grad_norm": 7.52357292175293, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8771870136260986, + "num_tokens": 454607626.0, + "step": 11914 + }, + { + "epoch": 1.515710469405928, + "ewc_loss": 0.06479154527187347, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000308559974655509, + "grad_norm": 7.511061668395996, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8570483326911926, + "num_tokens": 454639840.0, + "step": 11915 + }, + { + "epoch": 1.5158376796845185, + "ewc_loss": 0.06486231088638306, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000309267605189234, + "grad_norm": 7.4320831298828125, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8542304635047913, + "num_tokens": 454681445.0, + "step": 11916 + }, + { + "epoch": 1.515964889963109, + "ewc_loss": 0.06499052047729492, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003105497162323445, + "grad_norm": 7.485334873199463, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.855918288230896, + "num_tokens": 454726191.0, + "step": 11917 + }, + { + "epoch": 1.5160921002416994, + "ewc_loss": 0.06486664712429047, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000309310999000445, + "grad_norm": 7.486289024353027, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8691675662994385, + "num_tokens": 454765112.0, + "step": 11918 + }, + { + "epoch": 1.51621931052029, + "ewc_loss": 0.06495615839958191, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003102061164099723, + "grad_norm": 7.521129131317139, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8597729206085205, + "num_tokens": 454805254.0, + "step": 11919 + }, + { + "epoch": 1.5163465207988804, + "ewc_loss": 0.06486831605434418, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003093277045991272, + "grad_norm": 7.481954574584961, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8559260964393616, + "num_tokens": 454841344.0, + "step": 11920 + }, + { + "epoch": 1.516473731077471, + "ewc_loss": 0.0649942010641098, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003105865034740418, + "grad_norm": 7.503113746643066, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8749287128448486, + "num_tokens": 454879986.0, + "step": 11921 + }, + { + "epoch": 1.5166009413560615, + "ewc_loss": 0.06493470072746277, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030999156297184527, + "grad_norm": 7.560478210449219, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8542868494987488, + "num_tokens": 454917731.0, + "step": 11922 + }, + { + "epoch": 1.516728151634652, + "ewc_loss": 0.06490849703550339, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003097294829785824, + "grad_norm": 7.487468242645264, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8690850734710693, + "num_tokens": 454952837.0, + "step": 11923 + }, + { + "epoch": 1.5168553619132426, + "ewc_loss": 0.06502652168273926, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003109097306150943, + "grad_norm": 7.494904518127441, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8654786348342896, + "num_tokens": 454992751.0, + "step": 11924 + }, + { + "epoch": 1.516982572191833, + "ewc_loss": 0.06462324410676956, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030931836226955056, + "grad_norm": 7.533676624298096, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8620632886886597, + "num_tokens": 455029829.0, + "step": 11925 + }, + { + "epoch": 1.5171097824704236, + "ewc_loss": 0.0649595633149147, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003102401678916067, + "grad_norm": 7.485158920288086, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8620690107345581, + "num_tokens": 455072167.0, + "step": 11926 + }, + { + "epoch": 1.5172369927490141, + "ewc_loss": 0.06468473374843597, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003099332971032709, + "grad_norm": 7.599217414855957, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8703731894493103, + "num_tokens": 455108433.0, + "step": 11927 + }, + { + "epoch": 1.5173642030276047, + "ewc_loss": 0.0645112469792366, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000308198417769745, + "grad_norm": 7.505269527435303, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8664556741714478, + "num_tokens": 455148943.0, + "step": 11928 + }, + { + "epoch": 1.5174914133061952, + "ewc_loss": 0.06480593979358673, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031114532612264156, + "grad_norm": 7.596985340118408, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8592040538787842, + "num_tokens": 455185099.0, + "step": 11929 + }, + { + "epoch": 1.5176186235847857, + "ewc_loss": 0.06476205587387085, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003082651237491518, + "grad_norm": 7.482220649719238, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8769739270210266, + "num_tokens": 455225322.0, + "step": 11930 + }, + { + "epoch": 1.5177458338633762, + "ewc_loss": 0.06477757543325424, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003108617092948407, + "grad_norm": 7.564158916473389, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8598197102546692, + "num_tokens": 455262830.0, + "step": 11931 + }, + { + "epoch": 1.5178730441419668, + "ewc_loss": 0.06454003602266312, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000308486312860623, + "grad_norm": 7.473918437957764, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8703653812408447, + "num_tokens": 455300825.0, + "step": 11932 + }, + { + "epoch": 1.5180002544205573, + "ewc_loss": 0.06505361944437027, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031118071638047695, + "grad_norm": 7.595439434051514, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.858997106552124, + "num_tokens": 455336209.0, + "step": 11933 + }, + { + "epoch": 1.5181274646991478, + "ewc_loss": 0.06455335021018982, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003086194337811321, + "grad_norm": 7.496511936187744, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8615779876708984, + "num_tokens": 455374909.0, + "step": 11934 + }, + { + "epoch": 1.5182546749777384, + "ewc_loss": 0.06466219574213028, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030970791704021394, + "grad_norm": 7.582967281341553, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8763633966445923, + "num_tokens": 455417179.0, + "step": 11935 + }, + { + "epoch": 1.5183818852563287, + "ewc_loss": 0.06452731788158417, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003083591000176966, + "grad_norm": 7.495314598083496, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8492467999458313, + "num_tokens": 455457923.0, + "step": 11936 + }, + { + "epoch": 1.5185090955349192, + "ewc_loss": 0.06485918164253235, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003116777807008475, + "grad_norm": 7.685292720794678, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8537140488624573, + "num_tokens": 455492932.0, + "step": 11937 + }, + { + "epoch": 1.5186363058135097, + "ewc_loss": 0.06449532508850098, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030559778679162264, + "grad_norm": 7.413184642791748, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8534800410270691, + "num_tokens": 455540392.0, + "step": 11938 + }, + { + "epoch": 1.5187635160921003, + "ewc_loss": 0.06520320475101471, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003126766241621226, + "grad_norm": 7.7241034507751465, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8575040698051453, + "num_tokens": 455574461.0, + "step": 11939 + }, + { + "epoch": 1.5188907263706908, + "ewc_loss": 0.06420867145061493, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030517263803631067, + "grad_norm": 7.433807849884033, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8601052165031433, + "num_tokens": 455608315.0, + "step": 11940 + }, + { + "epoch": 1.5190179366492813, + "ewc_loss": 0.06527258455753326, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003133704012725502, + "grad_norm": 7.724878787994385, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.866992712020874, + "num_tokens": 455654927.0, + "step": 11941 + }, + { + "epoch": 1.5191451469278716, + "ewc_loss": 0.06449161469936371, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030556065030395985, + "grad_norm": 7.4578680992126465, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8510372638702393, + "num_tokens": 455695745.0, + "step": 11942 + }, + { + "epoch": 1.5192723572064621, + "ewc_loss": 0.06524275243282318, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003130720870103687, + "grad_norm": 7.790310859680176, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8620862364768982, + "num_tokens": 455734652.0, + "step": 11943 + }, + { + "epoch": 1.5193995674850527, + "ewc_loss": 0.06444887816905975, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003051333187613636, + "grad_norm": 7.414272308349609, + "learning_rate": 1e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.840340256690979, + "num_tokens": 455776063.0, + "step": 11944 + }, + { + "epoch": 1.5195267777636432, + "ewc_loss": 0.06531766057014465, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031382113229483366, + "grad_norm": 7.796402454376221, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8571653366088867, + "num_tokens": 455820595.0, + "step": 11945 + }, + { + "epoch": 1.5196539880422337, + "ewc_loss": 0.06456287205219269, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030627328669652343, + "grad_norm": 7.429131507873535, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8680101633071899, + "num_tokens": 455859553.0, + "step": 11946 + }, + { + "epoch": 1.5197811983208243, + "ewc_loss": 0.06513740122318268, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031445996137335896, + "grad_norm": 7.829907417297363, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8637261390686035, + "num_tokens": 455892676.0, + "step": 11947 + }, + { + "epoch": 1.5199084085994148, + "ewc_loss": 0.06471849977970123, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003053881519008428, + "grad_norm": 15.118537902832031, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8545122146606445, + "num_tokens": 455932106.0, + "step": 11948 + }, + { + "epoch": 1.5200356188780053, + "ewc_loss": 0.07481992989778519, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00040884382906369865, + "grad_norm": 8.614386558532715, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8675769567489624, + "num_tokens": 455971555.0, + "step": 11949 + }, + { + "epoch": 1.5201628291565958, + "ewc_loss": 0.06356430053710938, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00029628752963617444, + "grad_norm": 7.322361946105957, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8572435975074768, + "num_tokens": 456014177.0, + "step": 11950 + }, + { + "epoch": 1.5202900394351864, + "ewc_loss": 0.06651695817708969, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032581412233412266, + "grad_norm": 7.873133182525635, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8671640157699585, + "num_tokens": 456048229.0, + "step": 11951 + }, + { + "epoch": 1.520417249713777, + "ewc_loss": 0.06549098342657089, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003155543527100235, + "grad_norm": 7.472481727600098, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8821924924850464, + "num_tokens": 456085815.0, + "step": 11952 + }, + { + "epoch": 1.5205444599923674, + "ewc_loss": 0.0660233423113823, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032087796716950834, + "grad_norm": 7.781117916107178, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8652135729789734, + "num_tokens": 456128971.0, + "step": 11953 + }, + { + "epoch": 1.520671670270958, + "ewc_loss": 0.06518770754337311, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003125216462649405, + "grad_norm": 7.520788669586182, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8452824950218201, + "num_tokens": 456159705.0, + "step": 11954 + }, + { + "epoch": 1.5207988805495485, + "ewc_loss": 0.06578794866800308, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031852402025833726, + "grad_norm": 7.703562259674072, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8527984619140625, + "num_tokens": 456198251.0, + "step": 11955 + }, + { + "epoch": 1.520926090828139, + "ewc_loss": 0.06527754664421082, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031341996509581804, + "grad_norm": 7.54757022857666, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8556817173957825, + "num_tokens": 456237612.0, + "step": 11956 + }, + { + "epoch": 1.5210533011067295, + "ewc_loss": 0.06545913964509964, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152359277009964, + "grad_norm": 7.6516218185424805, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8621813058853149, + "num_tokens": 456273665.0, + "step": 11957 + }, + { + "epoch": 1.52118051138532, + "ewc_loss": 0.06512750685214996, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003119196044281125, + "grad_norm": 7.6208271980285645, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8639921545982361, + "num_tokens": 456305467.0, + "step": 11958 + }, + { + "epoch": 1.5213077216639106, + "ewc_loss": 0.0652628242969513, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003132728161290288, + "grad_norm": 7.6114983558654785, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8679237365722656, + "num_tokens": 456342951.0, + "step": 11959 + }, + { + "epoch": 1.521434931942501, + "ewc_loss": 0.06501796096563339, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031082413624972105, + "grad_norm": 7.607850551605225, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8629580140113831, + "num_tokens": 456375846.0, + "step": 11960 + }, + { + "epoch": 1.5215621422210914, + "ewc_loss": 0.06502556055784225, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031090015545487404, + "grad_norm": 7.650540351867676, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8708429336547852, + "num_tokens": 456407993.0, + "step": 11961 + }, + { + "epoch": 1.521689352499682, + "ewc_loss": 0.06496529281139374, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003102974151261151, + "grad_norm": 7.5428361892700195, + "learning_rate": 1e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8437320590019226, + "num_tokens": 456444278.0, + "step": 11962 + }, + { + "epoch": 1.5218165627782725, + "ewc_loss": 0.06498713791370392, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003105158975813538, + "grad_norm": 7.598781108856201, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8707694411277771, + "num_tokens": 456483950.0, + "step": 11963 + }, + { + "epoch": 1.521943773056863, + "ewc_loss": 0.06488276273012161, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030947214690968394, + "grad_norm": 7.482503414154053, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8640609383583069, + "num_tokens": 456527438.0, + "step": 11964 + }, + { + "epoch": 1.5220709833354535, + "ewc_loss": 0.06507300585508347, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003113746060989797, + "grad_norm": 7.561180114746094, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.874369740486145, + "num_tokens": 456560572.0, + "step": 11965 + }, + { + "epoch": 1.5221981936140438, + "ewc_loss": 0.06489486247301102, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030959316063672304, + "grad_norm": 7.6630353927612305, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.852957010269165, + "num_tokens": 456604781.0, + "step": 11966 + }, + { + "epoch": 1.5223254038926344, + "ewc_loss": 0.06477610021829605, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030840554973110557, + "grad_norm": 7.5270891189575195, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8758569955825806, + "num_tokens": 456639415.0, + "step": 11967 + }, + { + "epoch": 1.522452614171225, + "ewc_loss": 0.06513442099094391, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031198872602544725, + "grad_norm": 7.762824535369873, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8752371072769165, + "num_tokens": 456671183.0, + "step": 11968 + }, + { + "epoch": 1.5225798244498154, + "ewc_loss": 0.06432396918535233, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003063256444875151, + "grad_norm": 7.415481090545654, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8503457307815552, + "num_tokens": 456712995.0, + "step": 11969 + }, + { + "epoch": 1.522707034728406, + "ewc_loss": 0.06531322747468948, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031377680716104805, + "grad_norm": 7.638849258422852, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8571557998657227, + "num_tokens": 456750587.0, + "step": 11970 + }, + { + "epoch": 1.5228342450069965, + "ewc_loss": 0.06470656394958496, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030771017191000283, + "grad_norm": 7.5079522132873535, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8696851134300232, + "num_tokens": 456787503.0, + "step": 11971 + }, + { + "epoch": 1.522961455285587, + "ewc_loss": 0.06525734066963196, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031321795540861785, + "grad_norm": 7.646172523498535, + "learning_rate": 1e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8399983644485474, + "num_tokens": 456831645.0, + "step": 11972 + }, + { + "epoch": 1.5230886655641775, + "ewc_loss": 0.06477229297161102, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030836748192086816, + "grad_norm": 7.462618827819824, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8696943521499634, + "num_tokens": 456869143.0, + "step": 11973 + }, + { + "epoch": 1.523215875842768, + "ewc_loss": 0.06528175622224808, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031346207833848894, + "grad_norm": 7.652013301849365, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8606472611427307, + "num_tokens": 456906465.0, + "step": 11974 + }, + { + "epoch": 1.5233430861213586, + "ewc_loss": 0.06483817845582962, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003090263344347477, + "grad_norm": 7.606917858123779, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8616994023323059, + "num_tokens": 456942048.0, + "step": 11975 + }, + { + "epoch": 1.5234702963999491, + "ewc_loss": 0.06500299274921417, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003106744261458516, + "grad_norm": 7.522192478179932, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8805372714996338, + "num_tokens": 456975710.0, + "step": 11976 + }, + { + "epoch": 1.5235975066785397, + "ewc_loss": 0.06494294106960297, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003100739559158683, + "grad_norm": 7.510673522949219, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.852046549320221, + "num_tokens": 457022295.0, + "step": 11977 + }, + { + "epoch": 1.5237247169571302, + "ewc_loss": 0.06494541466236115, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031009866506792605, + "grad_norm": 7.5285444259643555, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8543698191642761, + "num_tokens": 457060433.0, + "step": 11978 + }, + { + "epoch": 1.5238519272357207, + "ewc_loss": 0.06511121988296509, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031175671028904617, + "grad_norm": 7.568079948425293, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8693264722824097, + "num_tokens": 457101760.0, + "step": 11979 + }, + { + "epoch": 1.5239791375143112, + "ewc_loss": 0.06501835584640503, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031082809437066317, + "grad_norm": 7.923186779022217, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.878382682800293, + "num_tokens": 457142676.0, + "step": 11980 + }, + { + "epoch": 1.5241063477929018, + "ewc_loss": 0.06453988701105118, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030604342464357615, + "grad_norm": 7.431778907775879, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8702593445777893, + "num_tokens": 457179507.0, + "step": 11981 + }, + { + "epoch": 1.5242335580714923, + "ewc_loss": 0.06546041369438171, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152487042825669, + "grad_norm": 7.699978828430176, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8652698993682861, + "num_tokens": 457221563.0, + "step": 11982 + }, + { + "epoch": 1.5243607683500828, + "ewc_loss": 0.0644354447722435, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003049990045838058, + "grad_norm": 7.476903438568115, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8609529733657837, + "num_tokens": 457258634.0, + "step": 11983 + }, + { + "epoch": 1.5244879786286734, + "ewc_loss": 0.0652443990111351, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031308853067457676, + "grad_norm": 7.650600433349609, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8774386048316956, + "num_tokens": 457293097.0, + "step": 11984 + }, + { + "epoch": 1.5246151889072637, + "ewc_loss": 0.06469839066267014, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030762841925024986, + "grad_norm": 7.435703754425049, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8776465654373169, + "num_tokens": 457333525.0, + "step": 11985 + }, + { + "epoch": 1.5247423991858542, + "ewc_loss": 0.0651671290397644, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031231579487212, + "grad_norm": 7.627165794372559, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8494086265563965, + "num_tokens": 457379772.0, + "step": 11986 + }, + { + "epoch": 1.5248696094644447, + "ewc_loss": 0.0647161453962326, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030780595261603594, + "grad_norm": 7.479700088500977, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8684215545654297, + "num_tokens": 457415852.0, + "step": 11987 + }, + { + "epoch": 1.5249968197430352, + "ewc_loss": 0.06509481370449066, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000311592681100592, + "grad_norm": 7.631494522094727, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8680738210678101, + "num_tokens": 457449771.0, + "step": 11988 + }, + { + "epoch": 1.5251240300216258, + "ewc_loss": 0.06452221423387527, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030830808100290596, + "grad_norm": 7.476325511932373, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8594319224357605, + "num_tokens": 457489601.0, + "step": 11989 + }, + { + "epoch": 1.5252512403002163, + "ewc_loss": 0.0648534819483757, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003116207371931523, + "grad_norm": 7.591561317443848, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8770793676376343, + "num_tokens": 457530516.0, + "step": 11990 + }, + { + "epoch": 1.5253784505788066, + "ewc_loss": 0.06464460492134094, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030953201348893344, + "grad_norm": 7.517775535583496, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8585073947906494, + "num_tokens": 457568299.0, + "step": 11991 + }, + { + "epoch": 1.5255056608573971, + "ewc_loss": 0.06476354598999023, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031072142883203924, + "grad_norm": 7.625212669372559, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8613981604576111, + "num_tokens": 457600476.0, + "step": 11992 + }, + { + "epoch": 1.5256328711359877, + "ewc_loss": 0.06454695016145706, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030855543445795774, + "grad_norm": 7.498073577880859, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8827317357063293, + "num_tokens": 457632024.0, + "step": 11993 + }, + { + "epoch": 1.5257600814145782, + "ewc_loss": 0.06486392766237259, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.000311725219944492, + "grad_norm": 7.595030784606934, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8688735961914062, + "num_tokens": 457670805.0, + "step": 11994 + }, + { + "epoch": 1.5258872916931687, + "ewc_loss": 0.06452754139900208, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030836131190881133, + "grad_norm": 7.54829740524292, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8638319969177246, + "num_tokens": 457702468.0, + "step": 11995 + }, + { + "epoch": 1.5260145019717593, + "ewc_loss": 0.0646735280752182, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030982124735601246, + "grad_norm": 7.463446140289307, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8729696273803711, + "num_tokens": 457740443.0, + "step": 11996 + }, + { + "epoch": 1.5261417122503498, + "ewc_loss": 0.06475803256034851, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003106662770733237, + "grad_norm": 7.6041741371154785, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8658760190010071, + "num_tokens": 457775833.0, + "step": 11997 + }, + { + "epoch": 1.5262689225289403, + "ewc_loss": 0.06454227864742279, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003085087228100747, + "grad_norm": 7.5006866455078125, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8684573173522949, + "num_tokens": 457815850.0, + "step": 11998 + }, + { + "epoch": 1.5263961328075308, + "ewc_loss": 0.06470172107219696, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031010317616164684, + "grad_norm": 7.50991153717041, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8703542947769165, + "num_tokens": 457854946.0, + "step": 11999 + }, + { + "epoch": 1.5265233430861214, + "ewc_loss": 0.06473188102245331, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031040472094900906, + "grad_norm": 7.645497798919678, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8685977458953857, + "num_tokens": 457890775.0, + "step": 12000 + }, + { + "epoch": 1.526650553364712, + "ewc_loss": 0.06455270946025848, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003086130600422621, + "grad_norm": 7.504673957824707, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8682626485824585, + "num_tokens": 457928143.0, + "step": 12001 + }, + { + "epoch": 1.5267777636433024, + "ewc_loss": 0.06486410647630692, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031172699527814984, + "grad_norm": 7.581584930419922, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.870235800743103, + "num_tokens": 457965999.0, + "step": 12002 + }, + { + "epoch": 1.526904973921893, + "ewc_loss": 0.06450927257537842, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003081786853726953, + "grad_norm": 7.53548526763916, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8625107407569885, + "num_tokens": 458006638.0, + "step": 12003 + }, + { + "epoch": 1.5270321842004835, + "ewc_loss": 0.06464944779872894, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030958044226281345, + "grad_norm": 7.54888391494751, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8642726540565491, + "num_tokens": 458047522.0, + "step": 12004 + }, + { + "epoch": 1.527159394479074, + "ewc_loss": 0.0646548867225647, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030963480821810663, + "grad_norm": 7.53214168548584, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8641005754470825, + "num_tokens": 458091219.0, + "step": 12005 + }, + { + "epoch": 1.5272866047576645, + "ewc_loss": 0.0646541491150856, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003096274449490011, + "grad_norm": 7.542876243591309, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8524978160858154, + "num_tokens": 458129208.0, + "step": 12006 + }, + { + "epoch": 1.527413815036255, + "ewc_loss": 0.06478791683912277, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031096511520445347, + "grad_norm": 7.529720783233643, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8584342002868652, + "num_tokens": 458163632.0, + "step": 12007 + }, + { + "epoch": 1.5275410253148456, + "ewc_loss": 0.06472143530845642, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031030032550916076, + "grad_norm": 7.529527187347412, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8569707870483398, + "num_tokens": 458202377.0, + "step": 12008 + }, + { + "epoch": 1.527668235593436, + "ewc_loss": 0.06473109871149063, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003103969502262771, + "grad_norm": 7.550758361816406, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.862655758857727, + "num_tokens": 458239901.0, + "step": 12009 + }, + { + "epoch": 1.5277954458720264, + "ewc_loss": 0.06498944759368896, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000310538976918906, + "grad_norm": 7.51835298538208, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8572455644607544, + "num_tokens": 458280291.0, + "step": 12010 + }, + { + "epoch": 1.527922656150617, + "ewc_loss": 0.06481928378343582, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003112787671852857, + "grad_norm": 7.601523399353027, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8691707849502563, + "num_tokens": 458316931.0, + "step": 12011 + }, + { + "epoch": 1.5280498664292075, + "ewc_loss": 0.06460152566432953, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030910116038285196, + "grad_norm": 7.484158992767334, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8580492734909058, + "num_tokens": 458356477.0, + "step": 12012 + }, + { + "epoch": 1.528177076707798, + "ewc_loss": 0.0649297907948494, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003123838687315583, + "grad_norm": 7.609314918518066, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8488059639930725, + "num_tokens": 458401340.0, + "step": 12013 + }, + { + "epoch": 1.5283042869863885, + "ewc_loss": 0.06486936658620834, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003093381819780916, + "grad_norm": 7.507530689239502, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8688513040542603, + "num_tokens": 458436190.0, + "step": 12014 + }, + { + "epoch": 1.5284314972649788, + "ewc_loss": 0.06491480022668839, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031223392579704523, + "grad_norm": 7.628073692321777, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8634721040725708, + "num_tokens": 458465513.0, + "step": 12015 + }, + { + "epoch": 1.5285587075435694, + "ewc_loss": 0.06460890173912048, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030917496769689023, + "grad_norm": 7.512738227844238, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.860944390296936, + "num_tokens": 458505465.0, + "step": 12016 + }, + { + "epoch": 1.52868591782216, + "ewc_loss": 0.06495417654514313, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003126276715192944, + "grad_norm": 7.550613880157471, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8619716167449951, + "num_tokens": 458545087.0, + "step": 12017 + }, + { + "epoch": 1.5288131281007504, + "ewc_loss": 0.06489112228155136, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030955576221458614, + "grad_norm": 7.54083776473999, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8717236518859863, + "num_tokens": 458583274.0, + "step": 12018 + }, + { + "epoch": 1.528940338379341, + "ewc_loss": 0.06513090431690216, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003119535685982555, + "grad_norm": 7.570407867431641, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8590812683105469, + "num_tokens": 458626026.0, + "step": 12019 + }, + { + "epoch": 1.5290675486579315, + "ewc_loss": 0.0649302750825882, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003099472669418901, + "grad_norm": 7.453954696655273, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8761168122291565, + "num_tokens": 458668774.0, + "step": 12020 + }, + { + "epoch": 1.529194758936522, + "ewc_loss": 0.06488257646560669, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003119116881862283, + "grad_norm": 7.611588001251221, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8682830333709717, + "num_tokens": 458702256.0, + "step": 12021 + }, + { + "epoch": 1.5293219692151125, + "ewc_loss": 0.06467393040657043, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030982523458078504, + "grad_norm": 7.541681289672852, + "learning_rate": 1e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.8372491598129272, + "num_tokens": 458741429.0, + "step": 12022 + }, + { + "epoch": 1.529449179493703, + "ewc_loss": 0.06512965261936188, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031194102484732866, + "grad_norm": 7.565075397491455, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8678703308105469, + "num_tokens": 458775608.0, + "step": 12023 + }, + { + "epoch": 1.5295763897722936, + "ewc_loss": 0.06500031799077988, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031064770882949233, + "grad_norm": 7.572508811950684, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8663769960403442, + "num_tokens": 458811451.0, + "step": 12024 + }, + { + "epoch": 1.5297036000508841, + "ewc_loss": 0.06503189355134964, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003109634853899479, + "grad_norm": 7.576828956604004, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8685833215713501, + "num_tokens": 458851761.0, + "step": 12025 + }, + { + "epoch": 1.5298308103294747, + "ewc_loss": 0.06488093733787537, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003094538697041571, + "grad_norm": 7.5132575035095215, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8562449216842651, + "num_tokens": 458888948.0, + "step": 12026 + }, + { + "epoch": 1.5299580206080652, + "ewc_loss": 0.06512869894504547, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031193147879093885, + "grad_norm": 7.607366561889648, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.861430287361145, + "num_tokens": 458922466.0, + "step": 12027 + }, + { + "epoch": 1.5300852308866557, + "ewc_loss": 0.06492579728364944, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030990250525064766, + "grad_norm": 7.520881175994873, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8590399026870728, + "num_tokens": 458956947.0, + "step": 12028 + }, + { + "epoch": 1.5302124411652462, + "ewc_loss": 0.06509897112846375, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003116342704743147, + "grad_norm": 7.569631576538086, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8816925287246704, + "num_tokens": 458993634.0, + "step": 12029 + }, + { + "epoch": 1.5303396514438368, + "ewc_loss": 0.06483771651983261, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003090217069257051, + "grad_norm": 7.4609880447387695, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8732802271842957, + "num_tokens": 459034314.0, + "step": 12030 + }, + { + "epoch": 1.5304668617224273, + "ewc_loss": 0.06522864103317261, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003129309043288231, + "grad_norm": 7.5851359367370605, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8697357177734375, + "num_tokens": 459072117.0, + "step": 12031 + }, + { + "epoch": 1.5305940720010178, + "ewc_loss": 0.06482532620429993, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030889775371178985, + "grad_norm": 7.491574287414551, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.858331561088562, + "num_tokens": 459112382.0, + "step": 12032 + }, + { + "epoch": 1.5307212822796084, + "ewc_loss": 0.0652252584695816, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128971147816628, + "grad_norm": 7.607002258300781, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8759106397628784, + "num_tokens": 459148973.0, + "step": 12033 + }, + { + "epoch": 1.5308484925581987, + "ewc_loss": 0.06488808244466782, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003095253778155893, + "grad_norm": 7.504039287567139, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8603233695030212, + "num_tokens": 459183242.0, + "step": 12034 + }, + { + "epoch": 1.5309757028367892, + "ewc_loss": 0.06517325341701508, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000312377029331401, + "grad_norm": 7.525312900543213, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8817347288131714, + "num_tokens": 459219606.0, + "step": 12035 + }, + { + "epoch": 1.5311029131153797, + "ewc_loss": 0.06511780619621277, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031182257225736976, + "grad_norm": 7.570931911468506, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8819403648376465, + "num_tokens": 459253763.0, + "step": 12036 + }, + { + "epoch": 1.5312301233939702, + "ewc_loss": 0.06503263860940933, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031097090686671436, + "grad_norm": 7.545953273773193, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8671093583106995, + "num_tokens": 459291187.0, + "step": 12037 + }, + { + "epoch": 1.5313573336725608, + "ewc_loss": 0.06512577831745148, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003119023167528212, + "grad_norm": 7.565241813659668, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8605178594589233, + "num_tokens": 459327423.0, + "step": 12038 + }, + { + "epoch": 1.5314845439511513, + "ewc_loss": 0.06497402489185333, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003103847848251462, + "grad_norm": 7.52724552154541, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8602504730224609, + "num_tokens": 459366207.0, + "step": 12039 + }, + { + "epoch": 1.5316117542297416, + "ewc_loss": 0.0650680661201477, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003113251877948642, + "grad_norm": 7.547508716583252, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8558222055435181, + "num_tokens": 459408338.0, + "step": 12040 + }, + { + "epoch": 1.5317389645083321, + "ewc_loss": 0.06501241773366928, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031076872255653143, + "grad_norm": 7.516663074493408, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8734414577484131, + "num_tokens": 459443420.0, + "step": 12041 + }, + { + "epoch": 1.5318661747869227, + "ewc_loss": 0.06504601240158081, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031110463896766305, + "grad_norm": 7.616635799407959, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8468340635299683, + "num_tokens": 459475835.0, + "step": 12042 + }, + { + "epoch": 1.5319933850655132, + "ewc_loss": 0.06487197428941727, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003093642881140113, + "grad_norm": 7.548011779785156, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8607410192489624, + "num_tokens": 459511993.0, + "step": 12043 + }, + { + "epoch": 1.5321205953441037, + "ewc_loss": 0.06504905223846436, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003111350815743208, + "grad_norm": 7.556296348571777, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8715299367904663, + "num_tokens": 459546221.0, + "step": 12044 + }, + { + "epoch": 1.5322478056226942, + "ewc_loss": 0.06490524113178253, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030969694489613175, + "grad_norm": 7.440184593200684, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8677382469177246, + "num_tokens": 459587271.0, + "step": 12045 + }, + { + "epoch": 1.5323750159012848, + "ewc_loss": 0.06500668078660965, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003107113589067012, + "grad_norm": 7.513878345489502, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8527820706367493, + "num_tokens": 459623388.0, + "step": 12046 + }, + { + "epoch": 1.5325022261798753, + "ewc_loss": 0.06501328945159912, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031077745370566845, + "grad_norm": 7.521463394165039, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8649312257766724, + "num_tokens": 459666440.0, + "step": 12047 + }, + { + "epoch": 1.5326294364584658, + "ewc_loss": 0.06506402790546417, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031128484988585114, + "grad_norm": 7.596258640289307, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8508204221725464, + "num_tokens": 459708373.0, + "step": 12048 + }, + { + "epoch": 1.5327566467370564, + "ewc_loss": 0.0650118887424469, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003107634256593883, + "grad_norm": 7.495826244354248, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8736090064048767, + "num_tokens": 459746707.0, + "step": 12049 + }, + { + "epoch": 1.532883857015647, + "ewc_loss": 0.06519369781017303, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031258154194802046, + "grad_norm": 7.5941057205200195, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8695709109306335, + "num_tokens": 459792493.0, + "step": 12050 + }, + { + "epoch": 1.5330110672942374, + "ewc_loss": 0.0649116188287735, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030976071138866246, + "grad_norm": 7.507035732269287, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8714986443519592, + "num_tokens": 459828897.0, + "step": 12051 + }, + { + "epoch": 1.533138277572828, + "ewc_loss": 0.06522557139396667, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031290019978769124, + "grad_norm": 7.5723347663879395, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8550419211387634, + "num_tokens": 459866274.0, + "step": 12052 + }, + { + "epoch": 1.5332654878514185, + "ewc_loss": 0.06501112878322601, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003107558295596391, + "grad_norm": 7.525959014892578, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8792273998260498, + "num_tokens": 459902442.0, + "step": 12053 + }, + { + "epoch": 1.533392698130009, + "ewc_loss": 0.06522531807422638, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128976677544415, + "grad_norm": 7.540387153625488, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8712822198867798, + "num_tokens": 459943331.0, + "step": 12054 + }, + { + "epoch": 1.5335199084085995, + "ewc_loss": 0.06507772207260132, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003114217834081501, + "grad_norm": 7.580098628997803, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8697226047515869, + "num_tokens": 459975349.0, + "step": 12055 + }, + { + "epoch": 1.53364711868719, + "ewc_loss": 0.06502526998519897, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031089724507182837, + "grad_norm": 7.724410057067871, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8772686719894409, + "num_tokens": 460008295.0, + "step": 12056 + }, + { + "epoch": 1.5337743289657806, + "ewc_loss": 0.06490713357925415, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003097158332820982, + "grad_norm": 7.497687339782715, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8681555390357971, + "num_tokens": 460045892.0, + "step": 12057 + }, + { + "epoch": 1.533901539244371, + "ewc_loss": 0.0649511069059372, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003125969669781625, + "grad_norm": 7.591933250427246, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8644145727157593, + "num_tokens": 460084255.0, + "step": 12058 + }, + { + "epoch": 1.5340287495229614, + "ewc_loss": 0.06466639041900635, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00030974988476373255, + "grad_norm": 7.498880386352539, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8504140377044678, + "num_tokens": 460123706.0, + "step": 12059 + }, + { + "epoch": 1.534155959801552, + "ewc_loss": 0.06508229672908783, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003114675055257976, + "grad_norm": 7.57478141784668, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8505599498748779, + "num_tokens": 460162351.0, + "step": 12060 + }, + { + "epoch": 1.5342831700801425, + "ewc_loss": 0.0647779256105423, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003108652017544955, + "grad_norm": 7.5522871017456055, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8549752831459045, + "num_tokens": 460200135.0, + "step": 12061 + }, + { + "epoch": 1.534410380358733, + "ewc_loss": 0.06483824551105499, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031146834953688085, + "grad_norm": 7.564709663391113, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8584098219871521, + "num_tokens": 460243585.0, + "step": 12062 + }, + { + "epoch": 1.5345375906373235, + "ewc_loss": 0.06473557651042938, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003104416828136891, + "grad_norm": 7.56850004196167, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8564277291297913, + "num_tokens": 460283695.0, + "step": 12063 + }, + { + "epoch": 1.5346648009159138, + "ewc_loss": 0.06503289937973022, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003109735553152859, + "grad_norm": 7.582296848297119, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8724560141563416, + "num_tokens": 460317241.0, + "step": 12064 + }, + { + "epoch": 1.5347920111945044, + "ewc_loss": 0.06498797982931137, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003105243085883558, + "grad_norm": 7.539321422576904, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8664402961730957, + "num_tokens": 460351900.0, + "step": 12065 + }, + { + "epoch": 1.534919221473095, + "ewc_loss": 0.0650624930858612, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031126945395953953, + "grad_norm": 7.632867336273193, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8652892112731934, + "num_tokens": 460391672.0, + "step": 12066 + }, + { + "epoch": 1.5350464317516854, + "ewc_loss": 0.06489967554807663, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000309641269268468, + "grad_norm": 7.550901412963867, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8562225103378296, + "num_tokens": 460434120.0, + "step": 12067 + }, + { + "epoch": 1.535173642030276, + "ewc_loss": 0.06503555178642273, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003110000106971711, + "grad_norm": 7.6209940910339355, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8656356930732727, + "num_tokens": 460474214.0, + "step": 12068 + }, + { + "epoch": 1.5353008523088665, + "ewc_loss": 0.06481508165597916, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003087953373324126, + "grad_norm": 7.5888752937316895, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8698657751083374, + "num_tokens": 460507549.0, + "step": 12069 + }, + { + "epoch": 1.535428062587457, + "ewc_loss": 0.06501298397779465, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031077436869964004, + "grad_norm": 7.638484954833984, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8568724393844604, + "num_tokens": 460546391.0, + "step": 12070 + }, + { + "epoch": 1.5355552728660475, + "ewc_loss": 0.06485623121261597, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003092068072874099, + "grad_norm": 7.567378520965576, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8531337976455688, + "num_tokens": 460585232.0, + "step": 12071 + }, + { + "epoch": 1.535682483144638, + "ewc_loss": 0.06490553170442581, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030969982617534697, + "grad_norm": 7.560897350311279, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8709917068481445, + "num_tokens": 460624398.0, + "step": 12072 + }, + { + "epoch": 1.5358096934232286, + "ewc_loss": 0.06497159600257874, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031036045402288437, + "grad_norm": 7.596020221710205, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8564090132713318, + "num_tokens": 460667193.0, + "step": 12073 + }, + { + "epoch": 1.5359369037018191, + "ewc_loss": 0.064727783203125, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003079223388340324, + "grad_norm": 7.5105061531066895, + "learning_rate": 1e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8403262495994568, + "num_tokens": 460710614.0, + "step": 12074 + }, + { + "epoch": 1.5360641139804097, + "ewc_loss": 0.06504224240779877, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003110669204033911, + "grad_norm": 7.611040115356445, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8594462871551514, + "num_tokens": 460751079.0, + "step": 12075 + }, + { + "epoch": 1.5361913242590002, + "ewc_loss": 0.06477470695972443, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030839157989248633, + "grad_norm": 7.516308307647705, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8639719486236572, + "num_tokens": 460786768.0, + "step": 12076 + }, + { + "epoch": 1.5363185345375907, + "ewc_loss": 0.06503774225711823, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000311021925881505, + "grad_norm": 7.581105709075928, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8538088798522949, + "num_tokens": 460824757.0, + "step": 12077 + }, + { + "epoch": 1.5364457448161812, + "ewc_loss": 0.06493134796619415, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003099579771514982, + "grad_norm": 7.522549152374268, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.85700523853302, + "num_tokens": 460865024.0, + "step": 12078 + }, + { + "epoch": 1.5365729550947718, + "ewc_loss": 0.0650375634431839, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003110201796516776, + "grad_norm": 7.676303386688232, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.854379415512085, + "num_tokens": 460900895.0, + "step": 12079 + }, + { + "epoch": 1.5367001653733623, + "ewc_loss": 0.06474713981151581, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003081159375142306, + "grad_norm": 7.477458477020264, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8688725233078003, + "num_tokens": 460936876.0, + "step": 12080 + }, + { + "epoch": 1.5368273756519528, + "ewc_loss": 0.06523922085762024, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003130366967525333, + "grad_norm": 7.645618915557861, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8475666046142578, + "num_tokens": 460975297.0, + "step": 12081 + }, + { + "epoch": 1.5369545859305433, + "ewc_loss": 0.06474731117486954, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003081176255363971, + "grad_norm": 7.540275573730469, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8520733118057251, + "num_tokens": 461008603.0, + "step": 12082 + }, + { + "epoch": 1.5370817962091337, + "ewc_loss": 0.06507525593042374, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003113970742560923, + "grad_norm": 7.61724853515625, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8544855117797852, + "num_tokens": 461047770.0, + "step": 12083 + }, + { + "epoch": 1.5372090064877242, + "ewc_loss": 0.06494712829589844, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003101158363278955, + "grad_norm": 7.5503926277160645, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8504596948623657, + "num_tokens": 461088554.0, + "step": 12084 + }, + { + "epoch": 1.5373362167663147, + "ewc_loss": 0.06506423652172089, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031128685805015266, + "grad_norm": 7.616519451141357, + "learning_rate": 1e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.841086745262146, + "num_tokens": 461126070.0, + "step": 12085 + }, + { + "epoch": 1.5374634270449052, + "ewc_loss": 0.06492634117603302, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030990797677077353, + "grad_norm": 7.561070919036865, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8602988719940186, + "num_tokens": 461166394.0, + "step": 12086 + }, + { + "epoch": 1.5375906373234958, + "ewc_loss": 0.06501539051532745, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003107984666712582, + "grad_norm": 7.56319522857666, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8552494049072266, + "num_tokens": 461208263.0, + "step": 12087 + }, + { + "epoch": 1.5377178476020863, + "ewc_loss": 0.06490470468997955, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003096915897913277, + "grad_norm": 7.516026973724365, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8737668395042419, + "num_tokens": 461245054.0, + "step": 12088 + }, + { + "epoch": 1.5378450578806766, + "ewc_loss": 0.06500855833292007, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003107301308773458, + "grad_norm": 7.61667537689209, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.853119969367981, + "num_tokens": 461281584.0, + "step": 12089 + }, + { + "epoch": 1.5379722681592671, + "ewc_loss": 0.06485120207071304, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003091565740760416, + "grad_norm": 7.481183052062988, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.871985912322998, + "num_tokens": 461322247.0, + "step": 12090 + }, + { + "epoch": 1.5380994784378577, + "ewc_loss": 0.06512872874736786, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031193182803690434, + "grad_norm": 7.639829158782959, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8507623672485352, + "num_tokens": 461358371.0, + "step": 12091 + }, + { + "epoch": 1.5382266887164482, + "ewc_loss": 0.06475970894098282, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030824163695797324, + "grad_norm": 7.4594502449035645, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8699073791503906, + "num_tokens": 461394666.0, + "step": 12092 + }, + { + "epoch": 1.5383538989950387, + "ewc_loss": 0.065206378698349, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031270828912965953, + "grad_norm": 7.610706806182861, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8760658502578735, + "num_tokens": 461430875.0, + "step": 12093 + }, + { + "epoch": 1.5384811092736292, + "ewc_loss": 0.06475619971752167, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003082065668422729, + "grad_norm": 7.476381301879883, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8590370416641235, + "num_tokens": 461467412.0, + "step": 12094 + }, + { + "epoch": 1.5386083195522198, + "ewc_loss": 0.06519947946071625, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003126392839476466, + "grad_norm": 7.605221271514893, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.841505765914917, + "num_tokens": 461501970.0, + "step": 12095 + }, + { + "epoch": 1.5387355298308103, + "ewc_loss": 0.06487724184989929, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000309416966047138, + "grad_norm": 7.5131402015686035, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8509800434112549, + "num_tokens": 461542735.0, + "step": 12096 + }, + { + "epoch": 1.5388627401094008, + "ewc_loss": 0.06515917181968689, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031223625410348177, + "grad_norm": 7.568002223968506, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8654683232307434, + "num_tokens": 461583883.0, + "step": 12097 + }, + { + "epoch": 1.5389899503879914, + "ewc_loss": 0.0649237260222435, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030988178332336247, + "grad_norm": 7.52374792098999, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8552793264389038, + "num_tokens": 461625124.0, + "step": 12098 + }, + { + "epoch": 1.5391171606665819, + "ewc_loss": 0.06509796530008316, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031162420054897666, + "grad_norm": 7.602773666381836, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8533221483230591, + "num_tokens": 461660273.0, + "step": 12099 + }, + { + "epoch": 1.5392443709451724, + "ewc_loss": 0.06522040069103241, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003104071074631065, + "grad_norm": 7.576075553894043, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8579161167144775, + "num_tokens": 461698890.0, + "step": 12100 + }, + { + "epoch": 1.539371581223763, + "ewc_loss": 0.06508633494377136, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031150784343481064, + "grad_norm": 7.543608665466309, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8633890151977539, + "num_tokens": 461736927.0, + "step": 12101 + }, + { + "epoch": 1.5394987915023535, + "ewc_loss": 0.06526131927967072, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003108163073193282, + "grad_norm": 7.551703929901123, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8591822385787964, + "num_tokens": 461777796.0, + "step": 12102 + }, + { + "epoch": 1.539626001780944, + "ewc_loss": 0.06508036702871323, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031144818058237433, + "grad_norm": 7.633404731750488, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8721230030059814, + "num_tokens": 461814656.0, + "step": 12103 + }, + { + "epoch": 1.5397532120595345, + "ewc_loss": 0.0648869127035141, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003095136780757457, + "grad_norm": 7.502352714538574, + "learning_rate": 1e-06, + "loss": 0.5348, + "mean_token_accuracy": 0.8448752164840698, + "num_tokens": 461852186.0, + "step": 12104 + }, + { + "epoch": 1.539880422338125, + "ewc_loss": 0.06513824313879013, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031202693935483694, + "grad_norm": 7.567927837371826, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.876408576965332, + "num_tokens": 461893724.0, + "step": 12105 + }, + { + "epoch": 1.5400076326167156, + "ewc_loss": 0.06495708227157593, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031021531322039664, + "grad_norm": 7.527782917022705, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8499727249145508, + "num_tokens": 461935380.0, + "step": 12106 + }, + { + "epoch": 1.5401348428953059, + "ewc_loss": 0.06510110199451447, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000311655574478209, + "grad_norm": 7.597415924072266, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8693366050720215, + "num_tokens": 461971463.0, + "step": 12107 + }, + { + "epoch": 1.5402620531738964, + "ewc_loss": 0.0649518370628357, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003101628681179136, + "grad_norm": 7.505504608154297, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8695955276489258, + "num_tokens": 462007514.0, + "step": 12108 + }, + { + "epoch": 1.540389263452487, + "ewc_loss": 0.0651073306798935, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031171785667538643, + "grad_norm": 7.604644298553467, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8764220476150513, + "num_tokens": 462046022.0, + "step": 12109 + }, + { + "epoch": 1.5405164737310775, + "ewc_loss": 0.06487594544887543, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030940398573875427, + "grad_norm": 7.471249103546143, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8673005104064941, + "num_tokens": 462089367.0, + "step": 12110 + }, + { + "epoch": 1.540643684009668, + "ewc_loss": 0.0651964545249939, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000312609103275463, + "grad_norm": 7.557314395904541, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8678100109100342, + "num_tokens": 462128558.0, + "step": 12111 + }, + { + "epoch": 1.5407708942882585, + "ewc_loss": 0.06483635306358337, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003090080863330513, + "grad_norm": 7.488224029541016, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8708091974258423, + "num_tokens": 462162639.0, + "step": 12112 + }, + { + "epoch": 1.5408981045668488, + "ewc_loss": 0.06505730748176575, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003112175618298352, + "grad_norm": 7.588188648223877, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8630102276802063, + "num_tokens": 462199303.0, + "step": 12113 + }, + { + "epoch": 1.5410253148454394, + "ewc_loss": 0.06484980881214142, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003091425751335919, + "grad_norm": 7.487762928009033, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8592628836631775, + "num_tokens": 462246095.0, + "step": 12114 + }, + { + "epoch": 1.54115252512403, + "ewc_loss": 0.06513969600200653, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031204152037389576, + "grad_norm": 7.615482330322266, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8586643934249878, + "num_tokens": 462286653.0, + "step": 12115 + }, + { + "epoch": 1.5412797354026204, + "ewc_loss": 0.0647847130894661, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030849166796542704, + "grad_norm": 7.4654717445373535, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8599293231964111, + "num_tokens": 462328295.0, + "step": 12116 + }, + { + "epoch": 1.541406945681211, + "ewc_loss": 0.06519676744937897, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031261215917766094, + "grad_norm": 7.583558559417725, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8636254072189331, + "num_tokens": 462365175.0, + "step": 12117 + }, + { + "epoch": 1.5415341559598015, + "ewc_loss": 0.06494753062725067, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031011979444883764, + "grad_norm": 7.508269309997559, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8551452159881592, + "num_tokens": 462407044.0, + "step": 12118 + }, + { + "epoch": 1.541661366238392, + "ewc_loss": 0.06515862792730331, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031223081168718636, + "grad_norm": 7.673670291900635, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8557816743850708, + "num_tokens": 462443819.0, + "step": 12119 + }, + { + "epoch": 1.5417885765169825, + "ewc_loss": 0.06490597873926163, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003097043081652373, + "grad_norm": 7.46243953704834, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8724290132522583, + "num_tokens": 462487319.0, + "step": 12120 + }, + { + "epoch": 1.541915786795573, + "ewc_loss": 0.06532062590122223, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031385078909806907, + "grad_norm": 7.594547748565674, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.864490270614624, + "num_tokens": 462527026.0, + "step": 12121 + }, + { + "epoch": 1.5420429970741636, + "ewc_loss": 0.06486237049102783, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003092682163696736, + "grad_norm": 7.556379795074463, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.856627345085144, + "num_tokens": 462557552.0, + "step": 12122 + }, + { + "epoch": 1.5421702073527541, + "ewc_loss": 0.06511124223470688, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003117569431196898, + "grad_norm": 7.540976047515869, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8651643991470337, + "num_tokens": 462595243.0, + "step": 12123 + }, + { + "epoch": 1.5422974176313446, + "ewc_loss": 0.0648878663778305, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003119645989499986, + "grad_norm": 7.5993733406066895, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8651447892189026, + "num_tokens": 462636077.0, + "step": 12124 + }, + { + "epoch": 1.5424246279099352, + "ewc_loss": 0.06497235596179962, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003103681083302945, + "grad_norm": 7.477881908416748, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8750346899032593, + "num_tokens": 462668407.0, + "step": 12125 + }, + { + "epoch": 1.5425518381885257, + "ewc_loss": 0.06532861292362213, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031393065000884235, + "grad_norm": 7.6095290184021, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8630015254020691, + "num_tokens": 462707678.0, + "step": 12126 + }, + { + "epoch": 1.5426790484671162, + "ewc_loss": 0.06498120725154877, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031045658397488296, + "grad_norm": 7.546347618103027, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8561767935752869, + "num_tokens": 462739722.0, + "step": 12127 + }, + { + "epoch": 1.5428062587457068, + "ewc_loss": 0.06528574973344803, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031350203789770603, + "grad_norm": 7.59351921081543, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8695434927940369, + "num_tokens": 462770369.0, + "step": 12128 + }, + { + "epoch": 1.5429334690242973, + "ewc_loss": 0.06501194089651108, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031076392042450607, + "grad_norm": 7.523316860198975, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8697896599769592, + "num_tokens": 462806126.0, + "step": 12129 + }, + { + "epoch": 1.5430606793028878, + "ewc_loss": 0.0652235895395279, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128804382868111, + "grad_norm": 7.576976299285889, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.865281879901886, + "num_tokens": 462846585.0, + "step": 12130 + }, + { + "epoch": 1.5431878895814783, + "ewc_loss": 0.06509828567504883, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000311627343762666, + "grad_norm": 7.501877784729004, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8589264154434204, + "num_tokens": 462889892.0, + "step": 12131 + }, + { + "epoch": 1.5433150998600687, + "ewc_loss": 0.0652221292257309, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128657990600914, + "grad_norm": 7.592281341552734, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8634310960769653, + "num_tokens": 462927612.0, + "step": 12132 + }, + { + "epoch": 1.5434423101386592, + "ewc_loss": 0.06503766775131226, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031102116918191314, + "grad_norm": 7.467072010040283, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8669564723968506, + "num_tokens": 462971661.0, + "step": 12133 + }, + { + "epoch": 1.5435695204172497, + "ewc_loss": 0.06536467373371124, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003142912464682013, + "grad_norm": 7.590616703033447, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8679316639900208, + "num_tokens": 463011260.0, + "step": 12134 + }, + { + "epoch": 1.5436967306958402, + "ewc_loss": 0.0650596022605896, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031124052475206554, + "grad_norm": 7.514570236206055, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8693755865097046, + "num_tokens": 463045395.0, + "step": 12135 + }, + { + "epoch": 1.5438239409744308, + "ewc_loss": 0.06539466977119446, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031459121964871883, + "grad_norm": 7.613429546356201, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8533666133880615, + "num_tokens": 463081015.0, + "step": 12136 + }, + { + "epoch": 1.5439511512530213, + "ewc_loss": 0.06507542729377747, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003113988204859197, + "grad_norm": 7.507050514221191, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8623824715614319, + "num_tokens": 463115658.0, + "step": 12137 + }, + { + "epoch": 1.5440783615316116, + "ewc_loss": 0.06547880172729492, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031543252407573164, + "grad_norm": 7.592611312866211, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.87357497215271, + "num_tokens": 463152084.0, + "step": 12138 + }, + { + "epoch": 1.5442055718102021, + "ewc_loss": 0.0650482177734375, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031112669967114925, + "grad_norm": 7.510263442993164, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8546180725097656, + "num_tokens": 463190137.0, + "step": 12139 + }, + { + "epoch": 1.5443327820887927, + "ewc_loss": 0.06505927443504333, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003136786981485784, + "grad_norm": 7.586647987365723, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8628960847854614, + "num_tokens": 463227626.0, + "step": 12140 + }, + { + "epoch": 1.5444599923673832, + "ewc_loss": 0.06515979021787643, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003122424241155386, + "grad_norm": 7.593724250793457, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8640931844711304, + "num_tokens": 463266451.0, + "step": 12141 + }, + { + "epoch": 1.5445872026459737, + "ewc_loss": 0.0651969239115715, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003126137889921665, + "grad_norm": 7.576902866363525, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8502340912818909, + "num_tokens": 463305327.0, + "step": 12142 + }, + { + "epoch": 1.5447144129245642, + "ewc_loss": 0.0650922879576683, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031156741897575557, + "grad_norm": 7.618009090423584, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8598009943962097, + "num_tokens": 463345890.0, + "step": 12143 + }, + { + "epoch": 1.5448416232031548, + "ewc_loss": 0.06511951982975006, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031183974351733923, + "grad_norm": 7.568281173706055, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8681439161300659, + "num_tokens": 463381583.0, + "step": 12144 + }, + { + "epoch": 1.5449688334817453, + "ewc_loss": 0.06518875062465668, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000312532065436244, + "grad_norm": 7.576745986938477, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8717920780181885, + "num_tokens": 463414442.0, + "step": 12145 + }, + { + "epoch": 1.5450960437603358, + "ewc_loss": 0.06511881202459335, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003118326421827078, + "grad_norm": 7.58920955657959, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8688755035400391, + "num_tokens": 463449517.0, + "step": 12146 + }, + { + "epoch": 1.5452232540389264, + "ewc_loss": 0.06502170115709305, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031086153467185795, + "grad_norm": 7.561892032623291, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8705943822860718, + "num_tokens": 463487952.0, + "step": 12147 + }, + { + "epoch": 1.5453504643175169, + "ewc_loss": 0.0650579035282135, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031122355721890926, + "grad_norm": 7.612401962280273, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8642832040786743, + "num_tokens": 463520724.0, + "step": 12148 + }, + { + "epoch": 1.5454776745961074, + "ewc_loss": 0.06493356823921204, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030998021247796714, + "grad_norm": 7.480844020843506, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8598794341087341, + "num_tokens": 463564691.0, + "step": 12149 + }, + { + "epoch": 1.545604884874698, + "ewc_loss": 0.06522345542907715, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031287912861444056, + "grad_norm": 7.722167491912842, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8590666055679321, + "num_tokens": 463599037.0, + "step": 12150 + }, + { + "epoch": 1.5457320951532885, + "ewc_loss": 0.06487797200679779, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003094242711085826, + "grad_norm": 7.685016632080078, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8487645387649536, + "num_tokens": 463630744.0, + "step": 12151 + }, + { + "epoch": 1.545859305431879, + "ewc_loss": 0.06490904092788696, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003097349253948778, + "grad_norm": 7.559527397155762, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8597070574760437, + "num_tokens": 463663551.0, + "step": 12152 + }, + { + "epoch": 1.5459865157104695, + "ewc_loss": 0.0651642233133316, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003122868074569851, + "grad_norm": 7.722012042999268, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8748312592506409, + "num_tokens": 463702446.0, + "step": 12153 + }, + { + "epoch": 1.54611372598906, + "ewc_loss": 0.06472653895616531, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030790994060225785, + "grad_norm": 7.566289901733398, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8532692193984985, + "num_tokens": 463739324.0, + "step": 12154 + }, + { + "epoch": 1.5462409362676506, + "ewc_loss": 0.06496205925941467, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031026508077047765, + "grad_norm": 7.710259914398193, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8436704874038696, + "num_tokens": 463780765.0, + "step": 12155 + }, + { + "epoch": 1.5463681465462409, + "ewc_loss": 0.06459954380989075, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003066399658564478, + "grad_norm": 7.48704719543457, + "learning_rate": 1e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8417484760284424, + "num_tokens": 463817727.0, + "step": 12156 + }, + { + "epoch": 1.5464953568248314, + "ewc_loss": 0.06513147801160812, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031195933115668595, + "grad_norm": 7.766456604003906, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8780121207237244, + "num_tokens": 463865793.0, + "step": 12157 + }, + { + "epoch": 1.546622567103422, + "ewc_loss": 0.06443195044994354, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003049640799872577, + "grad_norm": 7.344021320343018, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8643361330032349, + "num_tokens": 463908299.0, + "step": 12158 + }, + { + "epoch": 1.5467497773820125, + "ewc_loss": 0.06554658710956573, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003161104104947299, + "grad_norm": 8.034353256225586, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8588976263999939, + "num_tokens": 463944513.0, + "step": 12159 + }, + { + "epoch": 1.546876987660603, + "ewc_loss": 0.06437153369188309, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030435988446697593, + "grad_norm": 7.302670478820801, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.856177806854248, + "num_tokens": 463982862.0, + "step": 12160 + }, + { + "epoch": 1.5470041979391935, + "ewc_loss": 0.06630700826644897, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032127322629094124, + "grad_norm": 8.138418197631836, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.868435263633728, + "num_tokens": 464023357.0, + "step": 12161 + }, + { + "epoch": 1.5471314082177838, + "ewc_loss": 0.06457823514938354, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003064268676098436, + "grad_norm": 7.303792476654053, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8624782562255859, + "num_tokens": 464071889.0, + "step": 12162 + }, + { + "epoch": 1.5472586184963744, + "ewc_loss": 0.06632383912801743, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032388290856033564, + "grad_norm": 7.914331436157227, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8500977158546448, + "num_tokens": 464112334.0, + "step": 12163 + }, + { + "epoch": 1.5473858287749649, + "ewc_loss": 0.06491637229919434, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00030980826704762876, + "grad_norm": 7.623445987701416, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8579853773117065, + "num_tokens": 464156833.0, + "step": 12164 + }, + { + "epoch": 1.5475130390535554, + "ewc_loss": 0.06569312512874603, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003175757883582264, + "grad_norm": 7.741783142089844, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.864890992641449, + "num_tokens": 464192989.0, + "step": 12165 + }, + { + "epoch": 1.547640249332146, + "ewc_loss": 0.06516645848751068, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031230913009494543, + "grad_norm": 7.538683891296387, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8680980801582336, + "num_tokens": 464232855.0, + "step": 12166 + }, + { + "epoch": 1.5477674596107365, + "ewc_loss": 0.06512385606765747, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.0003143245121464133, + "grad_norm": 7.758171081542969, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8469696044921875, + "num_tokens": 464271860.0, + "step": 12167 + }, + { + "epoch": 1.547894669889327, + "ewc_loss": 0.06516711413860321, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031231564935296774, + "grad_norm": 7.555657386779785, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8750438690185547, + "num_tokens": 464312925.0, + "step": 12168 + }, + { + "epoch": 1.5480218801679175, + "ewc_loss": 0.06536631286144257, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003143076610285789, + "grad_norm": 7.683815002441406, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8575117588043213, + "num_tokens": 464349190.0, + "step": 12169 + }, + { + "epoch": 1.548149090446508, + "ewc_loss": 0.06495622545480728, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003102067857980728, + "grad_norm": 7.510753631591797, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8580377101898193, + "num_tokens": 464392031.0, + "step": 12170 + }, + { + "epoch": 1.5482763007250986, + "ewc_loss": 0.06532368063926697, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003138813190162182, + "grad_norm": 7.737561225891113, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8574226498603821, + "num_tokens": 464424884.0, + "step": 12171 + }, + { + "epoch": 1.5484035110036891, + "ewc_loss": 0.06496161967515945, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031026071519590914, + "grad_norm": 7.565007209777832, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8571243286132812, + "num_tokens": 464466047.0, + "step": 12172 + }, + { + "epoch": 1.5485307212822796, + "ewc_loss": 0.06531399488449097, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003137844323646277, + "grad_norm": 7.683270454406738, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8616396188735962, + "num_tokens": 464497293.0, + "step": 12173 + }, + { + "epoch": 1.5486579315608702, + "ewc_loss": 0.06497956812381744, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003104402276221663, + "grad_norm": 7.578593730926514, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.848493218421936, + "num_tokens": 464541873.0, + "step": 12174 + }, + { + "epoch": 1.5487851418394607, + "ewc_loss": 0.0652264803647995, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031290933839045465, + "grad_norm": 7.626980781555176, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8618749976158142, + "num_tokens": 464579712.0, + "step": 12175 + }, + { + "epoch": 1.5489123521180512, + "ewc_loss": 0.06471173465251923, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031020326423458755, + "grad_norm": 7.569064140319824, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8603671789169312, + "num_tokens": 464612277.0, + "step": 12176 + }, + { + "epoch": 1.5490395623966418, + "ewc_loss": 0.06511567533016205, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031180126825347543, + "grad_norm": 7.630436420440674, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8590591549873352, + "num_tokens": 464647062.0, + "step": 12177 + }, + { + "epoch": 1.5491667726752323, + "ewc_loss": 0.06507052481174469, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003113498096354306, + "grad_norm": 7.56147575378418, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8653925657272339, + "num_tokens": 464677998.0, + "step": 12178 + }, + { + "epoch": 1.5492939829538228, + "ewc_loss": 0.06515234708786011, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031216800562106073, + "grad_norm": 7.635814189910889, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8548703789710999, + "num_tokens": 464719205.0, + "step": 12179 + }, + { + "epoch": 1.5494211932324133, + "ewc_loss": 0.06489188969135284, + "ewc_loss_diag": 3.361701965332031e-05, + "ewc_loss_parallel": 0.00031200487865135074, + "grad_norm": 7.550657749176025, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8664131164550781, + "num_tokens": 464760758.0, + "step": 12180 + }, + { + "epoch": 1.5495484035110036, + "ewc_loss": 0.06529618799686432, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031360643333755434, + "grad_norm": 7.667292594909668, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8655434846878052, + "num_tokens": 464797422.0, + "step": 12181 + }, + { + "epoch": 1.5496756137895942, + "ewc_loss": 0.06512734293937683, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003119180037174374, + "grad_norm": 7.569408416748047, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.864041268825531, + "num_tokens": 464832889.0, + "step": 12182 + }, + { + "epoch": 1.5498028240681847, + "ewc_loss": 0.06537346541881561, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031437919824384153, + "grad_norm": 7.690522193908691, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8563390970230103, + "num_tokens": 464867743.0, + "step": 12183 + }, + { + "epoch": 1.5499300343467752, + "ewc_loss": 0.06509969383478165, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003116414591204375, + "grad_norm": 7.577707767486572, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8499709963798523, + "num_tokens": 464904666.0, + "step": 12184 + }, + { + "epoch": 1.5500572446253658, + "ewc_loss": 0.06546735763549805, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031531814602203667, + "grad_norm": 7.633565425872803, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8665130138397217, + "num_tokens": 464942886.0, + "step": 12185 + }, + { + "epoch": 1.5501844549039563, + "ewc_loss": 0.06525476276874542, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031319219851866364, + "grad_norm": 7.61328125, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8632118105888367, + "num_tokens": 464977463.0, + "step": 12186 + }, + { + "epoch": 1.5503116651825466, + "ewc_loss": 0.06536507606506348, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003142953210044652, + "grad_norm": 7.585108757019043, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8717584013938904, + "num_tokens": 465015732.0, + "step": 12187 + }, + { + "epoch": 1.5504388754611371, + "ewc_loss": 0.06521793454885483, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128238604404032, + "grad_norm": 7.532059669494629, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8722965717315674, + "num_tokens": 465060789.0, + "step": 12188 + }, + { + "epoch": 1.5505660857397277, + "ewc_loss": 0.06542380899190903, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003148826363030821, + "grad_norm": 7.642253875732422, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8599903583526611, + "num_tokens": 465097907.0, + "step": 12189 + }, + { + "epoch": 1.5506932960183182, + "ewc_loss": 0.06525968015193939, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031324135488830507, + "grad_norm": 7.584141731262207, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8704225420951843, + "num_tokens": 465139401.0, + "step": 12190 + }, + { + "epoch": 1.5508205062969087, + "ewc_loss": 0.06539735943078995, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031461811158806086, + "grad_norm": 7.675184726715088, + "learning_rate": 1e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8406664133071899, + "num_tokens": 465176531.0, + "step": 12191 + }, + { + "epoch": 1.5509477165754992, + "ewc_loss": 0.06523026525974274, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031294714426621795, + "grad_norm": 7.530834674835205, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8722485303878784, + "num_tokens": 465216074.0, + "step": 12192 + }, + { + "epoch": 1.5510749268540898, + "ewc_loss": 0.06553182005882263, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003159627376589924, + "grad_norm": 7.644096851348877, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8767346143722534, + "num_tokens": 465257986.0, + "step": 12193 + }, + { + "epoch": 1.5512021371326803, + "ewc_loss": 0.06515857577323914, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003122302587144077, + "grad_norm": 7.558651447296143, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.865382730960846, + "num_tokens": 465296038.0, + "step": 12194 + }, + { + "epoch": 1.5513293474112708, + "ewc_loss": 0.06549330055713654, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003155774902552366, + "grad_norm": 7.625808238983154, + "learning_rate": 1e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8388454914093018, + "num_tokens": 465334215.0, + "step": 12195 + }, + { + "epoch": 1.5514565576898613, + "ewc_loss": 0.06527939438819885, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000313438504235819, + "grad_norm": 7.586404323577881, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8611859083175659, + "num_tokens": 465373362.0, + "step": 12196 + }, + { + "epoch": 1.5515837679684519, + "ewc_loss": 0.06546282023191452, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152727149426937, + "grad_norm": 7.639764308929443, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8573622107505798, + "num_tokens": 465410181.0, + "step": 12197 + }, + { + "epoch": 1.5517109782470424, + "ewc_loss": 0.06535224616527557, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000314166973112151, + "grad_norm": 7.634049892425537, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8701168298721313, + "num_tokens": 465443056.0, + "step": 12198 + }, + { + "epoch": 1.551838188525633, + "ewc_loss": 0.06542982161045074, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031494273571297526, + "grad_norm": 7.5942864418029785, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8659957051277161, + "num_tokens": 465484509.0, + "step": 12199 + }, + { + "epoch": 1.5519653988042235, + "ewc_loss": 0.06536586582660675, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031430323724634945, + "grad_norm": 7.638087749481201, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8611969947814941, + "num_tokens": 465520017.0, + "step": 12200 + }, + { + "epoch": 1.552092609082814, + "ewc_loss": 0.0653364360332489, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003140088520012796, + "grad_norm": 7.573685646057129, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8791822195053101, + "num_tokens": 465557221.0, + "step": 12201 + }, + { + "epoch": 1.5522198193614045, + "ewc_loss": 0.06546159088611603, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152604040224105, + "grad_norm": 7.629650115966797, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8718140721321106, + "num_tokens": 465595455.0, + "step": 12202 + }, + { + "epoch": 1.552347029639995, + "ewc_loss": 0.065256267786026, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003132071578875184, + "grad_norm": 7.596351623535156, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.863377034664154, + "num_tokens": 465636436.0, + "step": 12203 + }, + { + "epoch": 1.5524742399185856, + "ewc_loss": 0.06537474691867828, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031439203303307295, + "grad_norm": 7.635273456573486, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8598715662956238, + "num_tokens": 465668956.0, + "step": 12204 + }, + { + "epoch": 1.5526014501971759, + "ewc_loss": 0.0653112381696701, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003137569292448461, + "grad_norm": 7.620522975921631, + "learning_rate": 1e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8439565896987915, + "num_tokens": 465708812.0, + "step": 12205 + }, + { + "epoch": 1.5527286604757664, + "ewc_loss": 0.06525436043739319, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003131881821900606, + "grad_norm": 7.636564254760742, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8511655330657959, + "num_tokens": 465742610.0, + "step": 12206 + }, + { + "epoch": 1.552855870754357, + "ewc_loss": 0.06522396206855774, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128841344732791, + "grad_norm": 7.597461223602295, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8623071908950806, + "num_tokens": 465776468.0, + "step": 12207 + }, + { + "epoch": 1.5529830810329475, + "ewc_loss": 0.06543947011232376, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003150392440147698, + "grad_norm": 7.726154804229736, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8783882856369019, + "num_tokens": 465809300.0, + "step": 12208 + }, + { + "epoch": 1.553110291311538, + "ewc_loss": 0.06507278978824615, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003113724524155259, + "grad_norm": 7.474616050720215, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8641766309738159, + "num_tokens": 465849847.0, + "step": 12209 + }, + { + "epoch": 1.5532375015901285, + "ewc_loss": 0.06582753360271454, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003164784866385162, + "grad_norm": 7.729119300842285, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8736085891723633, + "num_tokens": 465884986.0, + "step": 12210 + }, + { + "epoch": 1.5533647118687188, + "ewc_loss": 0.06510975956916809, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003117421583738178, + "grad_norm": 7.558941841125488, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8720033168792725, + "num_tokens": 465919710.0, + "step": 12211 + }, + { + "epoch": 1.5534919221473094, + "ewc_loss": 0.06541161984205246, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031476072035729885, + "grad_norm": 7.663213729858398, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8543654680252075, + "num_tokens": 465956041.0, + "step": 12212 + }, + { + "epoch": 1.5536191324258999, + "ewc_loss": 0.06515319645404816, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031217647483572364, + "grad_norm": 7.552380084991455, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.873589038848877, + "num_tokens": 465994352.0, + "step": 12213 + }, + { + "epoch": 1.5537463427044904, + "ewc_loss": 0.0654202252626419, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003148468385916203, + "grad_norm": 7.675713539123535, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8639187216758728, + "num_tokens": 466032493.0, + "step": 12214 + }, + { + "epoch": 1.553873552983081, + "ewc_loss": 0.06525583565235138, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003132029378321022, + "grad_norm": 7.6496100425720215, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8639642596244812, + "num_tokens": 466067048.0, + "step": 12215 + }, + { + "epoch": 1.5540007632616715, + "ewc_loss": 0.06556244194507599, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031382753513753414, + "grad_norm": 7.666979789733887, + "learning_rate": 1e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8398951888084412, + "num_tokens": 466106895.0, + "step": 12216 + }, + { + "epoch": 1.554127973540262, + "ewc_loss": 0.06516920775175095, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031233663321472704, + "grad_norm": 7.600594520568848, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8486975431442261, + "num_tokens": 466146740.0, + "step": 12217 + }, + { + "epoch": 1.5542551838188525, + "ewc_loss": 0.0652323067188263, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031296760425902903, + "grad_norm": 7.590035438537598, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8696103692054749, + "num_tokens": 466187097.0, + "step": 12218 + }, + { + "epoch": 1.554382394097443, + "ewc_loss": 0.06530047208070755, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003136492450721562, + "grad_norm": 7.660286903381348, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8523832559585571, + "num_tokens": 466223969.0, + "step": 12219 + }, + { + "epoch": 1.5545096043760336, + "ewc_loss": 0.06509536504745483, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003115982108283788, + "grad_norm": 7.574582099914551, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8743582963943481, + "num_tokens": 466260127.0, + "step": 12220 + }, + { + "epoch": 1.554636814654624, + "ewc_loss": 0.06534754484891891, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003141199704259634, + "grad_norm": 7.595503807067871, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8687152862548828, + "num_tokens": 466297219.0, + "step": 12221 + }, + { + "epoch": 1.5547640249332146, + "ewc_loss": 0.06522241979837418, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128687385469675, + "grad_norm": 7.710033416748047, + "learning_rate": 1e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8436948657035828, + "num_tokens": 466329474.0, + "step": 12222 + }, + { + "epoch": 1.5548912352118052, + "ewc_loss": 0.06511226296424866, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031176715856418014, + "grad_norm": 7.572946548461914, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8611994385719299, + "num_tokens": 466372801.0, + "step": 12223 + }, + { + "epoch": 1.5550184454903957, + "ewc_loss": 0.06536489725112915, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000314293458359316, + "grad_norm": 7.67236852645874, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8698848485946655, + "num_tokens": 466412950.0, + "step": 12224 + }, + { + "epoch": 1.5551456557689862, + "ewc_loss": 0.06512413918972015, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003118859604001045, + "grad_norm": 7.63484001159668, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.868561863899231, + "num_tokens": 466448830.0, + "step": 12225 + }, + { + "epoch": 1.5552728660475768, + "ewc_loss": 0.06529112160205841, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003135557344648987, + "grad_norm": 7.6473565101623535, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8517576456069946, + "num_tokens": 466489749.0, + "step": 12226 + }, + { + "epoch": 1.5554000763261673, + "ewc_loss": 0.06506426632404327, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031128720729611814, + "grad_norm": 7.568999290466309, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8439847230911255, + "num_tokens": 466531320.0, + "step": 12227 + }, + { + "epoch": 1.5555272866047578, + "ewc_loss": 0.06516773253679276, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003123218775726855, + "grad_norm": 7.652745246887207, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.854733943939209, + "num_tokens": 466570001.0, + "step": 12228 + }, + { + "epoch": 1.5556544968833483, + "ewc_loss": 0.06506973505020142, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003113418642897159, + "grad_norm": 7.629319190979004, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8653324246406555, + "num_tokens": 466607186.0, + "step": 12229 + }, + { + "epoch": 1.5557817071619386, + "ewc_loss": 0.06511642038822174, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031180877704173326, + "grad_norm": 7.635609149932861, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8542134165763855, + "num_tokens": 466649480.0, + "step": 12230 + }, + { + "epoch": 1.5559089174405292, + "ewc_loss": 0.0650918111205101, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031156264594756067, + "grad_norm": 7.571898460388184, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8691461682319641, + "num_tokens": 466688861.0, + "step": 12231 + }, + { + "epoch": 1.5560361277191197, + "ewc_loss": 0.06524518132209778, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003130963013973087, + "grad_norm": 7.744680881500244, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8422966003417969, + "num_tokens": 466723200.0, + "step": 12232 + }, + { + "epoch": 1.5561633379977102, + "ewc_loss": 0.06493251770734787, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003099697350990027, + "grad_norm": 7.545372009277344, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8483201265335083, + "num_tokens": 466759639.0, + "step": 12233 + }, + { + "epoch": 1.5562905482763008, + "ewc_loss": 0.0652550756931305, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031319528352469206, + "grad_norm": 7.636727809906006, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8564660549163818, + "num_tokens": 466795748.0, + "step": 12234 + }, + { + "epoch": 1.5564177585548913, + "ewc_loss": 0.06494718790054321, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003101164475083351, + "grad_norm": 7.536501407623291, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8741699457168579, + "num_tokens": 466832938.0, + "step": 12235 + }, + { + "epoch": 1.5565449688334816, + "ewc_loss": 0.06527132540941238, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000313357770210132, + "grad_norm": 7.705946922302246, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8551084995269775, + "num_tokens": 466868710.0, + "step": 12236 + }, + { + "epoch": 1.5566721791120721, + "ewc_loss": 0.0649639368057251, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003102839400526136, + "grad_norm": 7.525254249572754, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8614749908447266, + "num_tokens": 466903558.0, + "step": 12237 + }, + { + "epoch": 1.5567993893906626, + "ewc_loss": 0.06535986065864563, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003142431378364563, + "grad_norm": 7.682235240936279, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8517305850982666, + "num_tokens": 466946785.0, + "step": 12238 + }, + { + "epoch": 1.5569265996692532, + "ewc_loss": 0.06494592130184174, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031010370003059506, + "grad_norm": 7.509000778198242, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8555692434310913, + "num_tokens": 466988563.0, + "step": 12239 + }, + { + "epoch": 1.5570538099478437, + "ewc_loss": 0.06546126306056976, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152571152895689, + "grad_norm": 7.754688262939453, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.855137825012207, + "num_tokens": 467025447.0, + "step": 12240 + }, + { + "epoch": 1.5571810202264342, + "ewc_loss": 0.0649067759513855, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003097123117186129, + "grad_norm": 7.5615763664245605, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8707176446914673, + "num_tokens": 467060751.0, + "step": 12241 + }, + { + "epoch": 1.5573082305050248, + "ewc_loss": 0.06545689702033997, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152135177515447, + "grad_norm": 7.706404685974121, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8608525991439819, + "num_tokens": 467101889.0, + "step": 12242 + }, + { + "epoch": 1.5574354407836153, + "ewc_loss": 0.06500697135925293, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003107142692897469, + "grad_norm": 7.500947952270508, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8642995953559875, + "num_tokens": 467136782.0, + "step": 12243 + }, + { + "epoch": 1.5575626510622058, + "ewc_loss": 0.06541799008846283, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003148244577459991, + "grad_norm": 7.618800163269043, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8696253895759583, + "num_tokens": 467174137.0, + "step": 12244 + }, + { + "epoch": 1.5576898613407963, + "ewc_loss": 0.06519778072834015, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031262231641449034, + "grad_norm": 7.530786991119385, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8577073812484741, + "num_tokens": 467213919.0, + "step": 12245 + }, + { + "epoch": 1.5578170716193869, + "ewc_loss": 0.06550925225019455, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031573703745380044, + "grad_norm": 7.599149703979492, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8695788979530334, + "num_tokens": 467254860.0, + "step": 12246 + }, + { + "epoch": 1.5579442818979774, + "ewc_loss": 0.06526899337768555, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003133344289381057, + "grad_norm": 7.529528617858887, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8739833235740662, + "num_tokens": 467296806.0, + "step": 12247 + }, + { + "epoch": 1.558071492176568, + "ewc_loss": 0.06552458554506302, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031589038553647697, + "grad_norm": 7.684298515319824, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8688119649887085, + "num_tokens": 467332882.0, + "step": 12248 + }, + { + "epoch": 1.5581987024551585, + "ewc_loss": 0.06524290889501572, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003130736295133829, + "grad_norm": 7.6066975593566895, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8777447938919067, + "num_tokens": 467375088.0, + "step": 12249 + }, + { + "epoch": 1.558325912733749, + "ewc_loss": 0.06532206386327744, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003138651663903147, + "grad_norm": 8.796929359436035, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8612416386604309, + "num_tokens": 467413000.0, + "step": 12250 + }, + { + "epoch": 1.5584531230123395, + "ewc_loss": 0.06504061073064804, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031105062225833535, + "grad_norm": 7.445348739624023, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8775132894515991, + "num_tokens": 467450947.0, + "step": 12251 + }, + { + "epoch": 1.55858033329093, + "ewc_loss": 0.0663185566663742, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003238301142118871, + "grad_norm": 7.887764930725098, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8530412912368774, + "num_tokens": 467493770.0, + "step": 12252 + }, + { + "epoch": 1.5587075435695206, + "ewc_loss": 0.06477651745080948, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003084096824750304, + "grad_norm": 7.405745506286621, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.859804630279541, + "num_tokens": 467531882.0, + "step": 12253 + }, + { + "epoch": 1.5588347538481109, + "ewc_loss": 0.06644189357757568, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003250634472351521, + "grad_norm": 7.9334001541137695, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8882492184638977, + "num_tokens": 467566059.0, + "step": 12254 + }, + { + "epoch": 1.5589619641267014, + "ewc_loss": 0.06512400507926941, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003118846216239035, + "grad_norm": 7.4755706787109375, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8774465322494507, + "num_tokens": 467603734.0, + "step": 12255 + }, + { + "epoch": 1.559089174405292, + "ewc_loss": 0.06621238589286804, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032276843558065593, + "grad_norm": 7.8495635986328125, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8707267045974731, + "num_tokens": 467641224.0, + "step": 12256 + }, + { + "epoch": 1.5592163846838825, + "ewc_loss": 0.0652945339679718, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000313589844154194, + "grad_norm": 7.634835243225098, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8609734773635864, + "num_tokens": 467677576.0, + "step": 12257 + }, + { + "epoch": 1.559343594962473, + "ewc_loss": 0.0657029002904892, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003176735481247306, + "grad_norm": 8.886529922485352, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8703578114509583, + "num_tokens": 467717526.0, + "step": 12258 + }, + { + "epoch": 1.5594708052410635, + "ewc_loss": 0.06543996185064316, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000315044162562117, + "grad_norm": 7.499579429626465, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8580993413925171, + "num_tokens": 467757591.0, + "step": 12259 + }, + { + "epoch": 1.5595980155196538, + "ewc_loss": 0.06639167666435242, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032456134795211256, + "grad_norm": 7.929582118988037, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8495454788208008, + "num_tokens": 467792794.0, + "step": 12260 + }, + { + "epoch": 1.5597252257982444, + "ewc_loss": 0.06506287306547165, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003112732374574989, + "grad_norm": 7.555155277252197, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8566625714302063, + "num_tokens": 467829554.0, + "step": 12261 + }, + { + "epoch": 1.5598524360768349, + "ewc_loss": 0.06610044836997986, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032164898584596813, + "grad_norm": 7.788337230682373, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.847273051738739, + "num_tokens": 467871403.0, + "step": 12262 + }, + { + "epoch": 1.5599796463554254, + "ewc_loss": 0.06541570276021957, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031480155303142965, + "grad_norm": 7.636206150054932, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8417125940322876, + "num_tokens": 467906498.0, + "step": 12263 + }, + { + "epoch": 1.560106856634016, + "ewc_loss": 0.0656178742647171, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003168232215102762, + "grad_norm": 7.679074764251709, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8614511489868164, + "num_tokens": 467946426.0, + "step": 12264 + }, + { + "epoch": 1.5602340669126065, + "ewc_loss": 0.06558188796043396, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031646338175050914, + "grad_norm": 7.752729892730713, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8653319478034973, + "num_tokens": 467979930.0, + "step": 12265 + }, + { + "epoch": 1.560361277191197, + "ewc_loss": 0.06518818438053131, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031252633198164403, + "grad_norm": 7.601191520690918, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8569836616516113, + "num_tokens": 468016347.0, + "step": 12266 + }, + { + "epoch": 1.5604884874697875, + "ewc_loss": 0.06557995080947876, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031644405680708587, + "grad_norm": 7.814255714416504, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8694043159484863, + "num_tokens": 468051446.0, + "step": 12267 + }, + { + "epoch": 1.560615697748378, + "ewc_loss": 0.06497229635715485, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031036752625368536, + "grad_norm": 7.5993828773498535, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8788870573043823, + "num_tokens": 468085713.0, + "step": 12268 + }, + { + "epoch": 1.5607429080269686, + "ewc_loss": 0.06538206338882446, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003144651127513498, + "grad_norm": 7.735693454742432, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8750028610229492, + "num_tokens": 468117863.0, + "step": 12269 + }, + { + "epoch": 1.560870118305559, + "ewc_loss": 0.06496865302324295, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031033105915412307, + "grad_norm": 7.5900187492370605, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8597565293312073, + "num_tokens": 468153041.0, + "step": 12270 + }, + { + "epoch": 1.5609973285841496, + "ewc_loss": 0.06535804271697998, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003142249770462513, + "grad_norm": 7.621627330780029, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8601576089859009, + "num_tokens": 468193624.0, + "step": 12271 + }, + { + "epoch": 1.5611245388627402, + "ewc_loss": 0.06514696776866913, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003121142217423767, + "grad_norm": 7.6205668449401855, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8779647350311279, + "num_tokens": 468227091.0, + "step": 12272 + }, + { + "epoch": 1.5612517491413307, + "ewc_loss": 0.06512634456157684, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003119079628959298, + "grad_norm": 7.5789408683776855, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.858281672000885, + "num_tokens": 468265079.0, + "step": 12273 + }, + { + "epoch": 1.5613789594199212, + "ewc_loss": 0.0652303397655487, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003129479300696403, + "grad_norm": 7.601892471313477, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8596643805503845, + "num_tokens": 468307055.0, + "step": 12274 + }, + { + "epoch": 1.5615061696985117, + "ewc_loss": 0.0651869848370552, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003125143703073263, + "grad_norm": 7.629116535186768, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8570120334625244, + "num_tokens": 468344429.0, + "step": 12275 + }, + { + "epoch": 1.5616333799771023, + "ewc_loss": 0.06524111330509186, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003130556142423302, + "grad_norm": 7.629050254821777, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8611212968826294, + "num_tokens": 468379448.0, + "step": 12276 + }, + { + "epoch": 1.5617605902556928, + "ewc_loss": 0.06521271169185638, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003127716190647334, + "grad_norm": 7.598826885223389, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8653205037117004, + "num_tokens": 468423344.0, + "step": 12277 + }, + { + "epoch": 1.5618878005342833, + "ewc_loss": 0.0652710571885109, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031335509265773, + "grad_norm": 7.616252899169922, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8687921166419983, + "num_tokens": 468461657.0, + "step": 12278 + }, + { + "epoch": 1.5620150108128736, + "ewc_loss": 0.06528163701295853, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003134608850814402, + "grad_norm": 7.580819606781006, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8770812749862671, + "num_tokens": 468500428.0, + "step": 12279 + }, + { + "epoch": 1.5621422210914642, + "ewc_loss": 0.06545016169548035, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003151462005916983, + "grad_norm": 7.681933879852295, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8509625792503357, + "num_tokens": 468536310.0, + "step": 12280 + }, + { + "epoch": 1.5622694313700547, + "ewc_loss": 0.06523796170949936, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031302415300160646, + "grad_norm": 7.605158805847168, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8603973388671875, + "num_tokens": 468577954.0, + "step": 12281 + }, + { + "epoch": 1.5623966416486452, + "ewc_loss": 0.06547608971595764, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031540539930574596, + "grad_norm": 7.702376365661621, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.85458904504776, + "num_tokens": 468612131.0, + "step": 12282 + }, + { + "epoch": 1.5625238519272358, + "ewc_loss": 0.06527464836835861, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003133910067845136, + "grad_norm": 7.61677360534668, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8575439453125, + "num_tokens": 468654022.0, + "step": 12283 + }, + { + "epoch": 1.5626510622058263, + "ewc_loss": 0.06545892357826233, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031523380312137306, + "grad_norm": 7.701948642730713, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8527519702911377, + "num_tokens": 468693120.0, + "step": 12284 + }, + { + "epoch": 1.5627782724844166, + "ewc_loss": 0.06521110981702805, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003127556119579822, + "grad_norm": 7.615970611572266, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.861869215965271, + "num_tokens": 468727695.0, + "step": 12285 + }, + { + "epoch": 1.5629054827630071, + "ewc_loss": 0.06544375419616699, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031508205574937165, + "grad_norm": 7.674808502197266, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.858747661113739, + "num_tokens": 468766705.0, + "step": 12286 + }, + { + "epoch": 1.5630326930415976, + "ewc_loss": 0.06522229313850403, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031286748708225787, + "grad_norm": 7.576127529144287, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8525466322898865, + "num_tokens": 468804555.0, + "step": 12287 + }, + { + "epoch": 1.5631599033201882, + "ewc_loss": 0.0654657632112503, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031530213891528547, + "grad_norm": 7.742644309997559, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8578324317932129, + "num_tokens": 468842495.0, + "step": 12288 + }, + { + "epoch": 1.5632871135987787, + "ewc_loss": 0.06517355889081955, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003123801143374294, + "grad_norm": 7.604363441467285, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8598104119300842, + "num_tokens": 468877574.0, + "step": 12289 + }, + { + "epoch": 1.5634143238773692, + "ewc_loss": 0.06550060212612152, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031565054086968303, + "grad_norm": 7.651402473449707, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8727794885635376, + "num_tokens": 468914573.0, + "step": 12290 + }, + { + "epoch": 1.5635415341559598, + "ewc_loss": 0.06534034758806229, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031404802575707436, + "grad_norm": 7.6593337059021, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8730925917625427, + "num_tokens": 468950943.0, + "step": 12291 + }, + { + "epoch": 1.5636687444345503, + "ewc_loss": 0.06543634086847305, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031500792829319835, + "grad_norm": 7.602928638458252, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8775638937950134, + "num_tokens": 468990007.0, + "step": 12292 + }, + { + "epoch": 1.5637959547131408, + "ewc_loss": 0.06544429808855057, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003150875272694975, + "grad_norm": 7.601774215698242, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8643362522125244, + "num_tokens": 469022461.0, + "step": 12293 + }, + { + "epoch": 1.5639231649917313, + "ewc_loss": 0.06547994911670685, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003154440491925925, + "grad_norm": 7.721022605895996, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8683913946151733, + "num_tokens": 469056396.0, + "step": 12294 + }, + { + "epoch": 1.5640503752703219, + "ewc_loss": 0.06519573926925659, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031260191462934017, + "grad_norm": 7.501145362854004, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8715898394584656, + "num_tokens": 469099464.0, + "step": 12295 + }, + { + "epoch": 1.5641775855489124, + "ewc_loss": 0.06584472954273224, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031909183599054813, + "grad_norm": 7.6944379806518555, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8705898523330688, + "num_tokens": 469139830.0, + "step": 12296 + }, + { + "epoch": 1.564304795827503, + "ewc_loss": 0.06529007852077484, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003135453152935952, + "grad_norm": 7.626949787139893, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.862075686454773, + "num_tokens": 469177838.0, + "step": 12297 + }, + { + "epoch": 1.5644320061060935, + "ewc_loss": 0.06556593626737595, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031630389275960624, + "grad_norm": 7.6462602615356445, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8881359100341797, + "num_tokens": 469218727.0, + "step": 12298 + }, + { + "epoch": 1.564559216384684, + "ewc_loss": 0.0653950423002243, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031459497404284775, + "grad_norm": 7.642330646514893, + "learning_rate": 1e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8409612774848938, + "num_tokens": 469261186.0, + "step": 12299 + }, + { + "epoch": 1.5646864266632745, + "ewc_loss": 0.06534042209386826, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003140487533528358, + "grad_norm": 7.592510223388672, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.870538592338562, + "num_tokens": 469301191.0, + "step": 12300 + }, + { + "epoch": 1.564813636941865, + "ewc_loss": 0.06548871099948883, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031553165172226727, + "grad_norm": 7.671448230743408, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8680019378662109, + "num_tokens": 469340132.0, + "step": 12301 + }, + { + "epoch": 1.5649408472204556, + "ewc_loss": 0.06523998081684113, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000313044321956113, + "grad_norm": 7.563370227813721, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8815813064575195, + "num_tokens": 469376211.0, + "step": 12302 + }, + { + "epoch": 1.5650680574990459, + "ewc_loss": 0.06542407721281052, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031488528475165367, + "grad_norm": 7.605771541595459, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8521761894226074, + "num_tokens": 469413961.0, + "step": 12303 + }, + { + "epoch": 1.5651952677776364, + "ewc_loss": 0.06542038917541504, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003148484684061259, + "grad_norm": 7.635475158691406, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8622909784317017, + "num_tokens": 469449507.0, + "step": 12304 + }, + { + "epoch": 1.565322478056227, + "ewc_loss": 0.06537871807813644, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003144317015539855, + "grad_norm": 7.669896125793457, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8482695817947388, + "num_tokens": 469486324.0, + "step": 12305 + }, + { + "epoch": 1.5654496883348175, + "ewc_loss": 0.0653064027428627, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000313708558678627, + "grad_norm": 7.552550315856934, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8575900197029114, + "num_tokens": 469523799.0, + "step": 12306 + }, + { + "epoch": 1.565576898613408, + "ewc_loss": 0.06551893800497055, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031583389500156045, + "grad_norm": 7.66071891784668, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8486639857292175, + "num_tokens": 469558661.0, + "step": 12307 + }, + { + "epoch": 1.5657041088919985, + "ewc_loss": 0.06535904109477997, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003142349305562675, + "grad_norm": 7.558977127075195, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.861350953578949, + "num_tokens": 469603087.0, + "step": 12308 + }, + { + "epoch": 1.5658313191705888, + "ewc_loss": 0.06552977859973907, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031594233587384224, + "grad_norm": 7.6077165603637695, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8623060584068298, + "num_tokens": 469637344.0, + "step": 12309 + }, + { + "epoch": 1.5659585294491793, + "ewc_loss": 0.06541004776954651, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031474506249651313, + "grad_norm": 7.589855670928955, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8715723752975464, + "num_tokens": 469681127.0, + "step": 12310 + }, + { + "epoch": 1.5660857397277699, + "ewc_loss": 0.06561852246522903, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031682976987212896, + "grad_norm": 7.643312454223633, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8601782321929932, + "num_tokens": 469724238.0, + "step": 12311 + }, + { + "epoch": 1.5662129500063604, + "ewc_loss": 0.06540647149085999, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003147092356812209, + "grad_norm": 7.537820816040039, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8558191061019897, + "num_tokens": 469765967.0, + "step": 12312 + }, + { + "epoch": 1.566340160284951, + "ewc_loss": 0.06578262150287628, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003184707893524319, + "grad_norm": 7.674261569976807, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8642770648002625, + "num_tokens": 469801387.0, + "step": 12313 + }, + { + "epoch": 1.5664673705635415, + "ewc_loss": 0.06543482840061188, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003149928234051913, + "grad_norm": 7.612931728363037, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8637411594390869, + "num_tokens": 469835448.0, + "step": 12314 + }, + { + "epoch": 1.566594580842132, + "ewc_loss": 0.06569521129131317, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031759668490849435, + "grad_norm": 7.642909526824951, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8530324697494507, + "num_tokens": 469876647.0, + "step": 12315 + }, + { + "epoch": 1.5667217911207225, + "ewc_loss": 0.06558400392532349, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031648456933908165, + "grad_norm": 7.582916736602783, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8767772316932678, + "num_tokens": 469918158.0, + "step": 12316 + }, + { + "epoch": 1.566849001399313, + "ewc_loss": 0.06564395129680634, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000317084020934999, + "grad_norm": 7.70546293258667, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8477013111114502, + "num_tokens": 469953691.0, + "step": 12317 + }, + { + "epoch": 1.5669762116779036, + "ewc_loss": 0.06538209319114685, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031446549110114574, + "grad_norm": 7.54632043838501, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8656136393547058, + "num_tokens": 469996174.0, + "step": 12318 + }, + { + "epoch": 1.567103421956494, + "ewc_loss": 0.0658114105463028, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000318758626235649, + "grad_norm": 7.7602925300598145, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.848608136177063, + "num_tokens": 470031371.0, + "step": 12319 + }, + { + "epoch": 1.5672306322350846, + "ewc_loss": 0.06539906561374664, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031463519553653896, + "grad_norm": 7.595661163330078, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.866073489189148, + "num_tokens": 470068783.0, + "step": 12320 + }, + { + "epoch": 1.5673578425136752, + "ewc_loss": 0.0657549649477005, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003181941283401102, + "grad_norm": 7.7215962409973145, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8455460071563721, + "num_tokens": 470103490.0, + "step": 12321 + }, + { + "epoch": 1.5674850527922657, + "ewc_loss": 0.06542515754699707, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003148960822727531, + "grad_norm": 7.621540069580078, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8589330911636353, + "num_tokens": 470137213.0, + "step": 12322 + }, + { + "epoch": 1.5676122630708562, + "ewc_loss": 0.06567953526973724, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003174399316776544, + "grad_norm": 7.696197986602783, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.866346001625061, + "num_tokens": 470171816.0, + "step": 12323 + }, + { + "epoch": 1.5677394733494467, + "ewc_loss": 0.0654771625995636, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031541616772301495, + "grad_norm": 7.583079814910889, + "learning_rate": 1e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8387207388877869, + "num_tokens": 470210517.0, + "step": 12324 + }, + { + "epoch": 1.5678666836280373, + "ewc_loss": 0.06566067039966583, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003172512515448034, + "grad_norm": 7.661779880523682, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8501248955726624, + "num_tokens": 470247085.0, + "step": 12325 + }, + { + "epoch": 1.5679938939066278, + "ewc_loss": 0.0655900090932846, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003165446105413139, + "grad_norm": 7.596297264099121, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8625654578208923, + "num_tokens": 470288483.0, + "step": 12326 + }, + { + "epoch": 1.5681211041852183, + "ewc_loss": 0.06559155881404877, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003165601228829473, + "grad_norm": 7.644272804260254, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8634859919548035, + "num_tokens": 470322173.0, + "step": 12327 + }, + { + "epoch": 1.5682483144638086, + "ewc_loss": 0.06554270535707474, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003160715859849006, + "grad_norm": 7.613821983337402, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8636389970779419, + "num_tokens": 470356617.0, + "step": 12328 + }, + { + "epoch": 1.5683755247423992, + "ewc_loss": 0.06592707335948944, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003174738958477974, + "grad_norm": 7.742903232574463, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8723243474960327, + "num_tokens": 470393095.0, + "step": 12329 + }, + { + "epoch": 1.5685027350209897, + "ewc_loss": 0.06534777581691742, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031412229873239994, + "grad_norm": 7.565868377685547, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8730258345603943, + "num_tokens": 470427785.0, + "step": 12330 + }, + { + "epoch": 1.5686299452995802, + "ewc_loss": 0.06591763347387314, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003173794539179653, + "grad_norm": 7.837743759155273, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8570510149002075, + "num_tokens": 470469971.0, + "step": 12331 + }, + { + "epoch": 1.5687571555781707, + "ewc_loss": 0.0651574656367302, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031221917015500367, + "grad_norm": 7.569601058959961, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.853721022605896, + "num_tokens": 470503935.0, + "step": 12332 + }, + { + "epoch": 1.5688843658567613, + "ewc_loss": 0.06590139865875244, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031721711275167763, + "grad_norm": 7.698604583740234, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8644605875015259, + "num_tokens": 470536014.0, + "step": 12333 + }, + { + "epoch": 1.5690115761353516, + "ewc_loss": 0.06551730632781982, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031337622203864157, + "grad_norm": 7.5293378829956055, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8750967979431152, + "num_tokens": 470575632.0, + "step": 12334 + }, + { + "epoch": 1.569138786413942, + "ewc_loss": 0.06585441529750824, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003167472896166146, + "grad_norm": 7.7313055992126465, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8725622296333313, + "num_tokens": 470611295.0, + "step": 12335 + }, + { + "epoch": 1.5692659966925326, + "ewc_loss": 0.06546423584222794, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003128454845864326, + "grad_norm": 7.539419174194336, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.861583948135376, + "num_tokens": 470646993.0, + "step": 12336 + }, + { + "epoch": 1.5693932069711232, + "ewc_loss": 0.06585340201854706, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003167371905874461, + "grad_norm": 7.753933429718018, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8661267757415771, + "num_tokens": 470680253.0, + "step": 12337 + }, + { + "epoch": 1.5695204172497137, + "ewc_loss": 0.06539798527956009, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003121829649899155, + "grad_norm": 7.52916145324707, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8609493970870972, + "num_tokens": 470716169.0, + "step": 12338 + }, + { + "epoch": 1.5696476275283042, + "ewc_loss": 0.06587556004524231, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031695867073722184, + "grad_norm": 7.818094730377197, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8762024641036987, + "num_tokens": 470754208.0, + "step": 12339 + }, + { + "epoch": 1.5697748378068948, + "ewc_loss": 0.06530243903398514, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003112275153398514, + "grad_norm": 7.466530799865723, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8640276789665222, + "num_tokens": 470801329.0, + "step": 12340 + }, + { + "epoch": 1.5699020480854853, + "ewc_loss": 0.0660349577665329, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003185526584275067, + "grad_norm": 7.733390808105469, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8582146167755127, + "num_tokens": 470838408.0, + "step": 12341 + }, + { + "epoch": 1.5700292583640758, + "ewc_loss": 0.06525428593158722, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003107459924649447, + "grad_norm": 7.518398761749268, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8625255227088928, + "num_tokens": 470874001.0, + "step": 12342 + }, + { + "epoch": 1.5701564686426663, + "ewc_loss": 0.06586312502622604, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031683436827734113, + "grad_norm": 7.66680383682251, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8586357831954956, + "num_tokens": 470916870.0, + "step": 12343 + }, + { + "epoch": 1.5702836789212569, + "ewc_loss": 0.06524057686328888, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031305031734518707, + "grad_norm": 7.589325904846191, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8699573874473572, + "num_tokens": 470953862.0, + "step": 12344 + }, + { + "epoch": 1.5704108891998474, + "ewc_loss": 0.06541655957698822, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031481016776524484, + "grad_norm": 7.559001445770264, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8742141723632812, + "num_tokens": 470991507.0, + "step": 12345 + }, + { + "epoch": 1.570538099478438, + "ewc_loss": 0.0653475821018219, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003141203778795898, + "grad_norm": 7.604255199432373, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8655588626861572, + "num_tokens": 471030109.0, + "step": 12346 + }, + { + "epoch": 1.5706653097570284, + "ewc_loss": 0.06533271074295044, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000313971599098295, + "grad_norm": 7.570084095001221, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.876700758934021, + "num_tokens": 471063829.0, + "step": 12347 + }, + { + "epoch": 1.570792520035619, + "ewc_loss": 0.06539935618638992, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031463810591958463, + "grad_norm": 7.623164176940918, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8532257080078125, + "num_tokens": 471107259.0, + "step": 12348 + }, + { + "epoch": 1.5709197303142095, + "ewc_loss": 0.06525713205337524, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031321580172516406, + "grad_norm": 7.6070098876953125, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8660370111465454, + "num_tokens": 471141516.0, + "step": 12349 + }, + { + "epoch": 1.5710469405928, + "ewc_loss": 0.06544947624206543, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003151393320877105, + "grad_norm": 7.668272972106934, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8632566928863525, + "num_tokens": 471178655.0, + "step": 12350 + }, + { + "epoch": 1.5711741508713906, + "ewc_loss": 0.06507901102304459, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003114346181973815, + "grad_norm": 7.570246696472168, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8747999668121338, + "num_tokens": 471209437.0, + "step": 12351 + }, + { + "epoch": 1.5713013611499809, + "ewc_loss": 0.06535075604915619, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003141520719509572, + "grad_norm": 7.548283100128174, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8786101937294006, + "num_tokens": 471249559.0, + "step": 12352 + }, + { + "epoch": 1.5714285714285714, + "ewc_loss": 0.06558404862880707, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003140435728710145, + "grad_norm": 7.705883979797363, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8681343793869019, + "num_tokens": 471287179.0, + "step": 12353 + }, + { + "epoch": 1.571555781707162, + "ewc_loss": 0.0654529333114624, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003127324744127691, + "grad_norm": 7.554646015167236, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8649677038192749, + "num_tokens": 471328052.0, + "step": 12354 + }, + { + "epoch": 1.5716829919857525, + "ewc_loss": 0.06538917869329453, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031453632982447743, + "grad_norm": 7.622878074645996, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8612101674079895, + "num_tokens": 471362414.0, + "step": 12355 + }, + { + "epoch": 1.571810202264343, + "ewc_loss": 0.06523451954126358, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031298972317017615, + "grad_norm": 7.582744121551514, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8585678339004517, + "num_tokens": 471397910.0, + "step": 12356 + }, + { + "epoch": 1.5719374125429335, + "ewc_loss": 0.06543511152267456, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031499561737291515, + "grad_norm": 7.61929988861084, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8768647313117981, + "num_tokens": 471436951.0, + "step": 12357 + }, + { + "epoch": 1.5720646228215238, + "ewc_loss": 0.06538562476634979, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003120593319181353, + "grad_norm": 7.553597450256348, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.867261528968811, + "num_tokens": 471470139.0, + "step": 12358 + }, + { + "epoch": 1.5721918331001143, + "ewc_loss": 0.06551627069711685, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003158072358928621, + "grad_norm": 7.590043067932129, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8696390390396118, + "num_tokens": 471508591.0, + "step": 12359 + }, + { + "epoch": 1.5723190433787049, + "ewc_loss": 0.06520818173885345, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031272636260837317, + "grad_norm": 7.5698394775390625, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8642418384552002, + "num_tokens": 471549165.0, + "step": 12360 + }, + { + "epoch": 1.5724462536572954, + "ewc_loss": 0.06540000438690186, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031464453786611557, + "grad_norm": 7.583876609802246, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8573627471923828, + "num_tokens": 471589021.0, + "step": 12361 + }, + { + "epoch": 1.572573463935886, + "ewc_loss": 0.06540372967720032, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031468187808059156, + "grad_norm": 7.652628421783447, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8569936156272888, + "num_tokens": 471624263.0, + "step": 12362 + }, + { + "epoch": 1.5727006742144765, + "ewc_loss": 0.0652737021446228, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031338154803961515, + "grad_norm": 7.5929388999938965, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8579661846160889, + "num_tokens": 471663678.0, + "step": 12363 + }, + { + "epoch": 1.572827884493067, + "ewc_loss": 0.06548403948545456, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031548491097055376, + "grad_norm": 7.60056209564209, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8627973794937134, + "num_tokens": 471705887.0, + "step": 12364 + }, + { + "epoch": 1.5729550947716575, + "ewc_loss": 0.06524717807769775, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003131163539364934, + "grad_norm": 7.59885311126709, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8711060881614685, + "num_tokens": 471739796.0, + "step": 12365 + }, + { + "epoch": 1.573082305050248, + "ewc_loss": 0.06541420519351959, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003147865936625749, + "grad_norm": 7.6404571533203125, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8599705100059509, + "num_tokens": 471774723.0, + "step": 12366 + }, + { + "epoch": 1.5732095153288386, + "ewc_loss": 0.06549395620822906, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031314269290305674, + "grad_norm": 7.5654425621032715, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8563706278800964, + "num_tokens": 471814169.0, + "step": 12367 + }, + { + "epoch": 1.573336725607429, + "ewc_loss": 0.06571302562952042, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031533336732536554, + "grad_norm": 7.616664886474609, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.864546537399292, + "num_tokens": 471852367.0, + "step": 12368 + }, + { + "epoch": 1.5734639358860196, + "ewc_loss": 0.06528746336698532, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003135191509500146, + "grad_norm": 7.59874963760376, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.869489312171936, + "num_tokens": 471883438.0, + "step": 12369 + }, + { + "epoch": 1.5735911461646102, + "ewc_loss": 0.06547008454799652, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031534541631117463, + "grad_norm": 7.637022018432617, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8609781265258789, + "num_tokens": 471921051.0, + "step": 12370 + }, + { + "epoch": 1.5737183564432007, + "ewc_loss": 0.0652894601225853, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031353914528153837, + "grad_norm": 7.553378105163574, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8672802448272705, + "num_tokens": 471959685.0, + "step": 12371 + }, + { + "epoch": 1.5738455667217912, + "ewc_loss": 0.06538654118776321, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031450996175408363, + "grad_norm": 7.639886379241943, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8629445433616638, + "num_tokens": 471997345.0, + "step": 12372 + }, + { + "epoch": 1.5739727770003817, + "ewc_loss": 0.06555972993373871, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031380041036754847, + "grad_norm": 7.581270217895508, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8689464330673218, + "num_tokens": 472034455.0, + "step": 12373 + }, + { + "epoch": 1.5740999872789723, + "ewc_loss": 0.06540048122406006, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003146493691019714, + "grad_norm": 7.564130783081055, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8619768619537354, + "num_tokens": 472073630.0, + "step": 12374 + }, + { + "epoch": 1.5742271975575628, + "ewc_loss": 0.0654725730419159, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003153702709823847, + "grad_norm": 7.626440525054932, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8612142205238342, + "num_tokens": 472111556.0, + "step": 12375 + }, + { + "epoch": 1.5743544078361533, + "ewc_loss": 0.06545374542474747, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003151819691993296, + "grad_norm": 7.6210150718688965, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8707337379455566, + "num_tokens": 472148276.0, + "step": 12376 + }, + { + "epoch": 1.5744816181147436, + "ewc_loss": 0.06539417803287506, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031458635930903256, + "grad_norm": 7.682599067687988, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8637537360191345, + "num_tokens": 472181665.0, + "step": 12377 + }, + { + "epoch": 1.5746088283933342, + "ewc_loss": 0.06538733094930649, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003145178488921374, + "grad_norm": 7.661004543304443, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8566486835479736, + "num_tokens": 472222759.0, + "step": 12378 + }, + { + "epoch": 1.5747360386719247, + "ewc_loss": 0.06537282466888428, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031437273719348013, + "grad_norm": 7.628840446472168, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8580851554870605, + "num_tokens": 472260867.0, + "step": 12379 + }, + { + "epoch": 1.5748632489505152, + "ewc_loss": 0.06534866988658905, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031413117540068924, + "grad_norm": 7.628903865814209, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8506730794906616, + "num_tokens": 472300068.0, + "step": 12380 + }, + { + "epoch": 1.5749904592291057, + "ewc_loss": 0.06534045934677124, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003140490734949708, + "grad_norm": 7.705286502838135, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8743510246276855, + "num_tokens": 472336459.0, + "step": 12381 + }, + { + "epoch": 1.5751176695076963, + "ewc_loss": 0.06524249911308289, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031306949676945806, + "grad_norm": 7.637728691101074, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8671029210090637, + "num_tokens": 472376696.0, + "step": 12382 + }, + { + "epoch": 1.5752448797862866, + "ewc_loss": 0.06528431177139282, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003134876606054604, + "grad_norm": 7.62285041809082, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8764147758483887, + "num_tokens": 472414491.0, + "step": 12383 + }, + { + "epoch": 1.575372090064877, + "ewc_loss": 0.06522248685359955, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031286937883123755, + "grad_norm": 7.579167366027832, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8633295297622681, + "num_tokens": 472452188.0, + "step": 12384 + }, + { + "epoch": 1.5754993003434676, + "ewc_loss": 0.06538660079240799, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031451054383069277, + "grad_norm": 7.6520891189575195, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8631199598312378, + "num_tokens": 472489690.0, + "step": 12385 + }, + { + "epoch": 1.5756265106220582, + "ewc_loss": 0.0651792362332344, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003124368959106505, + "grad_norm": 7.559328079223633, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8622632622718811, + "num_tokens": 472529912.0, + "step": 12386 + }, + { + "epoch": 1.5757537209006487, + "ewc_loss": 0.06547179073095322, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003153624420519918, + "grad_norm": 7.703109264373779, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8619768619537354, + "num_tokens": 472562534.0, + "step": 12387 + }, + { + "epoch": 1.5758809311792392, + "ewc_loss": 0.06513049453496933, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003119494649581611, + "grad_norm": 7.635150909423828, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8617228865623474, + "num_tokens": 472598955.0, + "step": 12388 + }, + { + "epoch": 1.5760081414578297, + "ewc_loss": 0.06525260210037231, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031317060347646475, + "grad_norm": 7.617654800415039, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8673690557479858, + "num_tokens": 472638064.0, + "step": 12389 + }, + { + "epoch": 1.5761353517364203, + "ewc_loss": 0.06517037749290466, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031234827474690974, + "grad_norm": 7.571702003479004, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8673632144927979, + "num_tokens": 472676823.0, + "step": 12390 + }, + { + "epoch": 1.5762625620150108, + "ewc_loss": 0.06543388217687607, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003149833355564624, + "grad_norm": 7.6980719566345215, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8623858094215393, + "num_tokens": 472718528.0, + "step": 12391 + }, + { + "epoch": 1.5763897722936013, + "ewc_loss": 0.06515845656394958, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031222912366501987, + "grad_norm": 7.564259052276611, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8695402145385742, + "num_tokens": 472755634.0, + "step": 12392 + }, + { + "epoch": 1.5765169825721919, + "ewc_loss": 0.06543569266796112, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003150014381390065, + "grad_norm": 7.611938953399658, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.875098466873169, + "num_tokens": 472798060.0, + "step": 12393 + }, + { + "epoch": 1.5766441928507824, + "ewc_loss": 0.06526007503271103, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003132452839054167, + "grad_norm": 7.55801248550415, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8684980869293213, + "num_tokens": 472837628.0, + "step": 12394 + }, + { + "epoch": 1.576771403129373, + "ewc_loss": 0.0655151829123497, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031579635106027126, + "grad_norm": 7.673015594482422, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8647257089614868, + "num_tokens": 472877446.0, + "step": 12395 + }, + { + "epoch": 1.5768986134079634, + "ewc_loss": 0.06526327133178711, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031327729811891913, + "grad_norm": 7.649752140045166, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8576294183731079, + "num_tokens": 472918437.0, + "step": 12396 + }, + { + "epoch": 1.577025823686554, + "ewc_loss": 0.06539598852396011, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003146044327877462, + "grad_norm": 7.606505393981934, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8717086911201477, + "num_tokens": 472956242.0, + "step": 12397 + }, + { + "epoch": 1.5771530339651445, + "ewc_loss": 0.06547564268112183, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031540097552351654, + "grad_norm": 7.729191303253174, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8739749193191528, + "num_tokens": 472985620.0, + "step": 12398 + }, + { + "epoch": 1.577280244243735, + "ewc_loss": 0.06525450944900513, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031318963738158345, + "grad_norm": 7.619048118591309, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.867313802242279, + "num_tokens": 473019517.0, + "step": 12399 + }, + { + "epoch": 1.5774074545223256, + "ewc_loss": 0.06544347107410431, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031507923267781734, + "grad_norm": 7.664143085479736, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8830595016479492, + "num_tokens": 473054624.0, + "step": 12400 + }, + { + "epoch": 1.5775346648009159, + "ewc_loss": 0.06519900262355804, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003126345982309431, + "grad_norm": 7.593838691711426, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8547148704528809, + "num_tokens": 473094140.0, + "step": 12401 + }, + { + "epoch": 1.5776618750795064, + "ewc_loss": 0.06540495157241821, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031469407258555293, + "grad_norm": 7.675347328186035, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8467598557472229, + "num_tokens": 473136886.0, + "step": 12402 + }, + { + "epoch": 1.577789085358097, + "ewc_loss": 0.06519724428653717, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031261699041351676, + "grad_norm": 7.593963146209717, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8508918881416321, + "num_tokens": 473173267.0, + "step": 12403 + }, + { + "epoch": 1.5779162956366874, + "ewc_loss": 0.06550422310829163, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003156868042424321, + "grad_norm": 7.700857639312744, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.841978907585144, + "num_tokens": 473216477.0, + "step": 12404 + }, + { + "epoch": 1.578043505915278, + "ewc_loss": 0.06515199691057205, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031216448405757546, + "grad_norm": 7.597659111022949, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8559086322784424, + "num_tokens": 473256986.0, + "step": 12405 + }, + { + "epoch": 1.5781707161938685, + "ewc_loss": 0.06545577198266983, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031520225456915796, + "grad_norm": 7.641603946685791, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8712289333343506, + "num_tokens": 473297967.0, + "step": 12406 + }, + { + "epoch": 1.5782979264724588, + "ewc_loss": 0.06539663672447205, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031461092294193804, + "grad_norm": 7.653045654296875, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8565387725830078, + "num_tokens": 473333580.0, + "step": 12407 + }, + { + "epoch": 1.5784251367510493, + "ewc_loss": 0.06541772931814194, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031482180929742754, + "grad_norm": 7.663781642913818, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8687293529510498, + "num_tokens": 473370553.0, + "step": 12408 + }, + { + "epoch": 1.5785523470296399, + "ewc_loss": 0.06537092477083206, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031435376149602234, + "grad_norm": 7.613165378570557, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.844965934753418, + "num_tokens": 473410705.0, + "step": 12409 + }, + { + "epoch": 1.5786795573082304, + "ewc_loss": 0.06568825244903564, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003150856355205178, + "grad_norm": 7.737393856048584, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8673142194747925, + "num_tokens": 473443315.0, + "step": 12410 + }, + { + "epoch": 1.578806767586821, + "ewc_loss": 0.06526901572942734, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003133346908725798, + "grad_norm": 7.564756393432617, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.854840099811554, + "num_tokens": 473488767.0, + "step": 12411 + }, + { + "epoch": 1.5789339778654115, + "ewc_loss": 0.06546756625175476, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031532018329016864, + "grad_norm": 7.745625972747803, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8783577680587769, + "num_tokens": 473525360.0, + "step": 12412 + }, + { + "epoch": 1.579061188144002, + "ewc_loss": 0.06525932252407074, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003132377751171589, + "grad_norm": 7.584926605224609, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.855346143245697, + "num_tokens": 473560625.0, + "step": 12413 + }, + { + "epoch": 1.5791883984225925, + "ewc_loss": 0.06565525382757187, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031719706021249294, + "grad_norm": 7.698841571807861, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8819268345832825, + "num_tokens": 473595484.0, + "step": 12414 + }, + { + "epoch": 1.579315608701183, + "ewc_loss": 0.06521843373775482, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128288371954113, + "grad_norm": 7.617465496063232, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.861038327217102, + "num_tokens": 473627269.0, + "step": 12415 + }, + { + "epoch": 1.5794428189797736, + "ewc_loss": 0.06553927063941956, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031603724346496165, + "grad_norm": 7.6545329093933105, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8482183218002319, + "num_tokens": 473669159.0, + "step": 12416 + }, + { + "epoch": 1.579570029258364, + "ewc_loss": 0.06536377966403961, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000314282369799912, + "grad_norm": 7.621043682098389, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8741042613983154, + "num_tokens": 473707739.0, + "step": 12417 + }, + { + "epoch": 1.5796972395369546, + "ewc_loss": 0.0654536560177803, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003151810960844159, + "grad_norm": 7.624722003936768, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.864750325679779, + "num_tokens": 473746908.0, + "step": 12418 + }, + { + "epoch": 1.5798244498155452, + "ewc_loss": 0.06535518169403076, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031419636798091233, + "grad_norm": 7.683234691619873, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8610159754753113, + "num_tokens": 473781480.0, + "step": 12419 + }, + { + "epoch": 1.5799516600941357, + "ewc_loss": 0.06540641188621521, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003147086245007813, + "grad_norm": 7.650239944458008, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.863148033618927, + "num_tokens": 473814676.0, + "step": 12420 + }, + { + "epoch": 1.5800788703727262, + "ewc_loss": 0.06543003022670746, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003149448602925986, + "grad_norm": 7.630753517150879, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8583933711051941, + "num_tokens": 473855487.0, + "step": 12421 + }, + { + "epoch": 1.5802060806513167, + "ewc_loss": 0.06550610810518265, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031570560531690717, + "grad_norm": 7.686731338500977, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.87498939037323, + "num_tokens": 473885648.0, + "step": 12422 + }, + { + "epoch": 1.5803332909299073, + "ewc_loss": 0.06534656882286072, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003141102206427604, + "grad_norm": 7.623499870300293, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8600062727928162, + "num_tokens": 473923862.0, + "step": 12423 + }, + { + "epoch": 1.5804605012084978, + "ewc_loss": 0.06545265018939972, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031517105526290834, + "grad_norm": 7.631471157073975, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8668683767318726, + "num_tokens": 473957725.0, + "step": 12424 + }, + { + "epoch": 1.5805877114870883, + "ewc_loss": 0.06527230143547058, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031336749088950455, + "grad_norm": 7.614494323730469, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8632590770721436, + "num_tokens": 473994939.0, + "step": 12425 + }, + { + "epoch": 1.5807149217656786, + "ewc_loss": 0.0654546245932579, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031519075855612755, + "grad_norm": 7.61398458480835, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8594232797622681, + "num_tokens": 474035996.0, + "step": 12426 + }, + { + "epoch": 1.5808421320442692, + "ewc_loss": 0.06560766696929932, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031427983776666224, + "grad_norm": 7.63712215423584, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8721852898597717, + "num_tokens": 474071693.0, + "step": 12427 + }, + { + "epoch": 1.5809693423228597, + "ewc_loss": 0.06527816504240036, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003134261642117053, + "grad_norm": 7.5582685470581055, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8712539672851562, + "num_tokens": 474114415.0, + "step": 12428 + }, + { + "epoch": 1.5810965526014502, + "ewc_loss": 0.06556950509548187, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031633954495191574, + "grad_norm": 7.699057102203369, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8573341369628906, + "num_tokens": 474152060.0, + "step": 12429 + }, + { + "epoch": 1.5812237628800407, + "ewc_loss": 0.06522395461797714, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003128840762656182, + "grad_norm": 7.540848255157471, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8656560182571411, + "num_tokens": 474190605.0, + "step": 12430 + }, + { + "epoch": 1.5813509731586313, + "ewc_loss": 0.06556636840105057, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031630820012651384, + "grad_norm": 7.651875019073486, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8594175577163696, + "num_tokens": 474229447.0, + "step": 12431 + }, + { + "epoch": 1.5814781834372216, + "ewc_loss": 0.06528912484645844, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031353574013337493, + "grad_norm": 7.563209533691406, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.871142566204071, + "num_tokens": 474268088.0, + "step": 12432 + }, + { + "epoch": 1.581605393715812, + "ewc_loss": 0.06555914878845215, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031623602262698114, + "grad_norm": 7.659618377685547, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8588564395904541, + "num_tokens": 474304856.0, + "step": 12433 + }, + { + "epoch": 1.5817326039944026, + "ewc_loss": 0.06530997157096863, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003137442108709365, + "grad_norm": 7.570710182189941, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8675063252449036, + "num_tokens": 474349248.0, + "step": 12434 + }, + { + "epoch": 1.5818598142729932, + "ewc_loss": 0.06555578112602234, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003162022912874818, + "grad_norm": 7.654421806335449, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8744440078735352, + "num_tokens": 474384432.0, + "step": 12435 + }, + { + "epoch": 1.5819870245515837, + "ewc_loss": 0.06532566249370575, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031390110962092876, + "grad_norm": 7.641228199005127, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8606630563735962, + "num_tokens": 474415881.0, + "step": 12436 + }, + { + "epoch": 1.5821142348301742, + "ewc_loss": 0.06542709469795227, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031491543632000685, + "grad_norm": 7.619086742401123, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8630410432815552, + "num_tokens": 474452755.0, + "step": 12437 + }, + { + "epoch": 1.5822414451087647, + "ewc_loss": 0.06533734500408173, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000314017990604043, + "grad_norm": 7.566371440887451, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8587779998779297, + "num_tokens": 474493937.0, + "step": 12438 + }, + { + "epoch": 1.5823686553873553, + "ewc_loss": 0.06547728180885315, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031541730277240276, + "grad_norm": 7.638132095336914, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8696012496948242, + "num_tokens": 474534072.0, + "step": 12439 + }, + { + "epoch": 1.5824958656659458, + "ewc_loss": 0.06519480794668198, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000312592601403594, + "grad_norm": 7.553610801696777, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8557397127151489, + "num_tokens": 474569343.0, + "step": 12440 + }, + { + "epoch": 1.5826230759445363, + "ewc_loss": 0.06549201905727386, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031556468456983566, + "grad_norm": 7.619986057281494, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8538203239440918, + "num_tokens": 474606610.0, + "step": 12441 + }, + { + "epoch": 1.5827502862231269, + "ewc_loss": 0.06531018018722534, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031374633545055985, + "grad_norm": 7.542196750640869, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8538470268249512, + "num_tokens": 474645071.0, + "step": 12442 + }, + { + "epoch": 1.5828774965017174, + "ewc_loss": 0.0655265524983406, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003159100597258657, + "grad_norm": 7.653801441192627, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8517104387283325, + "num_tokens": 474683985.0, + "step": 12443 + }, + { + "epoch": 1.583004706780308, + "ewc_loss": 0.06536059081554413, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003142504720017314, + "grad_norm": 7.587942123413086, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8687723875045776, + "num_tokens": 474726102.0, + "step": 12444 + }, + { + "epoch": 1.5831319170588984, + "ewc_loss": 0.06547635793685913, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031540816416963935, + "grad_norm": 7.575730800628662, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.866848349571228, + "num_tokens": 474766430.0, + "step": 12445 + }, + { + "epoch": 1.583259127337489, + "ewc_loss": 0.06544845551252365, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031512908753938973, + "grad_norm": 7.596390247344971, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8617489337921143, + "num_tokens": 474800648.0, + "step": 12446 + }, + { + "epoch": 1.5833863376160795, + "ewc_loss": 0.06573517620563507, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003155549056828022, + "grad_norm": 8.010433197021484, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8678537011146545, + "num_tokens": 474845540.0, + "step": 12447 + }, + { + "epoch": 1.58351354789467, + "ewc_loss": 0.06502068787813187, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031085143564268947, + "grad_norm": 7.469560623168945, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8610118627548218, + "num_tokens": 474884296.0, + "step": 12448 + }, + { + "epoch": 1.5836407581732606, + "ewc_loss": 0.06578981131315231, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031854264670982957, + "grad_norm": 7.671618938446045, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8683342933654785, + "num_tokens": 474929344.0, + "step": 12449 + }, + { + "epoch": 1.5837679684518509, + "ewc_loss": 0.06501457840204239, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003107903175987303, + "grad_norm": 7.485466480255127, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8687260150909424, + "num_tokens": 474968035.0, + "step": 12450 + }, + { + "epoch": 1.5838951787304414, + "ewc_loss": 0.06581637263298035, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031880821916274726, + "grad_norm": 7.679968357086182, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8573822379112244, + "num_tokens": 475006189.0, + "step": 12451 + }, + { + "epoch": 1.584022389009032, + "ewc_loss": 0.06531975418329239, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031384205794893205, + "grad_norm": 7.572834491729736, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8581534624099731, + "num_tokens": 475039768.0, + "step": 12452 + }, + { + "epoch": 1.5841495992876224, + "ewc_loss": 0.06579797714948654, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031862431205809116, + "grad_norm": 7.658262252807617, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8693261742591858, + "num_tokens": 475079938.0, + "step": 12453 + }, + { + "epoch": 1.584276809566213, + "ewc_loss": 0.06546376645565033, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152821445837617, + "grad_norm": 7.589428424835205, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.85129714012146, + "num_tokens": 475122405.0, + "step": 12454 + }, + { + "epoch": 1.5844040198448035, + "ewc_loss": 0.06569648534059525, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003176093741785735, + "grad_norm": 7.666989326477051, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8567332029342651, + "num_tokens": 475158237.0, + "step": 12455 + }, + { + "epoch": 1.5845312301233938, + "ewc_loss": 0.06550759822130203, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000315720506478101, + "grad_norm": 7.562165260314941, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8642336130142212, + "num_tokens": 475194966.0, + "step": 12456 + }, + { + "epoch": 1.5846584404019843, + "ewc_loss": 0.06573881208896637, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003180326893925667, + "grad_norm": 7.666046142578125, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8621721267700195, + "num_tokens": 475234483.0, + "step": 12457 + }, + { + "epoch": 1.5847856506805749, + "ewc_loss": 0.06554766744375229, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003161212080158293, + "grad_norm": 7.5970869064331055, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.858311653137207, + "num_tokens": 475275172.0, + "step": 12458 + }, + { + "epoch": 1.5849128609591654, + "ewc_loss": 0.0657249316573143, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031789386412128806, + "grad_norm": 7.6264519691467285, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8669782876968384, + "num_tokens": 475313550.0, + "step": 12459 + }, + { + "epoch": 1.585040071237756, + "ewc_loss": 0.06560170650482178, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003166615788359195, + "grad_norm": 7.599571704864502, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.870905876159668, + "num_tokens": 475357019.0, + "step": 12460 + }, + { + "epoch": 1.5851672815163464, + "ewc_loss": 0.06577140092849731, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003183585067745298, + "grad_norm": 7.676957607269287, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8681421279907227, + "num_tokens": 475394788.0, + "step": 12461 + }, + { + "epoch": 1.585294491794937, + "ewc_loss": 0.06560452282428741, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003166897804476321, + "grad_norm": 7.641341686248779, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8608092069625854, + "num_tokens": 475432354.0, + "step": 12462 + }, + { + "epoch": 1.5854217020735275, + "ewc_loss": 0.06574738770723343, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031811840017326176, + "grad_norm": 7.686785697937012, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8673343658447266, + "num_tokens": 475467284.0, + "step": 12463 + }, + { + "epoch": 1.585548912352118, + "ewc_loss": 0.0655616819858551, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003162613429594785, + "grad_norm": 7.639427661895752, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8547030091285706, + "num_tokens": 475505946.0, + "step": 12464 + }, + { + "epoch": 1.5856761226307086, + "ewc_loss": 0.06569309532642365, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031757549731992185, + "grad_norm": 7.627411842346191, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8498700261116028, + "num_tokens": 475542799.0, + "step": 12465 + }, + { + "epoch": 1.585803332909299, + "ewc_loss": 0.06551989912986755, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031584349926561117, + "grad_norm": 7.638316631317139, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8631826639175415, + "num_tokens": 475582064.0, + "step": 12466 + }, + { + "epoch": 1.5859305431878896, + "ewc_loss": 0.06554710865020752, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003161155909765512, + "grad_norm": 7.644408702850342, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8593661785125732, + "num_tokens": 475615521.0, + "step": 12467 + }, + { + "epoch": 1.5860577534664801, + "ewc_loss": 0.06565672159194946, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031721178675070405, + "grad_norm": 7.652101993560791, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8719426989555359, + "num_tokens": 475650500.0, + "step": 12468 + }, + { + "epoch": 1.5861849637450707, + "ewc_loss": 0.06545715034008026, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000315216020680964, + "grad_norm": 7.566844940185547, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8651723861694336, + "num_tokens": 475695824.0, + "step": 12469 + }, + { + "epoch": 1.5863121740236612, + "ewc_loss": 0.06567074358463287, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031735197990201414, + "grad_norm": 7.676025867462158, + "learning_rate": 1e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8408276438713074, + "num_tokens": 475734015.0, + "step": 12470 + }, + { + "epoch": 1.5864393843022517, + "ewc_loss": 0.06546645611524582, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031530909473076463, + "grad_norm": 7.567268371582031, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8613911867141724, + "num_tokens": 475768961.0, + "step": 12471 + }, + { + "epoch": 1.5865665945808423, + "ewc_loss": 0.06576173007488251, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031826188205741346, + "grad_norm": 7.685268402099609, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8786960244178772, + "num_tokens": 475806551.0, + "step": 12472 + }, + { + "epoch": 1.5866938048594328, + "ewc_loss": 0.0654880702495575, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003155251906719059, + "grad_norm": 7.623643398284912, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8577796220779419, + "num_tokens": 475844066.0, + "step": 12473 + }, + { + "epoch": 1.5868210151380233, + "ewc_loss": 0.06567655503749847, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003174100711476058, + "grad_norm": 7.692793846130371, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8670934438705444, + "num_tokens": 475874554.0, + "step": 12474 + }, + { + "epoch": 1.5869482254166136, + "ewc_loss": 0.06541460752487183, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003147906390950084, + "grad_norm": 7.581882953643799, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8690334558486938, + "num_tokens": 475915840.0, + "step": 12475 + }, + { + "epoch": 1.5870754356952042, + "ewc_loss": 0.06566591560840607, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003173036966472864, + "grad_norm": 7.697081089019775, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8617733716964722, + "num_tokens": 475951516.0, + "step": 12476 + }, + { + "epoch": 1.5872026459737947, + "ewc_loss": 0.0653386190533638, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031403073808178306, + "grad_norm": 7.647214412689209, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8670101165771484, + "num_tokens": 475984224.0, + "step": 12477 + }, + { + "epoch": 1.5873298562523852, + "ewc_loss": 0.0655018538236618, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031566311372444034, + "grad_norm": 7.779540538787842, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8417599201202393, + "num_tokens": 476015758.0, + "step": 12478 + }, + { + "epoch": 1.5874570665309757, + "ewc_loss": 0.06533490121364594, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031399354338645935, + "grad_norm": 7.607264995574951, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8591645956039429, + "num_tokens": 476051835.0, + "step": 12479 + }, + { + "epoch": 1.5875842768095663, + "ewc_loss": 0.06553788483142853, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031602333183400333, + "grad_norm": 7.648641586303711, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8600382804870605, + "num_tokens": 476090667.0, + "step": 12480 + }, + { + "epoch": 1.5877114870881566, + "ewc_loss": 0.0653189867734909, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003138344327453524, + "grad_norm": 7.629843235015869, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8612180948257446, + "num_tokens": 476130448.0, + "step": 12481 + }, + { + "epoch": 1.587838697366747, + "ewc_loss": 0.06546129286289215, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152574645355344, + "grad_norm": 7.580874919891357, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8531317710876465, + "num_tokens": 476174495.0, + "step": 12482 + }, + { + "epoch": 1.5879659076453376, + "ewc_loss": 0.0656377375125885, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003170218551531434, + "grad_norm": 7.648446559906006, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8717169165611267, + "num_tokens": 476213412.0, + "step": 12483 + }, + { + "epoch": 1.5880931179239282, + "ewc_loss": 0.06533403694629669, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031398492865264416, + "grad_norm": 7.618836879730225, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8771777153015137, + "num_tokens": 476250441.0, + "step": 12484 + }, + { + "epoch": 1.5882203282025187, + "ewc_loss": 0.06557929515838623, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003164375084452331, + "grad_norm": 7.661888599395752, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8759913444519043, + "num_tokens": 476284466.0, + "step": 12485 + }, + { + "epoch": 1.5883475384811092, + "ewc_loss": 0.06543498486280441, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031499439501203597, + "grad_norm": 7.5980305671691895, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8612411022186279, + "num_tokens": 476327760.0, + "step": 12486 + }, + { + "epoch": 1.5884747487596997, + "ewc_loss": 0.06557271629571915, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031637170468457043, + "grad_norm": 7.583133220672607, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8588769435882568, + "num_tokens": 476369259.0, + "step": 12487 + }, + { + "epoch": 1.5886019590382903, + "ewc_loss": 0.06551385670900345, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000315783079713583, + "grad_norm": 7.647817134857178, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8643625378608704, + "num_tokens": 476403915.0, + "step": 12488 + }, + { + "epoch": 1.5887291693168808, + "ewc_loss": 0.06561675667762756, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003168121329508722, + "grad_norm": 7.674490451812744, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8605407476425171, + "num_tokens": 476440837.0, + "step": 12489 + }, + { + "epoch": 1.5888563795954713, + "ewc_loss": 0.06548351049423218, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031547961407341063, + "grad_norm": 7.6552653312683105, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8620663285255432, + "num_tokens": 476478478.0, + "step": 12490 + }, + { + "epoch": 1.5889835898740619, + "ewc_loss": 0.06560489535331726, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003166934475302696, + "grad_norm": 7.6337456703186035, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8717466592788696, + "num_tokens": 476514675.0, + "step": 12491 + }, + { + "epoch": 1.5891108001526524, + "ewc_loss": 0.06552356481552124, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003158801991958171, + "grad_norm": 7.626720905303955, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8593471050262451, + "num_tokens": 476559015.0, + "step": 12492 + }, + { + "epoch": 1.589238010431243, + "ewc_loss": 0.0655701532959938, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031634606420993805, + "grad_norm": 7.67136812210083, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8476629257202148, + "num_tokens": 476601848.0, + "step": 12493 + }, + { + "epoch": 1.5893652207098334, + "ewc_loss": 0.06550495326519012, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003156940219923854, + "grad_norm": 7.636699199676514, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.870883584022522, + "num_tokens": 476636794.0, + "step": 12494 + }, + { + "epoch": 1.589492430988424, + "ewc_loss": 0.06548614799976349, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031550604035146534, + "grad_norm": 7.681102752685547, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8576300144195557, + "num_tokens": 476671593.0, + "step": 12495 + }, + { + "epoch": 1.5896196412670145, + "ewc_loss": 0.06546644866466522, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003153089783154428, + "grad_norm": 7.677590847015381, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8733717203140259, + "num_tokens": 476706776.0, + "step": 12496 + }, + { + "epoch": 1.589746851545605, + "ewc_loss": 0.0655246302485466, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003158908220939338, + "grad_norm": 7.721221923828125, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8622987270355225, + "num_tokens": 476746700.0, + "step": 12497 + }, + { + "epoch": 1.5898740618241956, + "ewc_loss": 0.06537840515375137, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003144285874441266, + "grad_norm": 7.651239395141602, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8687065839767456, + "num_tokens": 476785837.0, + "step": 12498 + }, + { + "epoch": 1.5900012721027859, + "ewc_loss": 0.06558260321617126, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031647051218897104, + "grad_norm": 7.7074737548828125, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8697940111160278, + "num_tokens": 476818894.0, + "step": 12499 + }, + { + "epoch": 1.5901284823813764, + "ewc_loss": 0.06545823812484741, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003152269637212157, + "grad_norm": 7.63552713394165, + "learning_rate": 1e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8449007272720337, + "num_tokens": 476861773.0, + "step": 12500 + }, + { + "epoch": 1.590255692659967, + "ewc_loss": 0.06562153995037079, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031685997964814305, + "grad_norm": 7.690124034881592, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8684043884277344, + "num_tokens": 476899522.0, + "step": 12501 + }, + { + "epoch": 1.5903829029385574, + "ewc_loss": 0.06556789577007294, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031632345053367317, + "grad_norm": 7.655699253082275, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8559951186180115, + "num_tokens": 476939923.0, + "step": 12502 + }, + { + "epoch": 1.590510113217148, + "ewc_loss": 0.0656610056757927, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003172545984853059, + "grad_norm": 7.688564777374268, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8633095026016235, + "num_tokens": 476979867.0, + "step": 12503 + }, + { + "epoch": 1.5906373234957385, + "ewc_loss": 0.06555772572755814, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003162217908538878, + "grad_norm": 7.695463180541992, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8704972863197327, + "num_tokens": 477016068.0, + "step": 12504 + }, + { + "epoch": 1.5907645337743288, + "ewc_loss": 0.06556154042482376, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003162599459756166, + "grad_norm": 7.6597700119018555, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8594417572021484, + "num_tokens": 477054526.0, + "step": 12505 + }, + { + "epoch": 1.5908917440529193, + "ewc_loss": 0.0655403882265091, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031604841933585703, + "grad_norm": 7.628806114196777, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8577417135238647, + "num_tokens": 477098029.0, + "step": 12506 + }, + { + "epoch": 1.5910189543315099, + "ewc_loss": 0.0656319260597229, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003169638221152127, + "grad_norm": 7.698658466339111, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8693463802337646, + "num_tokens": 477137070.0, + "step": 12507 + }, + { + "epoch": 1.5911461646101004, + "ewc_loss": 0.065479576587677, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003154402947984636, + "grad_norm": 7.591970920562744, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8620384931564331, + "num_tokens": 477175640.0, + "step": 12508 + }, + { + "epoch": 1.591273374888691, + "ewc_loss": 0.06566953659057617, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003173398436047137, + "grad_norm": 7.713069438934326, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8644764423370361, + "num_tokens": 477210935.0, + "step": 12509 + }, + { + "epoch": 1.5914005851672814, + "ewc_loss": 0.06546555459499359, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031530007254332304, + "grad_norm": 7.626251697540283, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8706923723220825, + "num_tokens": 477251567.0, + "step": 12510 + }, + { + "epoch": 1.591527795445872, + "ewc_loss": 0.06574051082134247, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031804959871806204, + "grad_norm": 7.693429946899414, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8648783564567566, + "num_tokens": 477287964.0, + "step": 12511 + }, + { + "epoch": 1.5916550057244625, + "ewc_loss": 0.06554028391838074, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031604734249413013, + "grad_norm": 7.608160972595215, + "learning_rate": 1e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8379597663879395, + "num_tokens": 477328660.0, + "step": 12512 + }, + { + "epoch": 1.591782216003053, + "ewc_loss": 0.06577964127063751, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031844095792621374, + "grad_norm": 7.690032005310059, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8675634264945984, + "num_tokens": 477364111.0, + "step": 12513 + }, + { + "epoch": 1.5919094262816436, + "ewc_loss": 0.06562712788581848, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003169157716911286, + "grad_norm": 7.646524429321289, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8696913123130798, + "num_tokens": 477402691.0, + "step": 12514 + }, + { + "epoch": 1.592036636560234, + "ewc_loss": 0.06572994589805603, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031794403912499547, + "grad_norm": 7.674169063568115, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8771095275878906, + "num_tokens": 477437526.0, + "step": 12515 + }, + { + "epoch": 1.5921638468388246, + "ewc_loss": 0.06573033332824707, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031794782262295485, + "grad_norm": 7.705085277557373, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8616002798080444, + "num_tokens": 477479175.0, + "step": 12516 + }, + { + "epoch": 1.5922910571174151, + "ewc_loss": 0.06551551818847656, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031579966889694333, + "grad_norm": 7.672547340393066, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8653758764266968, + "num_tokens": 477516896.0, + "step": 12517 + }, + { + "epoch": 1.5924182673960057, + "ewc_loss": 0.06568757444620132, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031752028735354543, + "grad_norm": 7.738016128540039, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8532357215881348, + "num_tokens": 477555543.0, + "step": 12518 + }, + { + "epoch": 1.5925454776745962, + "ewc_loss": 0.06551823019981384, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000315826793666929, + "grad_norm": 7.663067817687988, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8749244809150696, + "num_tokens": 477592251.0, + "step": 12519 + }, + { + "epoch": 1.5926726879531867, + "ewc_loss": 0.06576269865036011, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003182714863214642, + "grad_norm": 7.738779067993164, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8495434522628784, + "num_tokens": 477630039.0, + "step": 12520 + }, + { + "epoch": 1.5927998982317773, + "ewc_loss": 0.06547459959983826, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031539052724838257, + "grad_norm": 7.6510186195373535, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8611757159233093, + "num_tokens": 477661028.0, + "step": 12521 + }, + { + "epoch": 1.5929271085103678, + "ewc_loss": 0.06577825546264648, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031842710450291634, + "grad_norm": 7.7176923751831055, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8766664266586304, + "num_tokens": 477699612.0, + "step": 12522 + }, + { + "epoch": 1.5930543187889583, + "ewc_loss": 0.06551562249660492, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003158007748425007, + "grad_norm": 7.637479305267334, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8577845692634583, + "num_tokens": 477737300.0, + "step": 12523 + }, + { + "epoch": 1.5931815290675486, + "ewc_loss": 0.06581553816795349, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031879989546723664, + "grad_norm": 7.728212356567383, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8511589765548706, + "num_tokens": 477776031.0, + "step": 12524 + }, + { + "epoch": 1.5933087393461391, + "ewc_loss": 0.06553030014038086, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003159474872518331, + "grad_norm": 7.616395950317383, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.863562285900116, + "num_tokens": 477813640.0, + "step": 12525 + }, + { + "epoch": 1.5934359496247297, + "ewc_loss": 0.06576179713010788, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031826249323785305, + "grad_norm": 7.759066581726074, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8590070605278015, + "num_tokens": 477848652.0, + "step": 12526 + }, + { + "epoch": 1.5935631599033202, + "ewc_loss": 0.06545063853263855, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003151509154122323, + "grad_norm": 7.644426345825195, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.859836995601654, + "num_tokens": 477883710.0, + "step": 12527 + }, + { + "epoch": 1.5936903701819107, + "ewc_loss": 0.065740205347538, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031804657191969454, + "grad_norm": 7.701478958129883, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8567051887512207, + "num_tokens": 477920316.0, + "step": 12528 + }, + { + "epoch": 1.5938175804605013, + "ewc_loss": 0.06554758548736572, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031612039310857654, + "grad_norm": 7.635963439941406, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8718189001083374, + "num_tokens": 477955523.0, + "step": 12529 + }, + { + "epoch": 1.5939447907390916, + "ewc_loss": 0.065752312541008, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031816770206205547, + "grad_norm": 7.701186180114746, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8544131517410278, + "num_tokens": 477995768.0, + "step": 12530 + }, + { + "epoch": 1.594072001017682, + "ewc_loss": 0.06558537483215332, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031649827724322677, + "grad_norm": 7.662223815917969, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8629724979400635, + "num_tokens": 478037718.0, + "step": 12531 + }, + { + "epoch": 1.5941992112962726, + "ewc_loss": 0.06575165688991547, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031816112459637225, + "grad_norm": 7.685798168182373, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8509961366653442, + "num_tokens": 478077388.0, + "step": 12532 + }, + { + "epoch": 1.5943264215748632, + "ewc_loss": 0.06566303968429565, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031727494206279516, + "grad_norm": 7.695058822631836, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.877214252948761, + "num_tokens": 478109494.0, + "step": 12533 + }, + { + "epoch": 1.5944536318534537, + "ewc_loss": 0.06565225124359131, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031716705416329205, + "grad_norm": 7.67424201965332, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8580687642097473, + "num_tokens": 478142420.0, + "step": 12534 + }, + { + "epoch": 1.5945808421320442, + "ewc_loss": 0.06565512716770172, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031719577964395285, + "grad_norm": 7.657041072845459, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8665463924407959, + "num_tokens": 478183848.0, + "step": 12535 + }, + { + "epoch": 1.5947080524106347, + "ewc_loss": 0.06569291651248932, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003175736637786031, + "grad_norm": 7.704120635986328, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8717201352119446, + "num_tokens": 478225242.0, + "step": 12536 + }, + { + "epoch": 1.5948352626892253, + "ewc_loss": 0.06560452282428741, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003166897804476321, + "grad_norm": 7.644083499908447, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.868835985660553, + "num_tokens": 478263167.0, + "step": 12537 + }, + { + "epoch": 1.5949624729678158, + "ewc_loss": 0.06562989205121994, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031694344943389297, + "grad_norm": 7.72142219543457, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8643838763237, + "num_tokens": 478297752.0, + "step": 12538 + }, + { + "epoch": 1.5950896832464063, + "ewc_loss": 0.06559650599956512, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003166095702908933, + "grad_norm": 7.664690971374512, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8650560975074768, + "num_tokens": 478332955.0, + "step": 12539 + }, + { + "epoch": 1.5952168935249968, + "ewc_loss": 0.06567235291004181, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031736810342408717, + "grad_norm": 7.694931507110596, + "learning_rate": 1e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.841564416885376, + "num_tokens": 478367298.0, + "step": 12540 + }, + { + "epoch": 1.5953441038035874, + "ewc_loss": 0.0656510666012764, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031715520890429616, + "grad_norm": 7.629919528961182, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8668206930160522, + "num_tokens": 478411183.0, + "step": 12541 + }, + { + "epoch": 1.595471314082178, + "ewc_loss": 0.06577470898628235, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003183916269335896, + "grad_norm": 7.690014839172363, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8593758344650269, + "num_tokens": 478451184.0, + "step": 12542 + }, + { + "epoch": 1.5955985243607684, + "ewc_loss": 0.0656290352344513, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003169348929077387, + "grad_norm": 7.703737258911133, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8574427366256714, + "num_tokens": 478490913.0, + "step": 12543 + }, + { + "epoch": 1.595725734639359, + "ewc_loss": 0.06573279947042465, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003179725317750126, + "grad_norm": 7.643965721130371, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8646792769432068, + "num_tokens": 478529769.0, + "step": 12544 + }, + { + "epoch": 1.5958529449179495, + "ewc_loss": 0.0657675713300705, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003183202934451401, + "grad_norm": 7.724893093109131, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8593688011169434, + "num_tokens": 478566665.0, + "step": 12545 + }, + { + "epoch": 1.59598015519654, + "ewc_loss": 0.06550188362598419, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000315663346555084, + "grad_norm": 7.632374286651611, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8904797434806824, + "num_tokens": 478600814.0, + "step": 12546 + }, + { + "epoch": 1.5961073654751305, + "ewc_loss": 0.06584154069423676, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031905993819236755, + "grad_norm": 7.710504055023193, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8672199845314026, + "num_tokens": 478641610.0, + "step": 12547 + }, + { + "epoch": 1.5962345757537209, + "ewc_loss": 0.06548431515693665, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003154876467306167, + "grad_norm": 7.623015880584717, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8708873987197876, + "num_tokens": 478678183.0, + "step": 12548 + }, + { + "epoch": 1.5963617860323114, + "ewc_loss": 0.06580060720443726, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003186506510246545, + "grad_norm": 7.778811454772949, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8636026978492737, + "num_tokens": 478710881.0, + "step": 12549 + }, + { + "epoch": 1.596488996310902, + "ewc_loss": 0.06570690870285034, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031527222017757595, + "grad_norm": 7.634609699249268, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.876776933670044, + "num_tokens": 478748550.0, + "step": 12550 + }, + { + "epoch": 1.5966162065894924, + "ewc_loss": 0.06578521430492401, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031849672086536884, + "grad_norm": 7.684022903442383, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8521857857704163, + "num_tokens": 478790417.0, + "step": 12551 + }, + { + "epoch": 1.596743416868083, + "ewc_loss": 0.06564423441886902, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031708687311038375, + "grad_norm": 7.634881019592285, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8697158098220825, + "num_tokens": 478828856.0, + "step": 12552 + }, + { + "epoch": 1.5968706271466735, + "ewc_loss": 0.06575027853250504, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003181473002769053, + "grad_norm": 7.703314781188965, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8605051040649414, + "num_tokens": 478870240.0, + "step": 12553 + }, + { + "epoch": 1.5969978374252638, + "ewc_loss": 0.06566023826599121, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031724688597023487, + "grad_norm": 7.662763595581055, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8539553880691528, + "num_tokens": 478911920.0, + "step": 12554 + }, + { + "epoch": 1.5971250477038543, + "ewc_loss": 0.06572261452674866, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003178706392645836, + "grad_norm": 7.702978134155273, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8560514450073242, + "num_tokens": 478949083.0, + "step": 12555 + }, + { + "epoch": 1.5972522579824449, + "ewc_loss": 0.06569217145442963, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003175663005094975, + "grad_norm": 7.630654335021973, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8615695238113403, + "num_tokens": 478984087.0, + "step": 12556 + }, + { + "epoch": 1.5973794682610354, + "ewc_loss": 0.0657711774110794, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031835626577958465, + "grad_norm": 7.741876602172852, + "learning_rate": 1e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8431249856948853, + "num_tokens": 479019364.0, + "step": 12557 + }, + { + "epoch": 1.597506678539626, + "ewc_loss": 0.06566721200942993, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031731664785183966, + "grad_norm": 7.669861316680908, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.86600661277771, + "num_tokens": 479059002.0, + "step": 12558 + }, + { + "epoch": 1.5976338888182164, + "ewc_loss": 0.06576088070869446, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003182533837389201, + "grad_norm": 7.7782745361328125, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8626402616500854, + "num_tokens": 479092676.0, + "step": 12559 + }, + { + "epoch": 1.597761099096807, + "ewc_loss": 0.06554220616817474, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003160665510222316, + "grad_norm": 7.921944618225098, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8662174940109253, + "num_tokens": 479125951.0, + "step": 12560 + }, + { + "epoch": 1.5978883093753975, + "ewc_loss": 0.06538373231887817, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031448181835003197, + "grad_norm": 7.580743789672852, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8800453543663025, + "num_tokens": 479166230.0, + "step": 12561 + }, + { + "epoch": 1.598015519653988, + "ewc_loss": 0.06575620174407959, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003182064974680543, + "grad_norm": 7.763637542724609, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8589252829551697, + "num_tokens": 479205813.0, + "step": 12562 + }, + { + "epoch": 1.5981427299325786, + "ewc_loss": 0.06533412635326385, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031398580176755786, + "grad_norm": 7.578670024871826, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8601269721984863, + "num_tokens": 479244960.0, + "step": 12563 + }, + { + "epoch": 1.598269940211169, + "ewc_loss": 0.06595132499933243, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032015779288485646, + "grad_norm": 7.755947589874268, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.870010256767273, + "num_tokens": 479289617.0, + "step": 12564 + }, + { + "epoch": 1.5983971504897596, + "ewc_loss": 0.06555763632059097, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031622088863514364, + "grad_norm": 7.702433109283447, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8622151613235474, + "num_tokens": 479326377.0, + "step": 12565 + }, + { + "epoch": 1.5985243607683501, + "ewc_loss": 0.06587567925453186, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003169599222019315, + "grad_norm": 7.716169834136963, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8618844151496887, + "num_tokens": 479360563.0, + "step": 12566 + }, + { + "epoch": 1.5986515710469407, + "ewc_loss": 0.0657958835363388, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031616195337846875, + "grad_norm": 7.642806529998779, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8745508193969727, + "num_tokens": 479403042.0, + "step": 12567 + }, + { + "epoch": 1.5987787813255312, + "ewc_loss": 0.06589409708976746, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031714406213723123, + "grad_norm": 7.692376613616943, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8689600229263306, + "num_tokens": 479437099.0, + "step": 12568 + }, + { + "epoch": 1.5989059916041217, + "ewc_loss": 0.06561708450317383, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003168153343722224, + "grad_norm": 7.662644386291504, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8604474067687988, + "num_tokens": 479479326.0, + "step": 12569 + }, + { + "epoch": 1.5990332018827123, + "ewc_loss": 0.06576188653707504, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003182633954565972, + "grad_norm": 7.795847415924072, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8432720899581909, + "num_tokens": 479515320.0, + "step": 12570 + }, + { + "epoch": 1.5991604121613028, + "ewc_loss": 0.06551024317741394, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031574699096381664, + "grad_norm": 7.679617881774902, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8526875972747803, + "num_tokens": 479559960.0, + "step": 12571 + }, + { + "epoch": 1.5992876224398933, + "ewc_loss": 0.06575404852628708, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003181850188411772, + "grad_norm": 7.730029582977295, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8628520369529724, + "num_tokens": 479599115.0, + "step": 12572 + }, + { + "epoch": 1.5994148327184836, + "ewc_loss": 0.06558136641979218, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003164582303725183, + "grad_norm": 7.643112659454346, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8722464442253113, + "num_tokens": 479636116.0, + "step": 12573 + }, + { + "epoch": 1.5995420429970741, + "ewc_loss": 0.06573039293289185, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003179484629072249, + "grad_norm": 7.752134323120117, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.853654682636261, + "num_tokens": 479676880.0, + "step": 12574 + }, + { + "epoch": 1.5996692532756647, + "ewc_loss": 0.06560520827770233, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003166965616401285, + "grad_norm": 7.653135299682617, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8643410205841064, + "num_tokens": 479717424.0, + "step": 12575 + }, + { + "epoch": 1.5997964635542552, + "ewc_loss": 0.06575556099414825, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003182001528330147, + "grad_norm": 7.753425121307373, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8620040416717529, + "num_tokens": 479756240.0, + "step": 12576 + }, + { + "epoch": 1.5999236738328457, + "ewc_loss": 0.06552299857139587, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003158744948450476, + "grad_norm": 7.664506435394287, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8489047288894653, + "num_tokens": 479791863.0, + "step": 12577 + }, + { + "epoch": 1.6000508841114363, + "ewc_loss": 0.065730981528759, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031795434188097715, + "grad_norm": 7.808770179748535, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8726320862770081, + "num_tokens": 479827252.0, + "step": 12578 + }, + { + "epoch": 1.6001780943900266, + "ewc_loss": 0.06572364270687103, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031543956720270216, + "grad_norm": 7.68982458114624, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8606606721878052, + "num_tokens": 479863100.0, + "step": 12579 + }, + { + "epoch": 1.600305304668617, + "ewc_loss": 0.06566151976585388, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031725969165563583, + "grad_norm": 7.744101524353027, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8520129919052124, + "num_tokens": 479899684.0, + "step": 12580 + }, + { + "epoch": 1.6004325149472076, + "ewc_loss": 0.0654432475566864, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003150769916828722, + "grad_norm": 7.630610942840576, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8651805520057678, + "num_tokens": 479937656.0, + "step": 12581 + }, + { + "epoch": 1.6005597252257981, + "ewc_loss": 0.06571083515882492, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031775288516655564, + "grad_norm": 7.756958484649658, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8575873970985413, + "num_tokens": 479973357.0, + "step": 12582 + }, + { + "epoch": 1.6006869355043887, + "ewc_loss": 0.06545137614011765, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031515827868133783, + "grad_norm": 7.617152690887451, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8644933104515076, + "num_tokens": 480017862.0, + "step": 12583 + }, + { + "epoch": 1.6008141457829792, + "ewc_loss": 0.06586569547653198, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003193015290889889, + "grad_norm": 7.7916154861450195, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8687176704406738, + "num_tokens": 480057039.0, + "step": 12584 + }, + { + "epoch": 1.6009413560615697, + "ewc_loss": 0.06548405438661575, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003154850855935365, + "grad_norm": 7.6620707511901855, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8660361170768738, + "num_tokens": 480093396.0, + "step": 12585 + }, + { + "epoch": 1.6010685663401603, + "ewc_loss": 0.06581676751375198, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031881220638751984, + "grad_norm": 7.779869079589844, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8591861724853516, + "num_tokens": 480125200.0, + "step": 12586 + }, + { + "epoch": 1.6011957766187508, + "ewc_loss": 0.06550611555576324, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031570569262839854, + "grad_norm": 7.717119216918945, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8599671721458435, + "num_tokens": 480158085.0, + "step": 12587 + }, + { + "epoch": 1.6013229868973413, + "ewc_loss": 0.06570744514465332, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031771903741173446, + "grad_norm": 7.797683238983154, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.878891110420227, + "num_tokens": 480192979.0, + "step": 12588 + }, + { + "epoch": 1.6014501971759318, + "ewc_loss": 0.06545409560203552, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003151854616589844, + "grad_norm": 7.660416126251221, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8741913437843323, + "num_tokens": 480227984.0, + "step": 12589 + }, + { + "epoch": 1.6015774074545224, + "ewc_loss": 0.06576239317655563, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003182684595230967, + "grad_norm": 7.720846652984619, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8808233737945557, + "num_tokens": 480264280.0, + "step": 12590 + }, + { + "epoch": 1.601704617733113, + "ewc_loss": 0.0655774176120758, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003164186782669276, + "grad_norm": 7.672735691070557, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8482481241226196, + "num_tokens": 480300670.0, + "step": 12591 + }, + { + "epoch": 1.6018318280117034, + "ewc_loss": 0.0657462552189827, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003181070787832141, + "grad_norm": 7.733950614929199, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8611919283866882, + "num_tokens": 480334460.0, + "step": 12592 + }, + { + "epoch": 1.601959038290294, + "ewc_loss": 0.06563793122768402, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031702383421361446, + "grad_norm": 7.6387128829956055, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8723013997077942, + "num_tokens": 480376066.0, + "step": 12593 + }, + { + "epoch": 1.6020862485688845, + "ewc_loss": 0.06580371409654617, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003186816757079214, + "grad_norm": 7.707214832305908, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8699679970741272, + "num_tokens": 480417575.0, + "step": 12594 + }, + { + "epoch": 1.602213458847475, + "ewc_loss": 0.06595458090305328, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031774892704561353, + "grad_norm": 7.730644226074219, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.854033887386322, + "num_tokens": 480454106.0, + "step": 12595 + }, + { + "epoch": 1.6023406691260655, + "ewc_loss": 0.06565368175506592, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003171814023517072, + "grad_norm": 7.679469585418701, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8505427837371826, + "num_tokens": 480493031.0, + "step": 12596 + }, + { + "epoch": 1.6024678794046558, + "ewc_loss": 0.06596551835536957, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031785829924046993, + "grad_norm": 7.682662487030029, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8642412424087524, + "num_tokens": 480528326.0, + "step": 12597 + }, + { + "epoch": 1.6025950896832464, + "ewc_loss": 0.06587233394384384, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003169264819007367, + "grad_norm": 7.67957067489624, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8703180551528931, + "num_tokens": 480565399.0, + "step": 12598 + }, + { + "epoch": 1.602722299961837, + "ewc_loss": 0.06589175760746002, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031712072086520493, + "grad_norm": 7.716818332672119, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.878581702709198, + "num_tokens": 480600028.0, + "step": 12599 + }, + { + "epoch": 1.6028495102404274, + "ewc_loss": 0.06575798243284225, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031578296329826117, + "grad_norm": 7.670003414154053, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8637844324111938, + "num_tokens": 480635759.0, + "step": 12600 + }, + { + "epoch": 1.602976720519018, + "ewc_loss": 0.06559726595878601, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031661713728681207, + "grad_norm": 7.694314956665039, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.869202196598053, + "num_tokens": 480669528.0, + "step": 12601 + }, + { + "epoch": 1.6031039307976085, + "ewc_loss": 0.0658058226108551, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031626137206330895, + "grad_norm": 7.719000339508057, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8611265420913696, + "num_tokens": 480709425.0, + "step": 12602 + }, + { + "epoch": 1.6032311410761988, + "ewc_loss": 0.06583254039287567, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031652854522690177, + "grad_norm": 7.6895432472229, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8486467003822327, + "num_tokens": 480745010.0, + "step": 12603 + }, + { + "epoch": 1.6033583513547893, + "ewc_loss": 0.0658206045627594, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031640921952202916, + "grad_norm": 7.679814338684082, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8697395920753479, + "num_tokens": 480780320.0, + "step": 12604 + }, + { + "epoch": 1.6034855616333799, + "ewc_loss": 0.06596142053604126, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003178173501510173, + "grad_norm": 7.694668769836426, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8703826665878296, + "num_tokens": 480819335.0, + "step": 12605 + }, + { + "epoch": 1.6036127719119704, + "ewc_loss": 0.06575312465429306, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003157343890052289, + "grad_norm": 7.639966011047363, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8756024837493896, + "num_tokens": 480852702.0, + "step": 12606 + }, + { + "epoch": 1.603739982190561, + "ewc_loss": 0.06601135432720184, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003183166845701635, + "grad_norm": 7.795167922973633, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8573300242424011, + "num_tokens": 480890258.0, + "step": 12607 + }, + { + "epoch": 1.6038671924691514, + "ewc_loss": 0.06571673601865768, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003153704747091979, + "grad_norm": 7.6155524253845215, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8636478185653687, + "num_tokens": 480929555.0, + "step": 12608 + }, + { + "epoch": 1.603994402747742, + "ewc_loss": 0.06608432531356812, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031904634670354426, + "grad_norm": 7.6967644691467285, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8861398100852966, + "num_tokens": 480970295.0, + "step": 12609 + }, + { + "epoch": 1.6041216130263325, + "ewc_loss": 0.06581942737102509, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031639737426303327, + "grad_norm": 7.670330047607422, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8653507828712463, + "num_tokens": 481011213.0, + "step": 12610 + }, + { + "epoch": 1.604248823304923, + "ewc_loss": 0.06596174091100693, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003178205224685371, + "grad_norm": 7.735306262969971, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8575842976570129, + "num_tokens": 481045273.0, + "step": 12611 + }, + { + "epoch": 1.6043760335835135, + "ewc_loss": 0.06584088504314423, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003166119859088212, + "grad_norm": 7.696074962615967, + "learning_rate": 1e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8429469466209412, + "num_tokens": 481084567.0, + "step": 12612 + }, + { + "epoch": 1.604503243862104, + "ewc_loss": 0.06592364609241486, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003174396406393498, + "grad_norm": 7.68336820602417, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8648545742034912, + "num_tokens": 481119455.0, + "step": 12613 + }, + { + "epoch": 1.6046304541406946, + "ewc_loss": 0.06586813926696777, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031688454328104854, + "grad_norm": 7.719773292541504, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8496886491775513, + "num_tokens": 481154718.0, + "step": 12614 + }, + { + "epoch": 1.6047576644192851, + "ewc_loss": 0.06576117873191833, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003158148902002722, + "grad_norm": 7.59754753112793, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8578396439552307, + "num_tokens": 481193716.0, + "step": 12615 + }, + { + "epoch": 1.6048848746978757, + "ewc_loss": 0.06612353026866913, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031943837529979646, + "grad_norm": 7.793148517608643, + "learning_rate": 1e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8398952484130859, + "num_tokens": 481231378.0, + "step": 12616 + }, + { + "epoch": 1.6050120849764662, + "ewc_loss": 0.06574288010597229, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031563188531436026, + "grad_norm": 7.640280723571777, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8608391284942627, + "num_tokens": 481267907.0, + "step": 12617 + }, + { + "epoch": 1.6051392952550567, + "ewc_loss": 0.06611911952495575, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031939431210048497, + "grad_norm": 7.691131114959717, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8650774359703064, + "num_tokens": 481309899.0, + "step": 12618 + }, + { + "epoch": 1.6052665055336472, + "ewc_loss": 0.06591987609863281, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031740189297124743, + "grad_norm": 7.698081970214844, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8726778030395508, + "num_tokens": 481348845.0, + "step": 12619 + }, + { + "epoch": 1.6053937158122378, + "ewc_loss": 0.06605485081672668, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003187516413163394, + "grad_norm": 7.65349817276001, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8553574681282043, + "num_tokens": 481384999.0, + "step": 12620 + }, + { + "epoch": 1.6055209260908283, + "ewc_loss": 0.06600304692983627, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003182335931342095, + "grad_norm": 7.73720121383667, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8667933940887451, + "num_tokens": 481417287.0, + "step": 12621 + }, + { + "epoch": 1.6056481363694186, + "ewc_loss": 0.06603077799081802, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031851092353463173, + "grad_norm": 7.721640586853027, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8675365447998047, + "num_tokens": 481455943.0, + "step": 12622 + }, + { + "epoch": 1.6057753466480091, + "ewc_loss": 0.06605342030525208, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031873732223175466, + "grad_norm": 7.698087692260742, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8786238431930542, + "num_tokens": 481492017.0, + "step": 12623 + }, + { + "epoch": 1.6059025569265997, + "ewc_loss": 0.06588444113731384, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031704758293926716, + "grad_norm": 7.626453399658203, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8555439114570618, + "num_tokens": 481535115.0, + "step": 12624 + }, + { + "epoch": 1.6060297672051902, + "ewc_loss": 0.06616084277629852, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003198115446139127, + "grad_norm": 7.696775436401367, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8479326963424683, + "num_tokens": 481580048.0, + "step": 12625 + }, + { + "epoch": 1.6061569774837807, + "ewc_loss": 0.06593303382396698, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003175334131810814, + "grad_norm": 7.680986404418945, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8741731643676758, + "num_tokens": 481616748.0, + "step": 12626 + }, + { + "epoch": 1.6062841877623713, + "ewc_loss": 0.06591566652059555, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031980121275410056, + "grad_norm": 7.735589027404785, + "learning_rate": 1e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.834380567073822, + "num_tokens": 481659123.0, + "step": 12627 + }, + { + "epoch": 1.6064113980409616, + "ewc_loss": 0.06569945812225342, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031763911829330027, + "grad_norm": 7.62041711807251, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8534319400787354, + "num_tokens": 481704791.0, + "step": 12628 + }, + { + "epoch": 1.606538608319552, + "ewc_loss": 0.06597854197025299, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003204300010111183, + "grad_norm": 7.854966163635254, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.850393533706665, + "num_tokens": 481742913.0, + "step": 12629 + }, + { + "epoch": 1.6066658185981426, + "ewc_loss": 0.06557535380125046, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031639807275496423, + "grad_norm": 7.659189701080322, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.850784182548523, + "num_tokens": 481783549.0, + "step": 12630 + }, + { + "epoch": 1.6067930288767331, + "ewc_loss": 0.0659780502319336, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003204250824637711, + "grad_norm": 7.787047386169434, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8685482740402222, + "num_tokens": 481818486.0, + "step": 12631 + }, + { + "epoch": 1.6069202391553237, + "ewc_loss": 0.06556659936904907, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003163104993291199, + "grad_norm": 7.70210599899292, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8632441163063049, + "num_tokens": 481849969.0, + "step": 12632 + }, + { + "epoch": 1.6070474494339142, + "ewc_loss": 0.0657721757888794, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031836627749726176, + "grad_norm": 7.745399475097656, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8464804887771606, + "num_tokens": 481886108.0, + "step": 12633 + }, + { + "epoch": 1.6071746597125047, + "ewc_loss": 0.06565694510936737, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003172140277456492, + "grad_norm": 7.702447414398193, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8749004006385803, + "num_tokens": 481919389.0, + "step": 12634 + }, + { + "epoch": 1.6073018699910953, + "ewc_loss": 0.06588602066040039, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031706332811154425, + "grad_norm": 7.699556350708008, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.876503586769104, + "num_tokens": 481958531.0, + "step": 12635 + }, + { + "epoch": 1.6074290802696858, + "ewc_loss": 0.06572042405605316, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003178487822879106, + "grad_norm": 7.655717849731445, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8576270937919617, + "num_tokens": 481999864.0, + "step": 12636 + }, + { + "epoch": 1.6075562905482763, + "ewc_loss": 0.06570924818515778, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003177370526827872, + "grad_norm": 7.695583820343018, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8868428468704224, + "num_tokens": 482033890.0, + "step": 12637 + }, + { + "epoch": 1.6076835008268668, + "ewc_loss": 0.06577177345752716, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003183622320648283, + "grad_norm": 7.690652370452881, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8679782748222351, + "num_tokens": 482071186.0, + "step": 12638 + }, + { + "epoch": 1.6078107111054574, + "ewc_loss": 0.06569549441337585, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003175994206685573, + "grad_norm": 7.654632091522217, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8625023365020752, + "num_tokens": 482107218.0, + "step": 12639 + }, + { + "epoch": 1.607937921384048, + "ewc_loss": 0.06587566435337067, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003194012097083032, + "grad_norm": 7.700071811676025, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8585885763168335, + "num_tokens": 482146101.0, + "step": 12640 + }, + { + "epoch": 1.6080651316626384, + "ewc_loss": 0.06565846502780914, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031722913263365626, + "grad_norm": 7.700988292694092, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.877244770526886, + "num_tokens": 482178531.0, + "step": 12641 + }, + { + "epoch": 1.608192341941229, + "ewc_loss": 0.06584923714399338, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003191369178239256, + "grad_norm": 7.753711700439453, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8714334964752197, + "num_tokens": 482217019.0, + "step": 12642 + }, + { + "epoch": 1.6083195522198195, + "ewc_loss": 0.06587719917297363, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003169751144014299, + "grad_norm": 7.65780782699585, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8647605180740356, + "num_tokens": 482250547.0, + "step": 12643 + }, + { + "epoch": 1.60844676249841, + "ewc_loss": 0.06613193452358246, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003195224853698164, + "grad_norm": 7.753852844238281, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8587795495986938, + "num_tokens": 482287009.0, + "step": 12644 + }, + { + "epoch": 1.6085739727770005, + "ewc_loss": 0.06586587429046631, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031686187139712274, + "grad_norm": 7.646512031555176, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.850358784198761, + "num_tokens": 482327373.0, + "step": 12645 + }, + { + "epoch": 1.6087011830555908, + "ewc_loss": 0.06614150106906891, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003196180914528668, + "grad_norm": 7.777653694152832, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8520922660827637, + "num_tokens": 482363409.0, + "step": 12646 + }, + { + "epoch": 1.6088283933341814, + "ewc_loss": 0.06585191190242767, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003167222021147609, + "grad_norm": 7.633666038513184, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8522516489028931, + "num_tokens": 482396499.0, + "step": 12647 + }, + { + "epoch": 1.608955603612772, + "ewc_loss": 0.0661339983344078, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003195430908817798, + "grad_norm": 7.7830047607421875, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8648785352706909, + "num_tokens": 482435059.0, + "step": 12648 + }, + { + "epoch": 1.6090828138913624, + "ewc_loss": 0.06581766903400421, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031637976644560695, + "grad_norm": 7.6502861976623535, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8544887900352478, + "num_tokens": 482472793.0, + "step": 12649 + }, + { + "epoch": 1.609210024169953, + "ewc_loss": 0.0661490187048912, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031969332485459745, + "grad_norm": 7.766829967498779, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8637269735336304, + "num_tokens": 482508857.0, + "step": 12650 + }, + { + "epoch": 1.6093372344485435, + "ewc_loss": 0.06587043404579163, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000316907447995618, + "grad_norm": 7.630199432373047, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.848278284072876, + "num_tokens": 482548526.0, + "step": 12651 + }, + { + "epoch": 1.6094644447271338, + "ewc_loss": 0.06615182757377625, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003197214100509882, + "grad_norm": 7.7401957511901855, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8551898002624512, + "num_tokens": 482586461.0, + "step": 12652 + }, + { + "epoch": 1.6095916550057243, + "ewc_loss": 0.06591246277093887, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031732776551507413, + "grad_norm": 7.657883167266846, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8542947769165039, + "num_tokens": 482624642.0, + "step": 12653 + }, + { + "epoch": 1.6097188652843148, + "ewc_loss": 0.06614319235086441, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003196350298821926, + "grad_norm": 7.692514896392822, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.861258864402771, + "num_tokens": 482664096.0, + "step": 12654 + }, + { + "epoch": 1.6098460755629054, + "ewc_loss": 0.06597016751766205, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031790483626537025, + "grad_norm": 7.692363262176514, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8738844394683838, + "num_tokens": 482701230.0, + "step": 12655 + }, + { + "epoch": 1.609973285841496, + "ewc_loss": 0.06601227819919586, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003183258813805878, + "grad_norm": 7.640176773071289, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8640822768211365, + "num_tokens": 482741611.0, + "step": 12656 + }, + { + "epoch": 1.6101004961200864, + "ewc_loss": 0.06612682342529297, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003194713790435344, + "grad_norm": 7.732481479644775, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8735330700874329, + "num_tokens": 482779482.0, + "step": 12657 + }, + { + "epoch": 1.610227706398677, + "ewc_loss": 0.06580329686403275, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031623608083464205, + "grad_norm": 7.643945217132568, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8715378046035767, + "num_tokens": 482816279.0, + "step": 12658 + }, + { + "epoch": 1.6103549166772675, + "ewc_loss": 0.06619645655155182, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003201676590833813, + "grad_norm": 7.736289978027344, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8541792035102844, + "num_tokens": 482859941.0, + "step": 12659 + }, + { + "epoch": 1.610482126955858, + "ewc_loss": 0.06594540178775787, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031765716266818345, + "grad_norm": 7.665928363800049, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8671528100967407, + "num_tokens": 482894348.0, + "step": 12660 + }, + { + "epoch": 1.6106093372344485, + "ewc_loss": 0.06618838012218475, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032008689595386386, + "grad_norm": 7.765295505523682, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8582444787025452, + "num_tokens": 482930882.0, + "step": 12661 + }, + { + "epoch": 1.610736547513039, + "ewc_loss": 0.0659286379814148, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031748946639709175, + "grad_norm": 7.6422438621521, + "learning_rate": 1e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8459609746932983, + "num_tokens": 482969515.0, + "step": 12662 + }, + { + "epoch": 1.6108637577916296, + "ewc_loss": 0.06622418761253357, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032044496037997305, + "grad_norm": 7.71350622177124, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.859317421913147, + "num_tokens": 483008074.0, + "step": 12663 + }, + { + "epoch": 1.6109909680702201, + "ewc_loss": 0.06611870229244232, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003193901211488992, + "grad_norm": 7.785035610198975, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8557393550872803, + "num_tokens": 483038920.0, + "step": 12664 + }, + { + "epoch": 1.6111181783488107, + "ewc_loss": 0.06600677967071533, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031827096245251596, + "grad_norm": 7.662235736846924, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8730214834213257, + "num_tokens": 483075559.0, + "step": 12665 + }, + { + "epoch": 1.6112453886274012, + "ewc_loss": 0.06619495898485184, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000320152728818357, + "grad_norm": 7.730851650238037, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8664730787277222, + "num_tokens": 483114378.0, + "step": 12666 + }, + { + "epoch": 1.6113725989059917, + "ewc_loss": 0.06598062068223953, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003180093190167099, + "grad_norm": 7.696925163269043, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8681444525718689, + "num_tokens": 483147911.0, + "step": 12667 + }, + { + "epoch": 1.6114998091845822, + "ewc_loss": 0.06608498096466064, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031905295327305794, + "grad_norm": 7.712432861328125, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8772510886192322, + "num_tokens": 483188275.0, + "step": 12668 + }, + { + "epoch": 1.6116270194631728, + "ewc_loss": 0.06591154634952545, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031731853960081935, + "grad_norm": 7.6685686111450195, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8749551773071289, + "num_tokens": 483230804.0, + "step": 12669 + }, + { + "epoch": 1.6117542297417633, + "ewc_loss": 0.06608085334300995, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031901165493763983, + "grad_norm": 7.765319347381592, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8659639954566956, + "num_tokens": 483262368.0, + "step": 12670 + }, + { + "epoch": 1.6118814400203536, + "ewc_loss": 0.06575355678796768, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003157386963721365, + "grad_norm": 7.625144958496094, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8509502410888672, + "num_tokens": 483300894.0, + "step": 12671 + }, + { + "epoch": 1.6120086502989441, + "ewc_loss": 0.06626869738101959, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003208901034668088, + "grad_norm": 7.815377235412598, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8703972101211548, + "num_tokens": 483337377.0, + "step": 12672 + }, + { + "epoch": 1.6121358605775347, + "ewc_loss": 0.06574694067239761, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031567251426167786, + "grad_norm": 7.621473789215088, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8621091246604919, + "num_tokens": 483376572.0, + "step": 12673 + }, + { + "epoch": 1.6122630708561252, + "ewc_loss": 0.06592317670583725, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031987630063667893, + "grad_norm": 7.733738422393799, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8505786657333374, + "num_tokens": 483415584.0, + "step": 12674 + }, + { + "epoch": 1.6123902811347157, + "ewc_loss": 0.06560896337032318, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003167341637890786, + "grad_norm": 7.631319046020508, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.863745927810669, + "num_tokens": 483459584.0, + "step": 12675 + }, + { + "epoch": 1.6125174914133062, + "ewc_loss": 0.0658520832657814, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003191653813701123, + "grad_norm": 7.805978775024414, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8569919466972351, + "num_tokens": 483504260.0, + "step": 12676 + }, + { + "epoch": 1.6126447016918966, + "ewc_loss": 0.06582167744636536, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003164198715239763, + "grad_norm": 7.649913311004639, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8663035035133362, + "num_tokens": 483547271.0, + "step": 12677 + }, + { + "epoch": 1.612771911970487, + "ewc_loss": 0.06583096086978912, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003189540875609964, + "grad_norm": 7.737125873565674, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8671789169311523, + "num_tokens": 483582255.0, + "step": 12678 + }, + { + "epoch": 1.6128991222490776, + "ewc_loss": 0.06563660502433777, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003170106210745871, + "grad_norm": 7.663559436798096, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8584916591644287, + "num_tokens": 483622290.0, + "step": 12679 + }, + { + "epoch": 1.6130263325276681, + "ewc_loss": 0.06570716202259064, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003177161852363497, + "grad_norm": 7.677943229675293, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8758621215820312, + "num_tokens": 483657253.0, + "step": 12680 + }, + { + "epoch": 1.6131535428062587, + "ewc_loss": 0.06619580090045929, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032016116892918944, + "grad_norm": 7.797579288482666, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8674287796020508, + "num_tokens": 483693062.0, + "step": 12681 + }, + { + "epoch": 1.6132807530848492, + "ewc_loss": 0.06576250493526459, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031582813244313, + "grad_norm": 7.639454364776611, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8583459854125977, + "num_tokens": 483735845.0, + "step": 12682 + }, + { + "epoch": 1.6134079633634397, + "ewc_loss": 0.0662214607000351, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032041777740232646, + "grad_norm": 7.746434211730957, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8672183156013489, + "num_tokens": 483783833.0, + "step": 12683 + }, + { + "epoch": 1.6135351736420303, + "ewc_loss": 0.0658310055732727, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031651314930059016, + "grad_norm": 7.606455326080322, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8622006177902222, + "num_tokens": 483820175.0, + "step": 12684 + }, + { + "epoch": 1.6136623839206208, + "ewc_loss": 0.06619545817375183, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032015767646953464, + "grad_norm": 7.7154221534729, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8739991188049316, + "num_tokens": 483863454.0, + "step": 12685 + }, + { + "epoch": 1.6137895941992113, + "ewc_loss": 0.06564394384622574, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003170839627273381, + "grad_norm": 7.701787948608398, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8834226131439209, + "num_tokens": 483900657.0, + "step": 12686 + }, + { + "epoch": 1.6139168044778018, + "ewc_loss": 0.06585666537284851, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031921116169542074, + "grad_norm": 7.719664096832275, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8576017618179321, + "num_tokens": 483935912.0, + "step": 12687 + }, + { + "epoch": 1.6140440147563924, + "ewc_loss": 0.06571285426616669, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003177730832248926, + "grad_norm": 7.653773784637451, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.864223062992096, + "num_tokens": 483968594.0, + "step": 12688 + }, + { + "epoch": 1.614171225034983, + "ewc_loss": 0.06596748530864716, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003203193482477218, + "grad_norm": 7.755698204040527, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8727509379386902, + "num_tokens": 484010866.0, + "step": 12689 + }, + { + "epoch": 1.6142984353135734, + "ewc_loss": 0.06576426327228546, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031828717328608036, + "grad_norm": 7.698962688446045, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8594452142715454, + "num_tokens": 484051355.0, + "step": 12690 + }, + { + "epoch": 1.614425645592164, + "ewc_loss": 0.06592406332492828, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003198851482011378, + "grad_norm": 7.733043670654297, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8734123110771179, + "num_tokens": 484087049.0, + "step": 12691 + }, + { + "epoch": 1.6145528558707545, + "ewc_loss": 0.06575460731983185, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003181906067766249, + "grad_norm": 7.713061332702637, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8524459600448608, + "num_tokens": 484124914.0, + "step": 12692 + }, + { + "epoch": 1.614680066149345, + "ewc_loss": 0.06597906351089478, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032043521059677005, + "grad_norm": 7.798759460449219, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8791486024856567, + "num_tokens": 484154524.0, + "step": 12693 + }, + { + "epoch": 1.6148072764279355, + "ewc_loss": 0.06568112224340439, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031745576416142285, + "grad_norm": 7.708719730377197, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8619414567947388, + "num_tokens": 484195091.0, + "step": 12694 + }, + { + "epoch": 1.6149344867065258, + "ewc_loss": 0.06577952206134796, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031843979377299547, + "grad_norm": 7.707690715789795, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8662264347076416, + "num_tokens": 484236787.0, + "step": 12695 + }, + { + "epoch": 1.6150616969851164, + "ewc_loss": 0.0657307505607605, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031795198447071016, + "grad_norm": 7.667181968688965, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8595269918441772, + "num_tokens": 484281748.0, + "step": 12696 + }, + { + "epoch": 1.615188907263707, + "ewc_loss": 0.06589677929878235, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031961232889443636, + "grad_norm": 7.718348026275635, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8712127208709717, + "num_tokens": 484321181.0, + "step": 12697 + }, + { + "epoch": 1.6153161175422974, + "ewc_loss": 0.06589072942733765, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003195518802385777, + "grad_norm": 7.755565643310547, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.861609697341919, + "num_tokens": 484360476.0, + "step": 12698 + }, + { + "epoch": 1.615443327820888, + "ewc_loss": 0.06576952338218689, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003183397348038852, + "grad_norm": 7.71422004699707, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8647409677505493, + "num_tokens": 484395065.0, + "step": 12699 + }, + { + "epoch": 1.6155705380994785, + "ewc_loss": 0.06590718030929565, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003197162877768278, + "grad_norm": 7.687928199768066, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8662827014923096, + "num_tokens": 484439309.0, + "step": 12700 + }, + { + "epoch": 1.6156977483780688, + "ewc_loss": 0.06590434908866882, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003196880279574543, + "grad_norm": 7.783137321472168, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8643124103546143, + "num_tokens": 484473018.0, + "step": 12701 + }, + { + "epoch": 1.6158249586566593, + "ewc_loss": 0.06574486196041107, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031809319625608623, + "grad_norm": 7.706798553466797, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8700931072235107, + "num_tokens": 484518306.0, + "step": 12702 + }, + { + "epoch": 1.6159521689352498, + "ewc_loss": 0.06593538820743561, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003199984203092754, + "grad_norm": 7.790666580200195, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8499418497085571, + "num_tokens": 484556348.0, + "step": 12703 + }, + { + "epoch": 1.6160793792138404, + "ewc_loss": 0.06565124541521072, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000317156984237954, + "grad_norm": 7.722787857055664, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.874525785446167, + "num_tokens": 484593136.0, + "step": 12704 + }, + { + "epoch": 1.616206589492431, + "ewc_loss": 0.06582117825746536, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031885632779449224, + "grad_norm": 7.684007167816162, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8736509084701538, + "num_tokens": 484628025.0, + "step": 12705 + }, + { + "epoch": 1.6163337997710214, + "ewc_loss": 0.06592799723148346, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003199244965799153, + "grad_norm": 7.706508636474609, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8699815273284912, + "num_tokens": 484666703.0, + "step": 12706 + }, + { + "epoch": 1.616461010049612, + "ewc_loss": 0.06581898033618927, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003188343544024974, + "grad_norm": 7.698715686798096, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8722028732299805, + "num_tokens": 484703417.0, + "step": 12707 + }, + { + "epoch": 1.6165882203282025, + "ewc_loss": 0.06605483591556549, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003211928706150502, + "grad_norm": 7.7719526290893555, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.877562403678894, + "num_tokens": 484741830.0, + "step": 12708 + }, + { + "epoch": 1.616715430606793, + "ewc_loss": 0.06608879566192627, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031909107929095626, + "grad_norm": 7.767723083496094, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8633247017860413, + "num_tokens": 484780140.0, + "step": 12709 + }, + { + "epoch": 1.6168426408853835, + "ewc_loss": 0.06588573008775711, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031950182165019214, + "grad_norm": 7.742760181427002, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8609645366668701, + "num_tokens": 484813856.0, + "step": 12710 + }, + { + "epoch": 1.616969851163974, + "ewc_loss": 0.06606914848089218, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003188946284353733, + "grad_norm": 7.723495960235596, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8735144138336182, + "num_tokens": 484851383.0, + "step": 12711 + }, + { + "epoch": 1.6170970614425646, + "ewc_loss": 0.06586483120918274, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031929282704368234, + "grad_norm": 7.83863639831543, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8732507228851318, + "num_tokens": 484884361.0, + "step": 12712 + }, + { + "epoch": 1.6172242717211551, + "ewc_loss": 0.06561994552612305, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031684397254139185, + "grad_norm": 7.766391754150391, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8444015979766846, + "num_tokens": 484922106.0, + "step": 12713 + }, + { + "epoch": 1.6173514819997457, + "ewc_loss": 0.0657719224691391, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000318363745464012, + "grad_norm": 7.752513885498047, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8634873628616333, + "num_tokens": 484963100.0, + "step": 12714 + }, + { + "epoch": 1.6174786922783362, + "ewc_loss": 0.06565539538860321, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003171985154040158, + "grad_norm": 7.698224067687988, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8746112585067749, + "num_tokens": 485006896.0, + "step": 12715 + }, + { + "epoch": 1.6176059025569267, + "ewc_loss": 0.06598339974880219, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031803708407096565, + "grad_norm": 7.745157718658447, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.864556610584259, + "num_tokens": 485045968.0, + "step": 12716 + }, + { + "epoch": 1.6177331128355172, + "ewc_loss": 0.0659133642911911, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003173367294948548, + "grad_norm": 7.6866631507873535, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.862392783164978, + "num_tokens": 485084736.0, + "step": 12717 + }, + { + "epoch": 1.6178603231141078, + "ewc_loss": 0.06579858809709549, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003186303947586566, + "grad_norm": 7.701914310455322, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8708750605583191, + "num_tokens": 485123130.0, + "step": 12718 + }, + { + "epoch": 1.6179875333926983, + "ewc_loss": 0.06600640714168549, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031826720805838704, + "grad_norm": 7.731103420257568, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8614146709442139, + "num_tokens": 485158072.0, + "step": 12719 + }, + { + "epoch": 1.6181147436712886, + "ewc_loss": 0.06575053930282593, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031814994872547686, + "grad_norm": 7.760318279266357, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8726341128349304, + "num_tokens": 485196783.0, + "step": 12720 + }, + { + "epoch": 1.6182419539498791, + "ewc_loss": 0.06600026786327362, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003182057698722929, + "grad_norm": 7.758086681365967, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.869075357913971, + "num_tokens": 485231433.0, + "step": 12721 + }, + { + "epoch": 1.6183691642284697, + "ewc_loss": 0.06596097350120544, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031781280995346606, + "grad_norm": 7.760656833648682, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8620880842208862, + "num_tokens": 485269016.0, + "step": 12722 + }, + { + "epoch": 1.6184963745070602, + "ewc_loss": 0.06601077318191528, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003183108929079026, + "grad_norm": 7.7138237953186035, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8534547090530396, + "num_tokens": 485308293.0, + "step": 12723 + }, + { + "epoch": 1.6186235847856507, + "ewc_loss": 0.06608896702528, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003190927964169532, + "grad_norm": 7.778642654418945, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8737119436264038, + "num_tokens": 485340758.0, + "step": 12724 + }, + { + "epoch": 1.6187507950642412, + "ewc_loss": 0.0659702941775322, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031790605862624943, + "grad_norm": 7.754451274871826, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.879157543182373, + "num_tokens": 485384047.0, + "step": 12725 + }, + { + "epoch": 1.6188780053428315, + "ewc_loss": 0.06601282954216003, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031833144021220505, + "grad_norm": 7.730165481567383, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8666967153549194, + "num_tokens": 485416909.0, + "step": 12726 + }, + { + "epoch": 1.619005215621422, + "ewc_loss": 0.06603047251701355, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031850786763243377, + "grad_norm": 7.754448890686035, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8635832667350769, + "num_tokens": 485449532.0, + "step": 12727 + }, + { + "epoch": 1.6191324259000126, + "ewc_loss": 0.06596670299768448, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003178701736032963, + "grad_norm": 7.724174976348877, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8744012713432312, + "num_tokens": 485491171.0, + "step": 12728 + }, + { + "epoch": 1.6192596361786031, + "ewc_loss": 0.06583912670612335, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031903578201308846, + "grad_norm": 7.782121658325195, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8602483868598938, + "num_tokens": 485525511.0, + "step": 12729 + }, + { + "epoch": 1.6193868464571937, + "ewc_loss": 0.06564106792211533, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003170552081428468, + "grad_norm": 7.796769618988037, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8608474135398865, + "num_tokens": 485562517.0, + "step": 12730 + }, + { + "epoch": 1.6195140567357842, + "ewc_loss": 0.06598374247550964, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031804054742679, + "grad_norm": 7.706751346588135, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.866753101348877, + "num_tokens": 485601058.0, + "step": 12731 + }, + { + "epoch": 1.6196412670143747, + "ewc_loss": 0.06601200997829437, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003183232620358467, + "grad_norm": 7.767730236053467, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8779717683792114, + "num_tokens": 485634737.0, + "step": 12732 + }, + { + "epoch": 1.6197684772929652, + "ewc_loss": 0.06594190001487732, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031762217986397445, + "grad_norm": 7.725461006164551, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8677793741226196, + "num_tokens": 485671300.0, + "step": 12733 + }, + { + "epoch": 1.6198956875715558, + "ewc_loss": 0.06611546874046326, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003193578158970922, + "grad_norm": 7.920251369476318, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8719514012336731, + "num_tokens": 485708712.0, + "step": 12734 + }, + { + "epoch": 1.6200228978501463, + "ewc_loss": 0.06570768356323242, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003152799909003079, + "grad_norm": 7.643555641174316, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.876362681388855, + "num_tokens": 485746820.0, + "step": 12735 + }, + { + "epoch": 1.6201501081287368, + "ewc_loss": 0.06626364588737488, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003208396374247968, + "grad_norm": 7.829071044921875, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8655470609664917, + "num_tokens": 485785921.0, + "step": 12736 + }, + { + "epoch": 1.6202773184073274, + "ewc_loss": 0.06553985178470612, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00031604309333488345, + "grad_norm": 7.618412494659424, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8657547831535339, + "num_tokens": 485825766.0, + "step": 12737 + }, + { + "epoch": 1.6204045286859179, + "ewc_loss": 0.06602329015731812, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003208774433005601, + "grad_norm": 7.834504127502441, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8584623336791992, + "num_tokens": 485862288.0, + "step": 12738 + }, + { + "epoch": 1.6205317389645084, + "ewc_loss": 0.06578830629587173, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003160861961077899, + "grad_norm": 7.696643352508545, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8589687347412109, + "num_tokens": 485903582.0, + "step": 12739 + }, + { + "epoch": 1.620658949243099, + "ewc_loss": 0.06614129990339279, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031961611239239573, + "grad_norm": 7.7854766845703125, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8614062666893005, + "num_tokens": 485950931.0, + "step": 12740 + }, + { + "epoch": 1.6207861595216895, + "ewc_loss": 0.06595426797866821, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031774581293575466, + "grad_norm": 7.778132915496826, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8667513132095337, + "num_tokens": 485984185.0, + "step": 12741 + }, + { + "epoch": 1.62091336980028, + "ewc_loss": 0.06605266034603119, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031872972613200545, + "grad_norm": 7.782967567443848, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8699289560317993, + "num_tokens": 486018796.0, + "step": 12742 + }, + { + "epoch": 1.6210405800788705, + "ewc_loss": 0.0660201907157898, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003184050729032606, + "grad_norm": 7.769684314727783, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8522151708602905, + "num_tokens": 486055328.0, + "step": 12743 + }, + { + "epoch": 1.6211677903574608, + "ewc_loss": 0.06570853292942047, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003177298349328339, + "grad_norm": 7.755313873291016, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8584764003753662, + "num_tokens": 486092340.0, + "step": 12744 + }, + { + "epoch": 1.6212950006360514, + "ewc_loss": 0.0660269483923912, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003184725937899202, + "grad_norm": 7.812305450439453, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8520900011062622, + "num_tokens": 486123195.0, + "step": 12745 + }, + { + "epoch": 1.621422210914642, + "ewc_loss": 0.06598658859729767, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003180690109729767, + "grad_norm": 7.688446044921875, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8750306963920593, + "num_tokens": 486167145.0, + "step": 12746 + }, + { + "epoch": 1.6215494211932324, + "ewc_loss": 0.06625765562057495, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003207796544302255, + "grad_norm": 7.917843341827393, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8578373193740845, + "num_tokens": 486207925.0, + "step": 12747 + }, + { + "epoch": 1.621676631471823, + "ewc_loss": 0.06576795876026154, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031588267302140594, + "grad_norm": 7.679924964904785, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8679220676422119, + "num_tokens": 486245446.0, + "step": 12748 + }, + { + "epoch": 1.6218038417504135, + "ewc_loss": 0.06637075543403625, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003219106874894351, + "grad_norm": 7.803463935852051, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.865026593208313, + "num_tokens": 486280461.0, + "step": 12749 + }, + { + "epoch": 1.6219310520290038, + "ewc_loss": 0.06585033237934113, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031670639873482287, + "grad_norm": 7.681004524230957, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8525551557540894, + "num_tokens": 486323926.0, + "step": 12750 + }, + { + "epoch": 1.6220582623075943, + "ewc_loss": 0.06627525389194489, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003209557035006583, + "grad_norm": 7.792811870574951, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8569253087043762, + "num_tokens": 486360016.0, + "step": 12751 + }, + { + "epoch": 1.6221854725861848, + "ewc_loss": 0.06605826318264008, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003187857219018042, + "grad_norm": 7.697623252868652, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8671404123306274, + "num_tokens": 486394994.0, + "step": 12752 + }, + { + "epoch": 1.6223126828647754, + "ewc_loss": 0.06630168855190277, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032121999538503587, + "grad_norm": 7.820923805236816, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8630033135414124, + "num_tokens": 486436909.0, + "step": 12753 + }, + { + "epoch": 1.622439893143366, + "ewc_loss": 0.06602939963340759, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003184971574228257, + "grad_norm": 7.692224979400635, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8706610202789307, + "num_tokens": 486473201.0, + "step": 12754 + }, + { + "epoch": 1.6225671034219564, + "ewc_loss": 0.06644226610660553, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032262582681141794, + "grad_norm": 7.811028480529785, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8452837467193604, + "num_tokens": 486510733.0, + "step": 12755 + }, + { + "epoch": 1.622694313700547, + "ewc_loss": 0.06607329845428467, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031893610139377415, + "grad_norm": 7.774881839752197, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8802288770675659, + "num_tokens": 486546666.0, + "step": 12756 + }, + { + "epoch": 1.6228215239791375, + "ewc_loss": 0.06623748689889908, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003205779939889908, + "grad_norm": 7.7696356773376465, + "learning_rate": 1e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8452073335647583, + "num_tokens": 486585639.0, + "step": 12757 + }, + { + "epoch": 1.622948734257728, + "ewc_loss": 0.06616386771202087, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003198417543899268, + "grad_norm": 7.750124931335449, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8689171075820923, + "num_tokens": 486623028.0, + "step": 12758 + }, + { + "epoch": 1.6230759445363185, + "ewc_loss": 0.06620924174785614, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003202955122105777, + "grad_norm": 7.799210548400879, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8594249486923218, + "num_tokens": 486655906.0, + "step": 12759 + }, + { + "epoch": 1.623203154814909, + "ewc_loss": 0.06633753329515457, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003191370633430779, + "grad_norm": 7.731484413146973, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.862838625907898, + "num_tokens": 486689461.0, + "step": 12760 + }, + { + "epoch": 1.6233303650934996, + "ewc_loss": 0.06622633337974548, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032046649721451104, + "grad_norm": 7.74473762512207, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8476862907409668, + "num_tokens": 486728373.0, + "step": 12761 + }, + { + "epoch": 1.6234575753720901, + "ewc_loss": 0.06614929437637329, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003196960315108299, + "grad_norm": 7.687562942504883, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8754547238349915, + "num_tokens": 486769955.0, + "step": 12762 + }, + { + "epoch": 1.6235847856506807, + "ewc_loss": 0.06616430729627609, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031984620727598667, + "grad_norm": 7.787804126739502, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8686802387237549, + "num_tokens": 486806468.0, + "step": 12763 + }, + { + "epoch": 1.6237119959292712, + "ewc_loss": 0.0660485029220581, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003186881949659437, + "grad_norm": 7.717218399047852, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8505579233169556, + "num_tokens": 486846526.0, + "step": 12764 + }, + { + "epoch": 1.6238392062078617, + "ewc_loss": 0.06629304587841034, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003211335570085794, + "grad_norm": 7.784005641937256, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8814842700958252, + "num_tokens": 486886323.0, + "step": 12765 + }, + { + "epoch": 1.6239664164864522, + "ewc_loss": 0.06602872908115387, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031849046354182065, + "grad_norm": 7.690860748291016, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8453041315078735, + "num_tokens": 486918099.0, + "step": 12766 + }, + { + "epoch": 1.6240936267650428, + "ewc_loss": 0.06641093641519547, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032231249497272074, + "grad_norm": 7.784643173217773, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.877792477607727, + "num_tokens": 486953789.0, + "step": 12767 + }, + { + "epoch": 1.6242208370436333, + "ewc_loss": 0.06619606167078018, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032016373006626964, + "grad_norm": 7.702339172363281, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8636919260025024, + "num_tokens": 486996165.0, + "step": 12768 + }, + { + "epoch": 1.6243480473222236, + "ewc_loss": 0.06638777256011963, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003220807993784547, + "grad_norm": 7.837233543395996, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.875861644744873, + "num_tokens": 487028249.0, + "step": 12769 + }, + { + "epoch": 1.6244752576008141, + "ewc_loss": 0.06619736552238464, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032017676858231425, + "grad_norm": 7.685606479644775, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8660053610801697, + "num_tokens": 487070099.0, + "step": 12770 + }, + { + "epoch": 1.6246024678794047, + "ewc_loss": 0.06651756912469864, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000323378830216825, + "grad_norm": 7.7898969650268555, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8681546449661255, + "num_tokens": 487107117.0, + "step": 12771 + }, + { + "epoch": 1.6247296781579952, + "ewc_loss": 0.06620945036411285, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032029763679020107, + "grad_norm": 7.768916130065918, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8517168760299683, + "num_tokens": 487146024.0, + "step": 12772 + }, + { + "epoch": 1.6248568884365857, + "ewc_loss": 0.06634785234928131, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003216816985514015, + "grad_norm": 7.722214221954346, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.851631760597229, + "num_tokens": 487189864.0, + "step": 12773 + }, + { + "epoch": 1.6249840987151762, + "ewc_loss": 0.06636805087327957, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032188365003094077, + "grad_norm": 7.804157733917236, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8657571077346802, + "num_tokens": 487230307.0, + "step": 12774 + }, + { + "epoch": 1.6251113089937665, + "ewc_loss": 0.0661429762840271, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031963290530256927, + "grad_norm": 7.752243995666504, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8672847747802734, + "num_tokens": 487261387.0, + "step": 12775 + }, + { + "epoch": 1.625238519272357, + "ewc_loss": 0.06637893617153168, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032199244014918804, + "grad_norm": 7.748095989227295, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8535134792327881, + "num_tokens": 487298611.0, + "step": 12776 + }, + { + "epoch": 1.6253657295509476, + "ewc_loss": 0.06622198224067688, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003204229287803173, + "grad_norm": 7.671558380126953, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8633179664611816, + "num_tokens": 487338172.0, + "step": 12777 + }, + { + "epoch": 1.6254929398295381, + "ewc_loss": 0.0664779543876648, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003229826979804784, + "grad_norm": 7.817628860473633, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8657164573669434, + "num_tokens": 487375126.0, + "step": 12778 + }, + { + "epoch": 1.6256201501081287, + "ewc_loss": 0.06625360995531082, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003207392292097211, + "grad_norm": 7.685412883758545, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8753490447998047, + "num_tokens": 487416539.0, + "step": 12779 + }, + { + "epoch": 1.6257473603867192, + "ewc_loss": 0.06644565612077713, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003226596745662391, + "grad_norm": 7.793664932250977, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.865777850151062, + "num_tokens": 487448429.0, + "step": 12780 + }, + { + "epoch": 1.6258745706653097, + "ewc_loss": 0.06620684266090393, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003202715888619423, + "grad_norm": 7.710705757141113, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8631500005722046, + "num_tokens": 487486054.0, + "step": 12781 + }, + { + "epoch": 1.6260017809439002, + "ewc_loss": 0.06645750254392624, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003227781562600285, + "grad_norm": 7.778134346008301, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8713384866714478, + "num_tokens": 487523771.0, + "step": 12782 + }, + { + "epoch": 1.6261289912224908, + "ewc_loss": 0.06622213125228882, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003204244130756706, + "grad_norm": 7.680225849151611, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8705825209617615, + "num_tokens": 487561334.0, + "step": 12783 + }, + { + "epoch": 1.6262562015010813, + "ewc_loss": 0.0664646178483963, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003228493151254952, + "grad_norm": 7.755460739135742, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8651223182678223, + "num_tokens": 487606382.0, + "step": 12784 + }, + { + "epoch": 1.6263834117796718, + "ewc_loss": 0.06627260148525238, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003209291899111122, + "grad_norm": 7.693269729614258, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8606164455413818, + "num_tokens": 487647640.0, + "step": 12785 + }, + { + "epoch": 1.6265106220582624, + "ewc_loss": 0.06648419797420502, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003230450674891472, + "grad_norm": 7.842752456665039, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8612533211708069, + "num_tokens": 487682604.0, + "step": 12786 + }, + { + "epoch": 1.6266378323368529, + "ewc_loss": 0.06612757593393326, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031947888783179224, + "grad_norm": 7.720038890838623, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8677376508712769, + "num_tokens": 487718271.0, + "step": 12787 + }, + { + "epoch": 1.6267650426154434, + "ewc_loss": 0.06649769097566605, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003231800510548055, + "grad_norm": 7.921113014221191, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8510637879371643, + "num_tokens": 487752872.0, + "step": 12788 + }, + { + "epoch": 1.626892252894034, + "ewc_loss": 0.06604751944541931, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031867832876741886, + "grad_norm": 7.66354513168335, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8552201986312866, + "num_tokens": 487798309.0, + "step": 12789 + }, + { + "epoch": 1.6270194631726245, + "ewc_loss": 0.06657209247350693, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032392406137660146, + "grad_norm": 7.917431354522705, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.85418701171875, + "num_tokens": 487834052.0, + "step": 12790 + }, + { + "epoch": 1.627146673451215, + "ewc_loss": 0.06598038226366043, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031800693250261247, + "grad_norm": 7.703017234802246, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8532499074935913, + "num_tokens": 487868662.0, + "step": 12791 + }, + { + "epoch": 1.6272738837298055, + "ewc_loss": 0.06650863587856293, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032328948145732284, + "grad_norm": 8.036951065063477, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8613787889480591, + "num_tokens": 487907425.0, + "step": 12792 + }, + { + "epoch": 1.6274010940083958, + "ewc_loss": 0.06571304798126221, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003153336583636701, + "grad_norm": 7.563221454620361, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8729764223098755, + "num_tokens": 487942979.0, + "step": 12793 + }, + { + "epoch": 1.6275283042869864, + "ewc_loss": 0.06685452908277512, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003267484134994447, + "grad_norm": 8.115008354187012, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8685109615325928, + "num_tokens": 487982402.0, + "step": 12794 + }, + { + "epoch": 1.6276555145655769, + "ewc_loss": 0.06561607122421265, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003143638314213604, + "grad_norm": 7.581623554229736, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8716773390769958, + "num_tokens": 488020071.0, + "step": 12795 + }, + { + "epoch": 1.6277827248441674, + "ewc_loss": 0.06705547869205475, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032875791657716036, + "grad_norm": 8.010591506958008, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8505279421806335, + "num_tokens": 488057457.0, + "step": 12796 + }, + { + "epoch": 1.627909935122758, + "ewc_loss": 0.06581790745258331, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003163822111673653, + "grad_norm": 7.644265174865723, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8608686923980713, + "num_tokens": 488094002.0, + "step": 12797 + }, + { + "epoch": 1.6280371454013485, + "ewc_loss": 0.06668542325496674, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003250573936384171, + "grad_norm": 7.867964744567871, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8495165705680847, + "num_tokens": 488134995.0, + "step": 12798 + }, + { + "epoch": 1.6281643556799388, + "ewc_loss": 0.0660790205001831, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031899334862828255, + "grad_norm": 7.737972736358643, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8676895499229431, + "num_tokens": 488169942.0, + "step": 12799 + }, + { + "epoch": 1.6282915659585293, + "ewc_loss": 0.06624481081962585, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032065119012258947, + "grad_norm": 7.77285099029541, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8655951023101807, + "num_tokens": 488210852.0, + "step": 12800 + }, + { + "epoch": 1.6284187762371198, + "ewc_loss": 0.06618835777044296, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003200867213308811, + "grad_norm": 7.71973991394043, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8672133684158325, + "num_tokens": 488253067.0, + "step": 12801 + }, + { + "epoch": 1.6285459865157104, + "ewc_loss": 0.06623997539281845, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003206028777640313, + "grad_norm": 7.799610614776611, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8721756935119629, + "num_tokens": 488293498.0, + "step": 12802 + }, + { + "epoch": 1.628673196794301, + "ewc_loss": 0.06610199809074402, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003192231524735689, + "grad_norm": 7.721834182739258, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8672014474868774, + "num_tokens": 488333931.0, + "step": 12803 + }, + { + "epoch": 1.6288004070728914, + "ewc_loss": 0.06619451940059662, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000320148334139958, + "grad_norm": 7.823647499084473, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8693883419036865, + "num_tokens": 488371724.0, + "step": 12804 + }, + { + "epoch": 1.628927617351482, + "ewc_loss": 0.06608392298221588, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003190423594787717, + "grad_norm": 7.819673538208008, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8592652678489685, + "num_tokens": 488415171.0, + "step": 12805 + }, + { + "epoch": 1.6290548276300725, + "ewc_loss": 0.06606636941432953, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003188667760696262, + "grad_norm": 7.733914375305176, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8584901094436646, + "num_tokens": 488451672.0, + "step": 12806 + }, + { + "epoch": 1.629182037908663, + "ewc_loss": 0.06617453694343567, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031994847813621163, + "grad_norm": 7.720122337341309, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8788114786148071, + "num_tokens": 488492657.0, + "step": 12807 + }, + { + "epoch": 1.6293092481872535, + "ewc_loss": 0.06608881056308746, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031909122481010854, + "grad_norm": 7.665134906768799, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.871447741985321, + "num_tokens": 488530681.0, + "step": 12808 + }, + { + "epoch": 1.629436458465844, + "ewc_loss": 0.06625096499919891, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032071274472400546, + "grad_norm": 7.783409118652344, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8743438720703125, + "num_tokens": 488560542.0, + "step": 12809 + }, + { + "epoch": 1.6295636687444346, + "ewc_loss": 0.06617473065853119, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003199503989890218, + "grad_norm": 7.763789653778076, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.866378903388977, + "num_tokens": 488597491.0, + "step": 12810 + }, + { + "epoch": 1.6296908790230251, + "ewc_loss": 0.06631246209144592, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213277377653867, + "grad_norm": 7.794025897979736, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8707022070884705, + "num_tokens": 488634556.0, + "step": 12811 + }, + { + "epoch": 1.6298180893016156, + "ewc_loss": 0.06611140817403793, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003193172160536051, + "grad_norm": 7.781735420227051, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8612700700759888, + "num_tokens": 488668174.0, + "step": 12812 + }, + { + "epoch": 1.6299452995802062, + "ewc_loss": 0.0662551075220108, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003207541594747454, + "grad_norm": 7.766139030456543, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8684988021850586, + "num_tokens": 488705994.0, + "step": 12813 + }, + { + "epoch": 1.6300725098587967, + "ewc_loss": 0.06614713370800018, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031967440736480057, + "grad_norm": 7.7350287437438965, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8541860580444336, + "num_tokens": 488746908.0, + "step": 12814 + }, + { + "epoch": 1.6301997201373872, + "ewc_loss": 0.06614649295806885, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031966809183359146, + "grad_norm": 7.686173915863037, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8652283549308777, + "num_tokens": 488785607.0, + "step": 12815 + }, + { + "epoch": 1.6303269304159778, + "ewc_loss": 0.06618329882621765, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003200361388735473, + "grad_norm": 7.729550838470459, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8624745607376099, + "num_tokens": 488823831.0, + "step": 12816 + }, + { + "epoch": 1.630454140694568, + "ewc_loss": 0.0661424845457077, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003196279867552221, + "grad_norm": 7.716118812561035, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8712694644927979, + "num_tokens": 488859944.0, + "step": 12817 + }, + { + "epoch": 1.6305813509731586, + "ewc_loss": 0.06625992804765701, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032080241362564266, + "grad_norm": 7.74819278717041, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8499865531921387, + "num_tokens": 488903764.0, + "step": 12818 + }, + { + "epoch": 1.6307085612517491, + "ewc_loss": 0.06611010432243347, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031930414843373, + "grad_norm": 7.706693649291992, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8543916344642639, + "num_tokens": 488946890.0, + "step": 12819 + }, + { + "epoch": 1.6308357715303397, + "ewc_loss": 0.06624851375818253, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032068826840259135, + "grad_norm": 7.793529987335205, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8610310554504395, + "num_tokens": 488981901.0, + "step": 12820 + }, + { + "epoch": 1.6309629818089302, + "ewc_loss": 0.06611818075180054, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000319384882459417, + "grad_norm": 7.66844367980957, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8672103881835938, + "num_tokens": 489024180.0, + "step": 12821 + }, + { + "epoch": 1.6310901920875207, + "ewc_loss": 0.06640950590372086, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032229817588813603, + "grad_norm": 7.820718765258789, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8669725656509399, + "num_tokens": 489070971.0, + "step": 12822 + }, + { + "epoch": 1.6312174023661112, + "ewc_loss": 0.06600230932235718, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003182262589689344, + "grad_norm": 7.687004089355469, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8752049207687378, + "num_tokens": 489111636.0, + "step": 12823 + }, + { + "epoch": 1.6313446126447015, + "ewc_loss": 0.0663655698299408, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003218588244635612, + "grad_norm": 7.815573215484619, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.862147331237793, + "num_tokens": 489148677.0, + "step": 12824 + }, + { + "epoch": 1.631471822923292, + "ewc_loss": 0.06612688302993774, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003194719320163131, + "grad_norm": 7.783356189727783, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8525930643081665, + "num_tokens": 489186222.0, + "step": 12825 + }, + { + "epoch": 1.6315990332018826, + "ewc_loss": 0.06625054776668549, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003207086119800806, + "grad_norm": 7.809412956237793, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8384319543838501, + "num_tokens": 489223173.0, + "step": 12826 + }, + { + "epoch": 1.6317262434804731, + "ewc_loss": 0.06613429635763168, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031954608857631683, + "grad_norm": 7.792781829833984, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8640592098236084, + "num_tokens": 489259236.0, + "step": 12827 + }, + { + "epoch": 1.6318534537590637, + "ewc_loss": 0.06620260328054428, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003202291554771364, + "grad_norm": 7.780166149139404, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8561442494392395, + "num_tokens": 489298449.0, + "step": 12828 + }, + { + "epoch": 1.6319806640376542, + "ewc_loss": 0.06618906557559967, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003200937935616821, + "grad_norm": 7.904394626617432, + "learning_rate": 1e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8430701494216919, + "num_tokens": 489334800.0, + "step": 12829 + }, + { + "epoch": 1.6321078743162447, + "ewc_loss": 0.06598342955112457, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003180374624207616, + "grad_norm": 7.687242031097412, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8703835010528564, + "num_tokens": 489378510.0, + "step": 12830 + }, + { + "epoch": 1.6322350845948352, + "ewc_loss": 0.06652361154556274, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003234392497688532, + "grad_norm": 7.816336631774902, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8514494895935059, + "num_tokens": 489419267.0, + "step": 12831 + }, + { + "epoch": 1.6323622948734258, + "ewc_loss": 0.06602229177951813, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031842602766118944, + "grad_norm": 7.774682998657227, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8668276071548462, + "num_tokens": 489454772.0, + "step": 12832 + }, + { + "epoch": 1.6324895051520163, + "ewc_loss": 0.06634757667779922, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032167890458367765, + "grad_norm": 7.815700531005859, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8570367097854614, + "num_tokens": 489490146.0, + "step": 12833 + }, + { + "epoch": 1.6326167154306068, + "ewc_loss": 0.06617061048746109, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031990924617275596, + "grad_norm": 7.741540908813477, + "learning_rate": 1e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8340854644775391, + "num_tokens": 489532754.0, + "step": 12834 + }, + { + "epoch": 1.6327439257091974, + "ewc_loss": 0.06637970358133316, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003220001526642591, + "grad_norm": 7.831376552581787, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.860319972038269, + "num_tokens": 489570378.0, + "step": 12835 + }, + { + "epoch": 1.6328711359877879, + "ewc_loss": 0.06625709682703018, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032077409559860826, + "grad_norm": 7.755539894104004, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8557910919189453, + "num_tokens": 489604562.0, + "step": 12836 + }, + { + "epoch": 1.6329983462663784, + "ewc_loss": 0.06637271493673325, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032193027436733246, + "grad_norm": 7.80634880065918, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8711520433425903, + "num_tokens": 489644060.0, + "step": 12837 + }, + { + "epoch": 1.633125556544969, + "ewc_loss": 0.06630561500787735, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000321259256452322, + "grad_norm": 7.821547508239746, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8648650646209717, + "num_tokens": 489682693.0, + "step": 12838 + }, + { + "epoch": 1.6332527668235595, + "ewc_loss": 0.06621710956096649, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003203741798643023, + "grad_norm": 7.741950035095215, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8723840713500977, + "num_tokens": 489720786.0, + "step": 12839 + }, + { + "epoch": 1.63337997710215, + "ewc_loss": 0.06635819375514984, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032178504625335336, + "grad_norm": 7.77968168258667, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.869817852973938, + "num_tokens": 489758773.0, + "step": 12840 + }, + { + "epoch": 1.6335071873807405, + "ewc_loss": 0.06617559492588043, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031995904282666743, + "grad_norm": 7.793179512023926, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.865094006061554, + "num_tokens": 489793436.0, + "step": 12841 + }, + { + "epoch": 1.6336343976593308, + "ewc_loss": 0.06630563735961914, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003212595183867961, + "grad_norm": 7.796367645263672, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8828378915786743, + "num_tokens": 489828634.0, + "step": 12842 + }, + { + "epoch": 1.6337616079379214, + "ewc_loss": 0.06624560058116913, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032065907726064324, + "grad_norm": 7.765930652618408, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8671913146972656, + "num_tokens": 489866335.0, + "step": 12843 + }, + { + "epoch": 1.6338888182165119, + "ewc_loss": 0.06626422703266144, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003208453708793968, + "grad_norm": 7.765121936798096, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8725188970565796, + "num_tokens": 489903059.0, + "step": 12844 + }, + { + "epoch": 1.6340160284951024, + "ewc_loss": 0.06626048684120178, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003208080306649208, + "grad_norm": 7.730624675750732, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8712233304977417, + "num_tokens": 489940186.0, + "step": 12845 + }, + { + "epoch": 1.634143238773693, + "ewc_loss": 0.06632544100284576, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003214575117453933, + "grad_norm": 7.740259170532227, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.875629186630249, + "num_tokens": 489984732.0, + "step": 12846 + }, + { + "epoch": 1.6342704490522835, + "ewc_loss": 0.06629092991352081, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003211124276276678, + "grad_norm": 7.800577640533447, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8652493953704834, + "num_tokens": 490016699.0, + "step": 12847 + }, + { + "epoch": 1.6343976593308738, + "ewc_loss": 0.06609293818473816, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031913252314552665, + "grad_norm": 7.697669982910156, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8578361868858337, + "num_tokens": 490051351.0, + "step": 12848 + }, + { + "epoch": 1.6345248696094643, + "ewc_loss": 0.06637316942214966, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032193478546105325, + "grad_norm": 7.787578105926514, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8730512857437134, + "num_tokens": 490089668.0, + "step": 12849 + }, + { + "epoch": 1.6346520798880548, + "ewc_loss": 0.06611121445894241, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003193152660969645, + "grad_norm": 7.714040756225586, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8651131391525269, + "num_tokens": 490129651.0, + "step": 12850 + }, + { + "epoch": 1.6347792901666454, + "ewc_loss": 0.06632101535797119, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003214133030269295, + "grad_norm": 7.732600688934326, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8569814562797546, + "num_tokens": 490168441.0, + "step": 12851 + }, + { + "epoch": 1.6349065004452359, + "ewc_loss": 0.0662187859416008, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003203909727744758, + "grad_norm": 7.7757182121276855, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8734945058822632, + "num_tokens": 490204754.0, + "step": 12852 + }, + { + "epoch": 1.6350337107238264, + "ewc_loss": 0.06624889373779297, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032069211010821164, + "grad_norm": 7.716814041137695, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8790993690490723, + "num_tokens": 490244178.0, + "step": 12853 + }, + { + "epoch": 1.635160921002417, + "ewc_loss": 0.06632164120674133, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032141958945430815, + "grad_norm": 7.753782749176025, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8608324527740479, + "num_tokens": 490281161.0, + "step": 12854 + }, + { + "epoch": 1.6352881312810075, + "ewc_loss": 0.0660000741481781, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032064522383734584, + "grad_norm": 7.7638139724731445, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8648020625114441, + "num_tokens": 490314892.0, + "step": 12855 + }, + { + "epoch": 1.635415341559598, + "ewc_loss": 0.06601758301258087, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003208203997928649, + "grad_norm": 7.75770378112793, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8521544933319092, + "num_tokens": 490352600.0, + "step": 12856 + }, + { + "epoch": 1.6355425518381885, + "ewc_loss": 0.06604564189910889, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003211009898222983, + "grad_norm": 7.752375602722168, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8758559226989746, + "num_tokens": 490384487.0, + "step": 12857 + }, + { + "epoch": 1.635669762116779, + "ewc_loss": 0.06606048345565796, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003212493611499667, + "grad_norm": 7.786639213562012, + "learning_rate": 1e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8440283536911011, + "num_tokens": 490422199.0, + "step": 12858 + }, + { + "epoch": 1.6357969723953696, + "ewc_loss": 0.06591251492500305, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.000319769635098055, + "grad_norm": 7.7268500328063965, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8565859198570251, + "num_tokens": 490458058.0, + "step": 12859 + }, + { + "epoch": 1.6359241826739601, + "ewc_loss": 0.06614869832992554, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032213146914727986, + "grad_norm": 7.742413520812988, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8764646053314209, + "num_tokens": 490498447.0, + "step": 12860 + }, + { + "epoch": 1.6360513929525506, + "ewc_loss": 0.06598281860351562, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032047275453805923, + "grad_norm": 7.71317195892334, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8508566617965698, + "num_tokens": 490538502.0, + "step": 12861 + }, + { + "epoch": 1.6361786032311412, + "ewc_loss": 0.06615535914897919, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003221981751266867, + "grad_norm": 7.751387596130371, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8685476779937744, + "num_tokens": 490575752.0, + "step": 12862 + }, + { + "epoch": 1.6363058135097317, + "ewc_loss": 0.06633736193180084, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032157677924260497, + "grad_norm": 7.766695976257324, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8650556802749634, + "num_tokens": 490613502.0, + "step": 12863 + }, + { + "epoch": 1.6364330237883222, + "ewc_loss": 0.06605411320924759, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003211856528650969, + "grad_norm": 7.794759750366211, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8584626317024231, + "num_tokens": 490653820.0, + "step": 12864 + }, + { + "epoch": 1.6365602340669128, + "ewc_loss": 0.06632695347070694, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032147267484106123, + "grad_norm": 7.69601583480835, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8492225408554077, + "num_tokens": 490693332.0, + "step": 12865 + }, + { + "epoch": 1.636687444345503, + "ewc_loss": 0.06655675172805786, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032377068419009447, + "grad_norm": 7.865054607391357, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8619078397750854, + "num_tokens": 490733135.0, + "step": 12866 + }, + { + "epoch": 1.6368146546240936, + "ewc_loss": 0.06616527587175369, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031985589885152876, + "grad_norm": 7.6791510581970215, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8677014708518982, + "num_tokens": 490775505.0, + "step": 12867 + }, + { + "epoch": 1.6369418649026841, + "ewc_loss": 0.0666050910949707, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000324253982398659, + "grad_norm": 7.8116912841796875, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.863050103187561, + "num_tokens": 490817783.0, + "step": 12868 + }, + { + "epoch": 1.6370690751812746, + "ewc_loss": 0.06631673872470856, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213705203961581, + "grad_norm": 7.856566905975342, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8582409620285034, + "num_tokens": 490849464.0, + "step": 12869 + }, + { + "epoch": 1.6371962854598652, + "ewc_loss": 0.06629474461078644, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003211505536455661, + "grad_norm": 7.689812660217285, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8578211069107056, + "num_tokens": 490891925.0, + "step": 12870 + }, + { + "epoch": 1.6373234957384557, + "ewc_loss": 0.06653521209955215, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032355522853322327, + "grad_norm": 7.783851623535156, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8564134240150452, + "num_tokens": 490926221.0, + "step": 12871 + }, + { + "epoch": 1.6374507060170462, + "ewc_loss": 0.06625312566757202, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032073439797386527, + "grad_norm": 7.709366798400879, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8555985689163208, + "num_tokens": 490964141.0, + "step": 12872 + }, + { + "epoch": 1.6375779162956365, + "ewc_loss": 0.06658113747835159, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032401448697783053, + "grad_norm": 7.812197685241699, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8595302700996399, + "num_tokens": 490997917.0, + "step": 12873 + }, + { + "epoch": 1.637705126574227, + "ewc_loss": 0.06625571101903915, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032076024217531085, + "grad_norm": 7.728313446044922, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8790363669395447, + "num_tokens": 491029244.0, + "step": 12874 + }, + { + "epoch": 1.6378323368528176, + "ewc_loss": 0.06654240190982819, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003236271732021123, + "grad_norm": 7.803274631500244, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8630622029304504, + "num_tokens": 491066591.0, + "step": 12875 + }, + { + "epoch": 1.6379595471314081, + "ewc_loss": 0.06629157066345215, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003211188013665378, + "grad_norm": 7.643665790557861, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8632067441940308, + "num_tokens": 491106546.0, + "step": 12876 + }, + { + "epoch": 1.6380867574099987, + "ewc_loss": 0.06659488379955292, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032415197347290814, + "grad_norm": 7.766289710998535, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8571698665618896, + "num_tokens": 491145512.0, + "step": 12877 + }, + { + "epoch": 1.6382139676885892, + "ewc_loss": 0.06628770381212234, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003210801805835217, + "grad_norm": 7.70672082901001, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8619449138641357, + "num_tokens": 491185624.0, + "step": 12878 + }, + { + "epoch": 1.6383411779671797, + "ewc_loss": 0.06654980033636093, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003237011260353029, + "grad_norm": 7.765820026397705, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8581741452217102, + "num_tokens": 491220982.0, + "step": 12879 + }, + { + "epoch": 1.6384683882457702, + "ewc_loss": 0.06643116474151611, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032251476659439504, + "grad_norm": 7.728726387023926, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8639651536941528, + "num_tokens": 491262501.0, + "step": 12880 + }, + { + "epoch": 1.6385955985243608, + "ewc_loss": 0.06645126640796661, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032271581585519016, + "grad_norm": 7.768664360046387, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.875449538230896, + "num_tokens": 491298078.0, + "step": 12881 + }, + { + "epoch": 1.6387228088029513, + "ewc_loss": 0.06638472527265549, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003220503858756274, + "grad_norm": 7.7342023849487305, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8713054656982422, + "num_tokens": 491336203.0, + "step": 12882 + }, + { + "epoch": 1.6388500190815418, + "ewc_loss": 0.06652961671352386, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032349929097108543, + "grad_norm": 7.784108638763428, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8525620698928833, + "num_tokens": 491374723.0, + "step": 12883 + }, + { + "epoch": 1.6389772293601323, + "ewc_loss": 0.0663529634475708, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003217327466700226, + "grad_norm": 7.772193908691406, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8654561042785645, + "num_tokens": 491410546.0, + "step": 12884 + }, + { + "epoch": 1.6391044396387229, + "ewc_loss": 0.06637939810752869, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032199706765823066, + "grad_norm": 7.799736022949219, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8595037460327148, + "num_tokens": 491450642.0, + "step": 12885 + }, + { + "epoch": 1.6392316499173134, + "ewc_loss": 0.06634561717510223, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003216593468096107, + "grad_norm": 7.745066165924072, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8604133129119873, + "num_tokens": 491489893.0, + "step": 12886 + }, + { + "epoch": 1.639358860195904, + "ewc_loss": 0.06642860174179077, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003224890970159322, + "grad_norm": 7.755284786224365, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8526773452758789, + "num_tokens": 491535870.0, + "step": 12887 + }, + { + "epoch": 1.6394860704744945, + "ewc_loss": 0.06642024219036102, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003224055690225214, + "grad_norm": 7.787092208862305, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.862273097038269, + "num_tokens": 491575988.0, + "step": 12888 + }, + { + "epoch": 1.639613280753085, + "ewc_loss": 0.06625175476074219, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003207207191735506, + "grad_norm": 7.743494510650635, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8660189509391785, + "num_tokens": 491616822.0, + "step": 12889 + }, + { + "epoch": 1.6397404910316755, + "ewc_loss": 0.06650716066360474, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032327475491911173, + "grad_norm": 7.773298263549805, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8829207420349121, + "num_tokens": 491654132.0, + "step": 12890 + }, + { + "epoch": 1.6398677013102658, + "ewc_loss": 0.06628146767616272, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003210177819710225, + "grad_norm": 7.758009433746338, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8504866361618042, + "num_tokens": 491697215.0, + "step": 12891 + }, + { + "epoch": 1.6399949115888564, + "ewc_loss": 0.06640641391277313, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032226720941253006, + "grad_norm": 7.808938026428223, + "learning_rate": 1e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8420087099075317, + "num_tokens": 491731338.0, + "step": 12892 + }, + { + "epoch": 1.6401221218674469, + "ewc_loss": 0.06630699336528778, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003212731098756194, + "grad_norm": 7.744388103485107, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8618484735488892, + "num_tokens": 491770808.0, + "step": 12893 + }, + { + "epoch": 1.6402493321460374, + "ewc_loss": 0.06650444865226746, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003232476010452956, + "grad_norm": 7.818446159362793, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8708241581916809, + "num_tokens": 491804167.0, + "step": 12894 + }, + { + "epoch": 1.640376542424628, + "ewc_loss": 0.06618153303861618, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003200184728484601, + "grad_norm": 7.722251892089844, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8644211292266846, + "num_tokens": 491840094.0, + "step": 12895 + }, + { + "epoch": 1.6405037527032185, + "ewc_loss": 0.06656752526760101, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032387833925895393, + "grad_norm": 7.845156192779541, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8504079580307007, + "num_tokens": 491881643.0, + "step": 12896 + }, + { + "epoch": 1.6406309629818088, + "ewc_loss": 0.06615141034126282, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031971718999557197, + "grad_norm": 7.712213516235352, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8520781397819519, + "num_tokens": 491920951.0, + "step": 12897 + }, + { + "epoch": 1.6407581732603993, + "ewc_loss": 0.06651496887207031, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003233528113923967, + "grad_norm": 7.827288627624512, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8584334254264832, + "num_tokens": 491954599.0, + "step": 12898 + }, + { + "epoch": 1.6408853835389898, + "ewc_loss": 0.06615213304758072, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031972446595318615, + "grad_norm": 7.684043884277344, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8646605014801025, + "num_tokens": 491995375.0, + "step": 12899 + }, + { + "epoch": 1.6410125938175804, + "ewc_loss": 0.06671486794948578, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032535180798731744, + "grad_norm": 7.8448381423950195, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8581861257553101, + "num_tokens": 492035297.0, + "step": 12900 + }, + { + "epoch": 1.6411398040961709, + "ewc_loss": 0.06625348329544067, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032073797774501145, + "grad_norm": 7.677361011505127, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8627933859825134, + "num_tokens": 492071003.0, + "step": 12901 + }, + { + "epoch": 1.6412670143747614, + "ewc_loss": 0.06672120094299316, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003254150797147304, + "grad_norm": 7.870453834533691, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.861991822719574, + "num_tokens": 492106046.0, + "step": 12902 + }, + { + "epoch": 1.641394224653352, + "ewc_loss": 0.0661998987197876, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032020205981098115, + "grad_norm": 7.643275260925293, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.875390887260437, + "num_tokens": 492147252.0, + "step": 12903 + }, + { + "epoch": 1.6415214349319425, + "ewc_loss": 0.06675563752651215, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003257594653405249, + "grad_norm": 7.803333282470703, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8537775278091431, + "num_tokens": 492188185.0, + "step": 12904 + }, + { + "epoch": 1.641648645210533, + "ewc_loss": 0.06627541780471802, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032095727510750294, + "grad_norm": 7.740200042724609, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8679497241973877, + "num_tokens": 492219630.0, + "step": 12905 + }, + { + "epoch": 1.6417758554891235, + "ewc_loss": 0.06666131317615509, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032481621019542217, + "grad_norm": 7.840595245361328, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8649768233299255, + "num_tokens": 492251955.0, + "step": 12906 + }, + { + "epoch": 1.641903065767714, + "ewc_loss": 0.0663452297449112, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003216554468963295, + "grad_norm": 7.703210353851318, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8676169514656067, + "num_tokens": 492293484.0, + "step": 12907 + }, + { + "epoch": 1.6420302760463046, + "ewc_loss": 0.06651295721530914, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032333264243789017, + "grad_norm": 7.777414321899414, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8489493131637573, + "num_tokens": 492331169.0, + "step": 12908 + }, + { + "epoch": 1.642157486324895, + "ewc_loss": 0.06635036319494247, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032170675694942474, + "grad_norm": 7.668370723724365, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8705917596817017, + "num_tokens": 492366071.0, + "step": 12909 + }, + { + "epoch": 1.6422846966034856, + "ewc_loss": 0.0666336789727211, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032453989842906594, + "grad_norm": 7.859246730804443, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8828941583633423, + "num_tokens": 492396820.0, + "step": 12910 + }, + { + "epoch": 1.6424119068820762, + "ewc_loss": 0.06631812453269958, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032138434471562505, + "grad_norm": 7.678997993469238, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8709933757781982, + "num_tokens": 492435282.0, + "step": 12911 + }, + { + "epoch": 1.6425391171606667, + "ewc_loss": 0.0667681097984314, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032588420435786247, + "grad_norm": 7.788846969604492, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8709707260131836, + "num_tokens": 492470195.0, + "step": 12912 + }, + { + "epoch": 1.6426663274392572, + "ewc_loss": 0.06633215397596359, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003215246833860874, + "grad_norm": 7.668172836303711, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8565509915351868, + "num_tokens": 492514265.0, + "step": 12913 + }, + { + "epoch": 1.6427935377178478, + "ewc_loss": 0.0667024478316307, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003252275928389281, + "grad_norm": 7.803417205810547, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8648222088813782, + "num_tokens": 492550348.0, + "step": 12914 + }, + { + "epoch": 1.642920747996438, + "ewc_loss": 0.06651559472084045, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003233590687159449, + "grad_norm": 7.925928592681885, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8668169975280762, + "num_tokens": 492590648.0, + "step": 12915 + }, + { + "epoch": 1.6430479582750286, + "ewc_loss": 0.06611496210098267, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00031935269362293184, + "grad_norm": 7.647974967956543, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8666159510612488, + "num_tokens": 492629523.0, + "step": 12916 + }, + { + "epoch": 1.6431751685536191, + "ewc_loss": 0.06671961396932602, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003253992472309619, + "grad_norm": 7.821045875549316, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8591553568840027, + "num_tokens": 492668392.0, + "step": 12917 + }, + { + "epoch": 1.6433023788322096, + "ewc_loss": 0.06620930135250092, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003202961233910173, + "grad_norm": 7.764327526092529, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8529664278030396, + "num_tokens": 492696073.0, + "step": 12918 + }, + { + "epoch": 1.6434295891108002, + "ewc_loss": 0.06658728420734406, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003240759251639247, + "grad_norm": 7.829920291900635, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.864611029624939, + "num_tokens": 492728590.0, + "step": 12919 + }, + { + "epoch": 1.6435567993893907, + "ewc_loss": 0.06625604629516602, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003207635891158134, + "grad_norm": 7.726034641265869, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8569075465202332, + "num_tokens": 492767290.0, + "step": 12920 + }, + { + "epoch": 1.6436840096679812, + "ewc_loss": 0.06661155819892883, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003243186802137643, + "grad_norm": 7.817794322967529, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.874459981918335, + "num_tokens": 492801019.0, + "step": 12921 + }, + { + "epoch": 1.6438112199465715, + "ewc_loss": 0.06630848348140717, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003212879819329828, + "grad_norm": 7.699838638305664, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8620942234992981, + "num_tokens": 492841220.0, + "step": 12922 + }, + { + "epoch": 1.643938430225162, + "ewc_loss": 0.06653055548667908, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003235087206121534, + "grad_norm": 7.754392147064209, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8529895544052124, + "num_tokens": 492885076.0, + "step": 12923 + }, + { + "epoch": 1.6440656405037526, + "ewc_loss": 0.06634251773357391, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032162826391868293, + "grad_norm": 7.742986679077148, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.858225405216217, + "num_tokens": 492918554.0, + "step": 12924 + }, + { + "epoch": 1.6441928507823431, + "ewc_loss": 0.0663793608546257, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032199674751609564, + "grad_norm": 7.72724723815918, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8792852163314819, + "num_tokens": 492961890.0, + "step": 12925 + }, + { + "epoch": 1.6443200610609336, + "ewc_loss": 0.06657809019088745, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003239840443711728, + "grad_norm": 7.80300235748291, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8741375803947449, + "num_tokens": 492997768.0, + "step": 12926 + }, + { + "epoch": 1.6444472713395242, + "ewc_loss": 0.06621801853179932, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003203832602594048, + "grad_norm": 7.674100399017334, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8883413076400757, + "num_tokens": 493037017.0, + "step": 12927 + }, + { + "epoch": 1.6445744816181147, + "ewc_loss": 0.06664563715457916, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032465948606841266, + "grad_norm": 7.785994052886963, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8728986978530884, + "num_tokens": 493074579.0, + "step": 12928 + }, + { + "epoch": 1.6447016918967052, + "ewc_loss": 0.0662272721529007, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003204758104402572, + "grad_norm": 7.691169261932373, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8659124374389648, + "num_tokens": 493111249.0, + "step": 12929 + }, + { + "epoch": 1.6448289021752958, + "ewc_loss": 0.06659616529941559, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032416475005447865, + "grad_norm": 7.8387980461120605, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8475277423858643, + "num_tokens": 493150723.0, + "step": 12930 + }, + { + "epoch": 1.6449561124538863, + "ewc_loss": 0.06628677248954773, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032107080915011466, + "grad_norm": 7.6961188316345215, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8602038025856018, + "num_tokens": 493189530.0, + "step": 12931 + }, + { + "epoch": 1.6450833227324768, + "ewc_loss": 0.06668975949287415, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003251006710343063, + "grad_norm": 7.787449359893799, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8705902099609375, + "num_tokens": 493231369.0, + "step": 12932 + }, + { + "epoch": 1.6452105330110673, + "ewc_loss": 0.06634782999753952, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032168140751309693, + "grad_norm": 7.772723197937012, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8501005172729492, + "num_tokens": 493278430.0, + "step": 12933 + }, + { + "epoch": 1.6453377432896579, + "ewc_loss": 0.06657838821411133, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003239869838580489, + "grad_norm": 7.866208076477051, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.869570255279541, + "num_tokens": 493312258.0, + "step": 12934 + }, + { + "epoch": 1.6454649535682484, + "ewc_loss": 0.06631441414356232, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213472955394536, + "grad_norm": 7.714020729064941, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8612169623374939, + "num_tokens": 493347725.0, + "step": 12935 + }, + { + "epoch": 1.645592163846839, + "ewc_loss": 0.06655186414718628, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003237218188587576, + "grad_norm": 7.798397064208984, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8670310378074646, + "num_tokens": 493388119.0, + "step": 12936 + }, + { + "epoch": 1.6457193741254295, + "ewc_loss": 0.06633278727531433, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032153096981346607, + "grad_norm": 7.760827541351318, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8651702404022217, + "num_tokens": 493424607.0, + "step": 12937 + }, + { + "epoch": 1.64584658440402, + "ewc_loss": 0.06636796146631241, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003218827478121966, + "grad_norm": 7.821604251861572, + "learning_rate": 1e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.841357946395874, + "num_tokens": 493459546.0, + "step": 12938 + }, + { + "epoch": 1.6459737946826105, + "ewc_loss": 0.06625044345855713, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032070756424218416, + "grad_norm": 7.731491565704346, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8676221370697021, + "num_tokens": 493494162.0, + "step": 12939 + }, + { + "epoch": 1.6461010049612008, + "ewc_loss": 0.06635850667953491, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032178816036321223, + "grad_norm": 7.767198085784912, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8608101606369019, + "num_tokens": 493537523.0, + "step": 12940 + }, + { + "epoch": 1.6462282152397913, + "ewc_loss": 0.06628705561161041, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032107369042932987, + "grad_norm": 7.68980598449707, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8628139495849609, + "num_tokens": 493580065.0, + "step": 12941 + }, + { + "epoch": 1.6463554255183819, + "ewc_loss": 0.06646629422903061, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003228660498280078, + "grad_norm": 7.730765342712402, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8710120916366577, + "num_tokens": 493626717.0, + "step": 12942 + }, + { + "epoch": 1.6464826357969724, + "ewc_loss": 0.06634871661663055, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003216903132852167, + "grad_norm": 7.713737487792969, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8695296049118042, + "num_tokens": 493666266.0, + "step": 12943 + }, + { + "epoch": 1.646609846075563, + "ewc_loss": 0.06644593179225922, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032266241032630205, + "grad_norm": 7.7627339363098145, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8526831269264221, + "num_tokens": 493703184.0, + "step": 12944 + }, + { + "epoch": 1.6467370563541535, + "ewc_loss": 0.06636139750480652, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032181706046685576, + "grad_norm": 7.759624004364014, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8784302473068237, + "num_tokens": 493744061.0, + "step": 12945 + }, + { + "epoch": 1.6468642666327438, + "ewc_loss": 0.06645120680332184, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003227151755709201, + "grad_norm": 7.757195472717285, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8636951446533203, + "num_tokens": 493783585.0, + "step": 12946 + }, + { + "epoch": 1.6469914769113343, + "ewc_loss": 0.06657597422599792, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032396288588643074, + "grad_norm": 7.794947624206543, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8696874976158142, + "num_tokens": 493827781.0, + "step": 12947 + }, + { + "epoch": 1.6471186871899248, + "ewc_loss": 0.06628118455410004, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003210150171071291, + "grad_norm": 7.7391533851623535, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8676548600196838, + "num_tokens": 493864418.0, + "step": 12948 + }, + { + "epoch": 1.6472458974685154, + "ewc_loss": 0.06667113304138184, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032491443562321365, + "grad_norm": 7.891992568969727, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8624454140663147, + "num_tokens": 493895226.0, + "step": 12949 + }, + { + "epoch": 1.6473731077471059, + "ewc_loss": 0.0662446916103363, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003206500259693712, + "grad_norm": 7.756038188934326, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8549562692642212, + "num_tokens": 493935210.0, + "step": 12950 + }, + { + "epoch": 1.6475003180256964, + "ewc_loss": 0.06657847762107849, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003239878569729626, + "grad_norm": 7.867452621459961, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8632912635803223, + "num_tokens": 493966368.0, + "step": 12951 + }, + { + "epoch": 1.647627528304287, + "ewc_loss": 0.06633418798446655, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003215449978597462, + "grad_norm": 7.767861843109131, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8565994501113892, + "num_tokens": 494004304.0, + "step": 12952 + }, + { + "epoch": 1.6477547385828775, + "ewc_loss": 0.06656139343976974, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032381704659201205, + "grad_norm": 7.832153797149658, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8711926937103271, + "num_tokens": 494044758.0, + "step": 12953 + }, + { + "epoch": 1.647881948861468, + "ewc_loss": 0.06621025502681732, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032030572765506804, + "grad_norm": 7.761440277099609, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8473759293556213, + "num_tokens": 494080624.0, + "step": 12954 + }, + { + "epoch": 1.6480091591400585, + "ewc_loss": 0.06659951061010361, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003241982194595039, + "grad_norm": 7.8379974365234375, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8689918518066406, + "num_tokens": 494116215.0, + "step": 12955 + }, + { + "epoch": 1.648136369418649, + "ewc_loss": 0.06609838455915451, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003191869764123112, + "grad_norm": 7.691203594207764, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8621490001678467, + "num_tokens": 494150317.0, + "step": 12956 + }, + { + "epoch": 1.6482635796972396, + "ewc_loss": 0.06658846139907837, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032408771221525967, + "grad_norm": 7.858142852783203, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8767343759536743, + "num_tokens": 494187273.0, + "step": 12957 + }, + { + "epoch": 1.64839078997583, + "ewc_loss": 0.06614267081022263, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003196298494003713, + "grad_norm": 7.6943817138671875, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8606439828872681, + "num_tokens": 494226921.0, + "step": 12958 + }, + { + "epoch": 1.6485180002544206, + "ewc_loss": 0.06661028414964676, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003243059618398547, + "grad_norm": 7.827414512634277, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8701063394546509, + "num_tokens": 494263141.0, + "step": 12959 + }, + { + "epoch": 1.6486452105330112, + "ewc_loss": 0.06609145551919937, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003191176801919937, + "grad_norm": 7.6433515548706055, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8588052988052368, + "num_tokens": 494307925.0, + "step": 12960 + }, + { + "epoch": 1.6487724208116017, + "ewc_loss": 0.06674348562955856, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003256379859521985, + "grad_norm": 7.771971702575684, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8572814464569092, + "num_tokens": 494353428.0, + "step": 12961 + }, + { + "epoch": 1.6488996310901922, + "ewc_loss": 0.06631244719028473, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032132765045389533, + "grad_norm": 7.770387649536133, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8522300720214844, + "num_tokens": 494383559.0, + "step": 12962 + }, + { + "epoch": 1.6490268413687827, + "ewc_loss": 0.06644909828901291, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032269410439766943, + "grad_norm": 7.7950263023376465, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8649681806564331, + "num_tokens": 494419564.0, + "step": 12963 + }, + { + "epoch": 1.649154051647373, + "ewc_loss": 0.06642317771911621, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003224349347874522, + "grad_norm": 7.666859149932861, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8673595786094666, + "num_tokens": 494466886.0, + "step": 12964 + }, + { + "epoch": 1.6492812619259636, + "ewc_loss": 0.06665672361850739, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003247703134547919, + "grad_norm": 7.856166839599609, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8459961414337158, + "num_tokens": 494506672.0, + "step": 12965 + }, + { + "epoch": 1.649408472204554, + "ewc_loss": 0.0663042813539505, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003212459560018033, + "grad_norm": 7.732861518859863, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.864342212677002, + "num_tokens": 494541881.0, + "step": 12966 + }, + { + "epoch": 1.6495356824831446, + "ewc_loss": 0.06669344007968903, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032513754558749497, + "grad_norm": 7.769835472106934, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.869197428226471, + "num_tokens": 494583546.0, + "step": 12967 + }, + { + "epoch": 1.6496628927617352, + "ewc_loss": 0.06643331795930862, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000322536303428933, + "grad_norm": 7.743003845214844, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8522748947143555, + "num_tokens": 494616625.0, + "step": 12968 + }, + { + "epoch": 1.6497901030403257, + "ewc_loss": 0.06655868887901306, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032379000913351774, + "grad_norm": 7.754891872406006, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8777909874916077, + "num_tokens": 494654590.0, + "step": 12969 + }, + { + "epoch": 1.6499173133189162, + "ewc_loss": 0.06651324778795242, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032333561102859676, + "grad_norm": 7.779393196105957, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8786000609397888, + "num_tokens": 494692471.0, + "step": 12970 + }, + { + "epoch": 1.6500445235975065, + "ewc_loss": 0.06646742671728134, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032287740032188594, + "grad_norm": 7.713277339935303, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8551416397094727, + "num_tokens": 494730347.0, + "step": 12971 + }, + { + "epoch": 1.650171733876097, + "ewc_loss": 0.06658876687288284, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003240907972212881, + "grad_norm": 7.799159526824951, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8510342836380005, + "num_tokens": 494763753.0, + "step": 12972 + }, + { + "epoch": 1.6502989441546876, + "ewc_loss": 0.06639138609170914, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032211700454354286, + "grad_norm": 7.7020111083984375, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8580969572067261, + "num_tokens": 494800416.0, + "step": 12973 + }, + { + "epoch": 1.6504261544332781, + "ewc_loss": 0.06725002825260162, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003258205542806536, + "grad_norm": 7.833441734313965, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8617479801177979, + "num_tokens": 494842733.0, + "step": 12974 + }, + { + "epoch": 1.6505533647118686, + "ewc_loss": 0.06640461087226868, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003222492232453078, + "grad_norm": 7.727900981903076, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8555217981338501, + "num_tokens": 494878661.0, + "step": 12975 + }, + { + "epoch": 1.6506805749904592, + "ewc_loss": 0.06661299616098404, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003243330866098404, + "grad_norm": 7.777819633483887, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8564084768295288, + "num_tokens": 494918142.0, + "step": 12976 + }, + { + "epoch": 1.6508077852690497, + "ewc_loss": 0.06636714935302734, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032187465694732964, + "grad_norm": 7.760952949523926, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8625922203063965, + "num_tokens": 494956947.0, + "step": 12977 + }, + { + "epoch": 1.6509349955476402, + "ewc_loss": 0.06654244661331177, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003236275806557387, + "grad_norm": 7.779736042022705, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.858261227607727, + "num_tokens": 494989850.0, + "step": 12978 + }, + { + "epoch": 1.6510622058262308, + "ewc_loss": 0.06645455211400986, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032274864497594535, + "grad_norm": 7.892087936401367, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8552008867263794, + "num_tokens": 495026770.0, + "step": 12979 + }, + { + "epoch": 1.6511894161048213, + "ewc_loss": 0.06631828844547272, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000321386061841622, + "grad_norm": 7.686500072479248, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8754786252975464, + "num_tokens": 495065009.0, + "step": 12980 + }, + { + "epoch": 1.6513166263834118, + "ewc_loss": 0.0666516125202179, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032471923623234034, + "grad_norm": 7.83203649520874, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8642408847808838, + "num_tokens": 495096914.0, + "step": 12981 + }, + { + "epoch": 1.6514438366620023, + "ewc_loss": 0.06627225875854492, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003209256974514574, + "grad_norm": 7.701538562774658, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8676913976669312, + "num_tokens": 495138348.0, + "step": 12982 + }, + { + "epoch": 1.6515710469405929, + "ewc_loss": 0.0666225329041481, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003244284598622471, + "grad_norm": 7.80412483215332, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8585752844810486, + "num_tokens": 495177026.0, + "step": 12983 + }, + { + "epoch": 1.6516982572191834, + "ewc_loss": 0.06631820648908615, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213851887267083, + "grad_norm": 7.716548919677734, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8642569780349731, + "num_tokens": 495213850.0, + "step": 12984 + }, + { + "epoch": 1.651825467497774, + "ewc_loss": 0.066672682762146, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000324930006172508, + "grad_norm": 7.792035102844238, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.872880220413208, + "num_tokens": 495257213.0, + "step": 12985 + }, + { + "epoch": 1.6519526777763645, + "ewc_loss": 0.06631156802177429, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213188028894365, + "grad_norm": 7.687567710876465, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8538016080856323, + "num_tokens": 495300018.0, + "step": 12986 + }, + { + "epoch": 1.652079888054955, + "ewc_loss": 0.06677160412073135, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000325919158058241, + "grad_norm": 7.826858043670654, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8636087775230408, + "num_tokens": 495336471.0, + "step": 12987 + }, + { + "epoch": 1.6522070983335455, + "ewc_loss": 0.06628268957138062, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003210300055798143, + "grad_norm": 7.693670749664307, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8671684265136719, + "num_tokens": 495374680.0, + "step": 12988 + }, + { + "epoch": 1.6523343086121358, + "ewc_loss": 0.06680481135845184, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003262512618675828, + "grad_norm": 7.824092864990234, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8665037155151367, + "num_tokens": 495414096.0, + "step": 12989 + }, + { + "epoch": 1.6524615188907263, + "ewc_loss": 0.06634203344583511, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032162346178665757, + "grad_norm": 7.7332329750061035, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8528145551681519, + "num_tokens": 495452955.0, + "step": 12990 + }, + { + "epoch": 1.6525887291693169, + "ewc_loss": 0.06669071316719055, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032511030440218747, + "grad_norm": 7.856468200683594, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8603615760803223, + "num_tokens": 495490091.0, + "step": 12991 + }, + { + "epoch": 1.6527159394479074, + "ewc_loss": 0.06625555455684662, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032075864146463573, + "grad_norm": 7.694352149963379, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8702428936958313, + "num_tokens": 495524543.0, + "step": 12992 + }, + { + "epoch": 1.652843149726498, + "ewc_loss": 0.0667349100112915, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032555218786001205, + "grad_norm": 7.839691638946533, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8542237281799316, + "num_tokens": 495562068.0, + "step": 12993 + }, + { + "epoch": 1.6529703600050885, + "ewc_loss": 0.06636185199022293, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032182165887206793, + "grad_norm": 7.715149402618408, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8728716969490051, + "num_tokens": 495598355.0, + "step": 12994 + }, + { + "epoch": 1.6530975702836788, + "ewc_loss": 0.06682677567005157, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032647090847603977, + "grad_norm": 7.846075057983398, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8500592708587646, + "num_tokens": 495634735.0, + "step": 12995 + }, + { + "epoch": 1.6532247805622693, + "ewc_loss": 0.0663841962814331, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003220450889784843, + "grad_norm": 7.753334045410156, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8637188673019409, + "num_tokens": 495668804.0, + "step": 12996 + }, + { + "epoch": 1.6533519908408598, + "ewc_loss": 0.06663058698177338, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032450896105729043, + "grad_norm": 7.742969036102295, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8674497604370117, + "num_tokens": 495707129.0, + "step": 12997 + }, + { + "epoch": 1.6534792011194503, + "ewc_loss": 0.06658750027418137, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003240781370550394, + "grad_norm": 7.7654852867126465, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8435552716255188, + "num_tokens": 495746501.0, + "step": 12998 + }, + { + "epoch": 1.6536064113980409, + "ewc_loss": 0.06650708615779877, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003232739691156894, + "grad_norm": 7.749316692352295, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8796662092208862, + "num_tokens": 495789723.0, + "step": 12999 + }, + { + "epoch": 1.6537336216766314, + "ewc_loss": 0.06664031744003296, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003246063133701682, + "grad_norm": 7.809638500213623, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8546892404556274, + "num_tokens": 495827073.0, + "step": 13000 + }, + { + "epoch": 1.653860831955222, + "ewc_loss": 0.06644106656312943, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003226138069294393, + "grad_norm": 7.798421382904053, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8532401323318481, + "num_tokens": 495862362.0, + "step": 13001 + }, + { + "epoch": 1.6539880422338125, + "ewc_loss": 0.06661072373390198, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032431038562208414, + "grad_norm": 7.739475250244141, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8662774562835693, + "num_tokens": 495905546.0, + "step": 13002 + }, + { + "epoch": 1.654115252512403, + "ewc_loss": 0.06660129874944687, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003242161183152348, + "grad_norm": 7.831204891204834, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8607130646705627, + "num_tokens": 495942291.0, + "step": 13003 + }, + { + "epoch": 1.6542424627909935, + "ewc_loss": 0.06639479845762253, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032215111423283815, + "grad_norm": 7.745090484619141, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8528760671615601, + "num_tokens": 495978631.0, + "step": 13004 + }, + { + "epoch": 1.654369673069584, + "ewc_loss": 0.06664516031742096, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003246547421440482, + "grad_norm": 7.783878803253174, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8580323457717896, + "num_tokens": 496016626.0, + "step": 13005 + }, + { + "epoch": 1.6544968833481746, + "ewc_loss": 0.06645108759403229, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003227139823138714, + "grad_norm": 7.738694667816162, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.854049801826477, + "num_tokens": 496051949.0, + "step": 13006 + }, + { + "epoch": 1.654624093626765, + "ewc_loss": 0.06655392050743103, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032374236616306007, + "grad_norm": 7.708681106567383, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8690236806869507, + "num_tokens": 496092693.0, + "step": 13007 + }, + { + "epoch": 1.6547513039053556, + "ewc_loss": 0.06664532423019409, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003246563719585538, + "grad_norm": 7.80997896194458, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.857765257358551, + "num_tokens": 496132853.0, + "step": 13008 + }, + { + "epoch": 1.6548785141839462, + "ewc_loss": 0.06636612117290497, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003218643250875175, + "grad_norm": 7.741730690002441, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8480888605117798, + "num_tokens": 496168303.0, + "step": 13009 + }, + { + "epoch": 1.6550057244625367, + "ewc_loss": 0.06660092622041702, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003242123930249363, + "grad_norm": 7.80296516418457, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8501402139663696, + "num_tokens": 496206132.0, + "step": 13010 + }, + { + "epoch": 1.6551329347411272, + "ewc_loss": 0.06631506979465485, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213538439013064, + "grad_norm": 7.75055456161499, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8566591739654541, + "num_tokens": 496247542.0, + "step": 13011 + }, + { + "epoch": 1.6552601450197177, + "ewc_loss": 0.06648830324411392, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003230861620977521, + "grad_norm": 7.75125789642334, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8735548853874207, + "num_tokens": 496285698.0, + "step": 13012 + }, + { + "epoch": 1.655387355298308, + "ewc_loss": 0.06631636619567871, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213668242096901, + "grad_norm": 7.792355537414551, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8652515411376953, + "num_tokens": 496322216.0, + "step": 13013 + }, + { + "epoch": 1.6555145655768986, + "ewc_loss": 0.06634338200092316, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003216369659639895, + "grad_norm": 7.765176773071289, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8601261973381042, + "num_tokens": 496357687.0, + "step": 13014 + }, + { + "epoch": 1.655641775855489, + "ewc_loss": 0.06648865342140198, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032308968366123736, + "grad_norm": 7.75692081451416, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8741308450698853, + "num_tokens": 496400875.0, + "step": 13015 + }, + { + "epoch": 1.6557689861340796, + "ewc_loss": 0.06633303314447403, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003215334436390549, + "grad_norm": 7.750490665435791, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8651763200759888, + "num_tokens": 496432076.0, + "step": 13016 + }, + { + "epoch": 1.6558961964126702, + "ewc_loss": 0.06649380922317505, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032314122654497623, + "grad_norm": 7.831460952758789, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8607203364372253, + "num_tokens": 496466852.0, + "step": 13017 + }, + { + "epoch": 1.6560234066912607, + "ewc_loss": 0.0663514956831932, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003217180783394724, + "grad_norm": 7.817183017730713, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8613064289093018, + "num_tokens": 496508890.0, + "step": 13018 + }, + { + "epoch": 1.6561506169698512, + "ewc_loss": 0.06639891862869263, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032219235436059535, + "grad_norm": 7.788907051086426, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8780540227890015, + "num_tokens": 496537964.0, + "step": 13019 + }, + { + "epoch": 1.6562778272484415, + "ewc_loss": 0.06649819016456604, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003231850278098136, + "grad_norm": 7.840746879577637, + "learning_rate": 1e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8451896905899048, + "num_tokens": 496571263.0, + "step": 13020 + }, + { + "epoch": 1.656405037527032, + "ewc_loss": 0.06628274917602539, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003210306167602539, + "grad_norm": 7.756947040557861, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8702294826507568, + "num_tokens": 496610058.0, + "step": 13021 + }, + { + "epoch": 1.6565322478056226, + "ewc_loss": 0.06654642522335052, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003236673946958035, + "grad_norm": 7.846505641937256, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8565807342529297, + "num_tokens": 496640542.0, + "step": 13022 + }, + { + "epoch": 1.656659458084213, + "ewc_loss": 0.06629006564617157, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003211038128938526, + "grad_norm": 7.751089096069336, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8610543012619019, + "num_tokens": 496678030.0, + "step": 13023 + }, + { + "epoch": 1.6567866683628036, + "ewc_loss": 0.06638786196708679, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003220817307010293, + "grad_norm": 7.822535037994385, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8589839935302734, + "num_tokens": 496714057.0, + "step": 13024 + }, + { + "epoch": 1.6569138786413942, + "ewc_loss": 0.06637340784072876, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003219371719751507, + "grad_norm": 7.762959957122803, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8792808055877686, + "num_tokens": 496744501.0, + "step": 13025 + }, + { + "epoch": 1.6570410889199847, + "ewc_loss": 0.06650872528553009, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003232903254684061, + "grad_norm": 7.817126274108887, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8674781918525696, + "num_tokens": 496780233.0, + "step": 13026 + }, + { + "epoch": 1.6571682991985752, + "ewc_loss": 0.06630460917949677, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003212492447346449, + "grad_norm": 7.778581142425537, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8641517162322998, + "num_tokens": 496809645.0, + "step": 13027 + }, + { + "epoch": 1.6572955094771658, + "ewc_loss": 0.06664588302373886, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003222205559723079, + "grad_norm": 7.737568378448486, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8631840944290161, + "num_tokens": 496854871.0, + "step": 13028 + }, + { + "epoch": 1.6574227197557563, + "ewc_loss": 0.06639967858791351, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003221998631488532, + "grad_norm": 7.718228340148926, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.880681037902832, + "num_tokens": 496891958.0, + "step": 13029 + }, + { + "epoch": 1.6575499300343468, + "ewc_loss": 0.06677143275737762, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003234760370105505, + "grad_norm": 7.811238765716553, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8559250831604004, + "num_tokens": 496928550.0, + "step": 13030 + }, + { + "epoch": 1.6576771403129373, + "ewc_loss": 0.06654752790927887, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032123696291819215, + "grad_norm": 7.690773963928223, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8619550466537476, + "num_tokens": 496972009.0, + "step": 13031 + }, + { + "epoch": 1.6578043505915279, + "ewc_loss": 0.06682658940553665, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003240276128053665, + "grad_norm": 7.7708635330200195, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8603700399398804, + "num_tokens": 497013605.0, + "step": 13032 + }, + { + "epoch": 1.6579315608701184, + "ewc_loss": 0.06674472987651825, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003232090384699404, + "grad_norm": 7.789185523986816, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8650593757629395, + "num_tokens": 497048712.0, + "step": 13033 + }, + { + "epoch": 1.658058771148709, + "ewc_loss": 0.06667888164520264, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003225505643058568, + "grad_norm": 7.739569664001465, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8780578374862671, + "num_tokens": 497089161.0, + "step": 13034 + }, + { + "epoch": 1.6581859814272994, + "ewc_loss": 0.06643494218587875, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003225525433663279, + "grad_norm": 7.784439563751221, + "learning_rate": 1e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8347172141075134, + "num_tokens": 497127253.0, + "step": 13035 + }, + { + "epoch": 1.65831319170589, + "ewc_loss": 0.06634582579135895, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032166141318157315, + "grad_norm": 7.716917514801025, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8584150075912476, + "num_tokens": 497167637.0, + "step": 13036 + }, + { + "epoch": 1.6584404019844805, + "ewc_loss": 0.06669919192790985, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032275367993861437, + "grad_norm": 7.780325889587402, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8631536960601807, + "num_tokens": 497205663.0, + "step": 13037 + }, + { + "epoch": 1.6585676122630708, + "ewc_loss": 0.06636448949575424, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032184802694246173, + "grad_norm": 7.736026763916016, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8795693516731262, + "num_tokens": 497242146.0, + "step": 13038 + }, + { + "epoch": 1.6586948225416613, + "ewc_loss": 0.06646064668893814, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032280958839692175, + "grad_norm": 7.751882076263428, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8683236241340637, + "num_tokens": 497279339.0, + "step": 13039 + }, + { + "epoch": 1.6588220328202519, + "ewc_loss": 0.06637489795684814, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000321952102240175, + "grad_norm": 7.714579105377197, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8538808226585388, + "num_tokens": 497318530.0, + "step": 13040 + }, + { + "epoch": 1.6589492430988424, + "ewc_loss": 0.06648111343383789, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003230142465326935, + "grad_norm": 7.771483421325684, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8553111553192139, + "num_tokens": 497352943.0, + "step": 13041 + }, + { + "epoch": 1.659076453377433, + "ewc_loss": 0.06631232053041458, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003213263407815248, + "grad_norm": 7.765221118927002, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8582330942153931, + "num_tokens": 497387729.0, + "step": 13042 + }, + { + "epoch": 1.6592036636560235, + "ewc_loss": 0.06645748019218445, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032277798163704574, + "grad_norm": 7.768299102783203, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8639407753944397, + "num_tokens": 497427875.0, + "step": 13043 + }, + { + "epoch": 1.6593308739346138, + "ewc_loss": 0.06636980921030045, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003219011996407062, + "grad_norm": 7.740893363952637, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8553519248962402, + "num_tokens": 497469076.0, + "step": 13044 + }, + { + "epoch": 1.6594580842132043, + "ewc_loss": 0.06645601242780685, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032276325509883463, + "grad_norm": 7.79620361328125, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.865098237991333, + "num_tokens": 497509556.0, + "step": 13045 + }, + { + "epoch": 1.6595852944917948, + "ewc_loss": 0.06636994332075119, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032190256752073765, + "grad_norm": 7.737367153167725, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8556894063949585, + "num_tokens": 497548151.0, + "step": 13046 + }, + { + "epoch": 1.6597125047703853, + "ewc_loss": 0.06646601855754852, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032286334317177534, + "grad_norm": 7.816578388214111, + "learning_rate": 1e-06, + "loss": 0.5546, + "mean_token_accuracy": 0.836982011795044, + "num_tokens": 497589088.0, + "step": 13047 + }, + { + "epoch": 1.6598397150489759, + "ewc_loss": 0.06629802286624908, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032118335366249084, + "grad_norm": 7.7431960105896, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8727452754974365, + "num_tokens": 497632095.0, + "step": 13048 + }, + { + "epoch": 1.6599669253275664, + "ewc_loss": 0.06659291684627533, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032413232838734984, + "grad_norm": 7.836352348327637, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8749579191207886, + "num_tokens": 497665148.0, + "step": 13049 + }, + { + "epoch": 1.660094135606157, + "ewc_loss": 0.0662475973367691, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032067912979982793, + "grad_norm": 7.793292045593262, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8570012450218201, + "num_tokens": 497700134.0, + "step": 13050 + }, + { + "epoch": 1.6602213458847475, + "ewc_loss": 0.06648267805576324, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032302987528964877, + "grad_norm": 7.768859386444092, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8671935200691223, + "num_tokens": 497743101.0, + "step": 13051 + }, + { + "epoch": 1.660348556163338, + "ewc_loss": 0.06632702052593231, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003214732860215008, + "grad_norm": 7.805197238922119, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.854995846748352, + "num_tokens": 497775112.0, + "step": 13052 + }, + { + "epoch": 1.6604757664419285, + "ewc_loss": 0.06622383743524551, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003204414970241487, + "grad_norm": 7.755995750427246, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8740007877349854, + "num_tokens": 497813320.0, + "step": 13053 + }, + { + "epoch": 1.660602976720519, + "ewc_loss": 0.06666672974824905, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032242899760603905, + "grad_norm": 13.89000129699707, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8581533432006836, + "num_tokens": 497852780.0, + "step": 13054 + }, + { + "epoch": 1.6607301869991096, + "ewc_loss": 0.07560384273529053, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0004118001088500023, + "grad_norm": 8.84367847442627, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8655788898468018, + "num_tokens": 497886313.0, + "step": 13055 + }, + { + "epoch": 1.6608573972777, + "ewc_loss": 0.06583456695079803, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003141073393635452, + "grad_norm": 7.749534606933594, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.861319363117218, + "num_tokens": 497924108.0, + "step": 13056 + }, + { + "epoch": 1.6609846075562906, + "ewc_loss": 0.06776948273181915, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003358979010954499, + "grad_norm": 8.152909278869629, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8532285690307617, + "num_tokens": 497964656.0, + "step": 13057 + }, + { + "epoch": 1.6611118178348812, + "ewc_loss": 0.06700162589550018, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032821937929838896, + "grad_norm": 7.8349504470825195, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.849170446395874, + "num_tokens": 498005436.0, + "step": 13058 + }, + { + "epoch": 1.6612390281134717, + "ewc_loss": 0.06716174632310867, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003298205847386271, + "grad_norm": 8.024104118347168, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8591952323913574, + "num_tokens": 498042237.0, + "step": 13059 + }, + { + "epoch": 1.6613662383920622, + "ewc_loss": 0.06649177521467209, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000323120882967487, + "grad_norm": 7.7411580085754395, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8623714447021484, + "num_tokens": 498083008.0, + "step": 13060 + }, + { + "epoch": 1.6614934486706527, + "ewc_loss": 0.06717240810394287, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000329927250277251, + "grad_norm": 7.978987693786621, + "learning_rate": 1e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8405585289001465, + "num_tokens": 498120550.0, + "step": 13061 + }, + { + "epoch": 1.661620658949243, + "ewc_loss": 0.06652644276618958, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032346753869205713, + "grad_norm": 7.818443298339844, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8594076633453369, + "num_tokens": 498153033.0, + "step": 13062 + }, + { + "epoch": 1.6617478692278336, + "ewc_loss": 0.0668334886431694, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000326538021909073, + "grad_norm": 7.843480587005615, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8640754818916321, + "num_tokens": 498190491.0, + "step": 13063 + }, + { + "epoch": 1.661875079506424, + "ewc_loss": 0.06658481061458588, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003240512451156974, + "grad_norm": 7.822970390319824, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8559401035308838, + "num_tokens": 498235889.0, + "step": 13064 + }, + { + "epoch": 1.6620022897850146, + "ewc_loss": 0.06657387316226959, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003239418438170105, + "grad_norm": 7.832314968109131, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8601189851760864, + "num_tokens": 498277191.0, + "step": 13065 + }, + { + "epoch": 1.6621295000636052, + "ewc_loss": 0.06659416854381561, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032414475572295487, + "grad_norm": 13.781264305114746, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8505796194076538, + "num_tokens": 498318443.0, + "step": 13066 + }, + { + "epoch": 1.6622567103421957, + "ewc_loss": 0.07544302940368652, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0004101920349057764, + "grad_norm": 8.775358200073242, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8496578931808472, + "num_tokens": 498355955.0, + "step": 13067 + }, + { + "epoch": 1.662383920620786, + "ewc_loss": 0.06625361740589142, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.000318297854391858, + "grad_norm": 7.733285427093506, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.863554060459137, + "num_tokens": 498388588.0, + "step": 13068 + }, + { + "epoch": 1.6625111308993765, + "ewc_loss": 0.06780946254730225, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033629778772592545, + "grad_norm": 8.124176025390625, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8610748052597046, + "num_tokens": 498430661.0, + "step": 13069 + }, + { + "epoch": 1.662638341177967, + "ewc_loss": 0.06743831187486649, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003325862344354391, + "grad_norm": 7.817284107208252, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.851103663444519, + "num_tokens": 498470428.0, + "step": 13070 + }, + { + "epoch": 1.6627655514565576, + "ewc_loss": 0.06727159768342972, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003309191088192165, + "grad_norm": 7.9744768142700195, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.862443208694458, + "num_tokens": 498512031.0, + "step": 13071 + }, + { + "epoch": 1.662892761735148, + "ewc_loss": 0.06683403253555298, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032654349342919886, + "grad_norm": 7.796083927154541, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8853611946105957, + "num_tokens": 498548344.0, + "step": 13072 + }, + { + "epoch": 1.6630199720137386, + "ewc_loss": 0.06726531684398651, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033085630275309086, + "grad_norm": 7.919228553771973, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.876198410987854, + "num_tokens": 498590115.0, + "step": 13073 + }, + { + "epoch": 1.6631471822923292, + "ewc_loss": 0.06691006571054459, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003273037727922201, + "grad_norm": 7.8017191886901855, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8630627989768982, + "num_tokens": 498634034.0, + "step": 13074 + }, + { + "epoch": 1.6632743925709197, + "ewc_loss": 0.06697073578834534, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032791050034575164, + "grad_norm": 7.9710283279418945, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8550707101821899, + "num_tokens": 498675086.0, + "step": 13075 + }, + { + "epoch": 1.6634016028495102, + "ewc_loss": 0.06664188206195831, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003246220003347844, + "grad_norm": 7.7426958084106445, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8697745203971863, + "num_tokens": 498711374.0, + "step": 13076 + }, + { + "epoch": 1.6635288131281007, + "ewc_loss": 0.06706977635622025, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003289009036961943, + "grad_norm": 7.916524410247803, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8685330152511597, + "num_tokens": 498753050.0, + "step": 13077 + }, + { + "epoch": 1.6636560234066913, + "ewc_loss": 0.06651147454977036, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032331785769201815, + "grad_norm": 7.8288493156433105, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8640363216400146, + "num_tokens": 498790406.0, + "step": 13078 + }, + { + "epoch": 1.6637832336852818, + "ewc_loss": 0.06692270934581757, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003274301707278937, + "grad_norm": 7.852314472198486, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.86191725730896, + "num_tokens": 498827712.0, + "step": 13079 + }, + { + "epoch": 1.6639104439638723, + "ewc_loss": 0.06670745462179184, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032527768053114414, + "grad_norm": 7.854591369628906, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8678093552589417, + "num_tokens": 498862283.0, + "step": 13080 + }, + { + "epoch": 1.6640376542424629, + "ewc_loss": 0.06658805906772614, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003240836667828262, + "grad_norm": 7.80354118347168, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8639745116233826, + "num_tokens": 498902686.0, + "step": 13081 + }, + { + "epoch": 1.6641648645210534, + "ewc_loss": 0.06675128638744354, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000325716013321653, + "grad_norm": 7.850617408752441, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8508191704750061, + "num_tokens": 498942785.0, + "step": 13082 + }, + { + "epoch": 1.664292074799644, + "ewc_loss": 0.06656660884618759, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032386922976002097, + "grad_norm": 7.848487377166748, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8687511086463928, + "num_tokens": 498982761.0, + "step": 13083 + }, + { + "epoch": 1.6644192850782344, + "ewc_loss": 0.06668116897344589, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003250148147344589, + "grad_norm": 7.827136039733887, + "learning_rate": 1e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8445006608963013, + "num_tokens": 499025762.0, + "step": 13084 + }, + { + "epoch": 1.664546495356825, + "ewc_loss": 0.06662875413894653, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003244906838517636, + "grad_norm": 7.808047294616699, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8534594774246216, + "num_tokens": 499063394.0, + "step": 13085 + }, + { + "epoch": 1.6646737056354155, + "ewc_loss": 0.06662977486848831, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032450087019242346, + "grad_norm": 7.833461761474609, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.873760461807251, + "num_tokens": 499105824.0, + "step": 13086 + }, + { + "epoch": 1.6648009159140058, + "ewc_loss": 0.06661224365234375, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000324325606925413, + "grad_norm": 7.818249702453613, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8701003789901733, + "num_tokens": 499140935.0, + "step": 13087 + }, + { + "epoch": 1.6649281261925963, + "ewc_loss": 0.06683307886123657, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003240925434511155, + "grad_norm": 7.8232245445251465, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8629497289657593, + "num_tokens": 499179492.0, + "step": 13088 + }, + { + "epoch": 1.6650553364711869, + "ewc_loss": 0.0666816383600235, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032501950045116246, + "grad_norm": 7.826046466827393, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8736261129379272, + "num_tokens": 499214716.0, + "step": 13089 + }, + { + "epoch": 1.6651825467497774, + "ewc_loss": 0.06664247065782547, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032462782110087574, + "grad_norm": 7.824186325073242, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.868949294090271, + "num_tokens": 499250337.0, + "step": 13090 + }, + { + "epoch": 1.665309757028368, + "ewc_loss": 0.06661525368690491, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003243557002861053, + "grad_norm": 7.831050872802734, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8670293092727661, + "num_tokens": 499283675.0, + "step": 13091 + }, + { + "epoch": 1.6654369673069584, + "ewc_loss": 0.0667475163936615, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003256782947573811, + "grad_norm": 7.904162406921387, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8670111894607544, + "num_tokens": 499319153.0, + "step": 13092 + }, + { + "epoch": 1.6655641775855488, + "ewc_loss": 0.06659696996212006, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003241728700231761, + "grad_norm": 7.825108051300049, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8599115610122681, + "num_tokens": 499356675.0, + "step": 13093 + }, + { + "epoch": 1.6656913878641393, + "ewc_loss": 0.06662169098854065, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032442001975141466, + "grad_norm": 7.781618118286133, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8781132102012634, + "num_tokens": 499392129.0, + "step": 13094 + }, + { + "epoch": 1.6658185981427298, + "ewc_loss": 0.06663741171360016, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003245772095397115, + "grad_norm": 7.8831071853637695, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8739687204360962, + "num_tokens": 499424958.0, + "step": 13095 + }, + { + "epoch": 1.6659458084213203, + "ewc_loss": 0.06653741002082825, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032357717282138765, + "grad_norm": 7.739866733551025, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8719831705093384, + "num_tokens": 499467769.0, + "step": 13096 + }, + { + "epoch": 1.6660730186999109, + "ewc_loss": 0.06692272424697876, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032743034535087645, + "grad_norm": 7.972204685211182, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8531726002693176, + "num_tokens": 499513828.0, + "step": 13097 + }, + { + "epoch": 1.6662002289785014, + "ewc_loss": 0.06642661243677139, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003224692482035607, + "grad_norm": 7.7682623863220215, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8689517378807068, + "num_tokens": 499548219.0, + "step": 13098 + }, + { + "epoch": 1.666327439257092, + "ewc_loss": 0.0670054703950882, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003282578254584223, + "grad_norm": 7.919347763061523, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8467562198638916, + "num_tokens": 499587158.0, + "step": 13099 + }, + { + "epoch": 1.6664546495356825, + "ewc_loss": 0.06651619076728821, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032336500589735806, + "grad_norm": 7.7620015144348145, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8655749559402466, + "num_tokens": 499625235.0, + "step": 13100 + }, + { + "epoch": 1.666581859814273, + "ewc_loss": 0.0670149102807045, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032835223828442395, + "grad_norm": 7.954265117645264, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8437340259552002, + "num_tokens": 499662842.0, + "step": 13101 + }, + { + "epoch": 1.6667090700928635, + "ewc_loss": 0.06651376187801361, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032334073330275714, + "grad_norm": 7.756779670715332, + "learning_rate": 1e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8404396176338196, + "num_tokens": 499703464.0, + "step": 13102 + }, + { + "epoch": 1.666836280371454, + "ewc_loss": 0.06704463064670563, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003286494466010481, + "grad_norm": 7.965723037719727, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8627068996429443, + "num_tokens": 499739160.0, + "step": 13103 + }, + { + "epoch": 1.6669634906500446, + "ewc_loss": 0.06642122566699982, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032241534790955484, + "grad_norm": 7.767902851104736, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8495362401008606, + "num_tokens": 499775024.0, + "step": 13104 + }, + { + "epoch": 1.667090700928635, + "ewc_loss": 0.06700733304023743, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032827648101374507, + "grad_norm": 7.884585857391357, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8654652833938599, + "num_tokens": 499816479.0, + "step": 13105 + }, + { + "epoch": 1.6672179112072256, + "ewc_loss": 0.06657944619655609, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032399757765233517, + "grad_norm": 7.804788589477539, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8657771348953247, + "num_tokens": 499848790.0, + "step": 13106 + }, + { + "epoch": 1.6673451214858162, + "ewc_loss": 0.06684965640306473, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032669969368726015, + "grad_norm": 7.794294834136963, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.881885826587677, + "num_tokens": 499890955.0, + "step": 13107 + }, + { + "epoch": 1.6674723317644067, + "ewc_loss": 0.06672462821006775, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003254494513384998, + "grad_norm": 7.850616455078125, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8721421360969543, + "num_tokens": 499924813.0, + "step": 13108 + }, + { + "epoch": 1.6675995420429972, + "ewc_loss": 0.06663419306278229, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003245451080147177, + "grad_norm": 7.7809038162231445, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8596909642219543, + "num_tokens": 499960032.0, + "step": 13109 + }, + { + "epoch": 1.6677267523215877, + "ewc_loss": 0.06677065789699554, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032590972841717303, + "grad_norm": 7.843441009521484, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8774442672729492, + "num_tokens": 499995868.0, + "step": 13110 + }, + { + "epoch": 1.667853962600178, + "ewc_loss": 0.06661790609359741, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032438221387565136, + "grad_norm": 7.756491184234619, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8678903579711914, + "num_tokens": 500031662.0, + "step": 13111 + }, + { + "epoch": 1.6679811728787686, + "ewc_loss": 0.06689296662807465, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032713281689211726, + "grad_norm": 7.885419845581055, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8544844388961792, + "num_tokens": 500070189.0, + "step": 13112 + }, + { + "epoch": 1.668108383157359, + "ewc_loss": 0.06656059622764587, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003238091303501278, + "grad_norm": 7.785678386688232, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8599416017532349, + "num_tokens": 500109999.0, + "step": 13113 + }, + { + "epoch": 1.6682355934359496, + "ewc_loss": 0.06682312488555908, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032643438316881657, + "grad_norm": 7.868880271911621, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8460054397583008, + "num_tokens": 500151473.0, + "step": 13114 + }, + { + "epoch": 1.6683628037145402, + "ewc_loss": 0.06656485795974731, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000323851709254086, + "grad_norm": 8.011205673217773, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8614406585693359, + "num_tokens": 500187436.0, + "step": 13115 + }, + { + "epoch": 1.6684900139931307, + "ewc_loss": 0.06642575562000275, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000322460662573576, + "grad_norm": 7.751502513885498, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8618982434272766, + "num_tokens": 500222433.0, + "step": 13116 + }, + { + "epoch": 1.668617224271721, + "ewc_loss": 0.06673004478216171, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003255035844631493, + "grad_norm": 7.836721420288086, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8648447394371033, + "num_tokens": 500257390.0, + "step": 13117 + }, + { + "epoch": 1.6687444345503115, + "ewc_loss": 0.06644529104232788, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000322656036587432, + "grad_norm": 7.734506607055664, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8635438680648804, + "num_tokens": 500301260.0, + "step": 13118 + }, + { + "epoch": 1.668871644828902, + "ewc_loss": 0.0668218731880188, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003264218394178897, + "grad_norm": 7.840281009674072, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8682683110237122, + "num_tokens": 500343453.0, + "step": 13119 + }, + { + "epoch": 1.6689988551074926, + "ewc_loss": 0.06646998226642609, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032290289527736604, + "grad_norm": 7.7729268074035645, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8700680732727051, + "num_tokens": 500375133.0, + "step": 13120 + }, + { + "epoch": 1.669126065386083, + "ewc_loss": 0.06673571467399597, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032556024962104857, + "grad_norm": 7.843837261199951, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8591052889823914, + "num_tokens": 500408540.0, + "step": 13121 + }, + { + "epoch": 1.6692532756646736, + "ewc_loss": 0.06661678850650787, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003243710089009255, + "grad_norm": 7.808434963226318, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8765231966972351, + "num_tokens": 500446243.0, + "step": 13122 + }, + { + "epoch": 1.6693804859432642, + "ewc_loss": 0.0666133463382721, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032433660817332566, + "grad_norm": 7.850854873657227, + "learning_rate": 1e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8397467732429504, + "num_tokens": 500484313.0, + "step": 13123 + }, + { + "epoch": 1.6695076962218547, + "ewc_loss": 0.0666007250547409, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003242103848606348, + "grad_norm": 7.756920337677002, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8744708299636841, + "num_tokens": 500525566.0, + "step": 13124 + }, + { + "epoch": 1.6696349065004452, + "ewc_loss": 0.06685327738523483, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032673589885234833, + "grad_norm": 7.82844877243042, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.86850905418396, + "num_tokens": 500566274.0, + "step": 13125 + }, + { + "epoch": 1.6697621167790357, + "ewc_loss": 0.06655797362327576, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032378282048739493, + "grad_norm": 7.736605644226074, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8552402257919312, + "num_tokens": 500602769.0, + "step": 13126 + }, + { + "epoch": 1.6698893270576263, + "ewc_loss": 0.06690502166748047, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003272533358540386, + "grad_norm": 7.896406650543213, + "learning_rate": 1e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8467649817466736, + "num_tokens": 500642895.0, + "step": 13127 + }, + { + "epoch": 1.6700165373362168, + "ewc_loss": 0.06647610664367676, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032296424615196884, + "grad_norm": 7.7105841636657715, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8605577945709229, + "num_tokens": 500681461.0, + "step": 13128 + }, + { + "epoch": 1.6701437476148073, + "ewc_loss": 0.06709276139736176, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003291307657491416, + "grad_norm": 7.853187084197998, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8473960757255554, + "num_tokens": 500723395.0, + "step": 13129 + }, + { + "epoch": 1.6702709578933979, + "ewc_loss": 0.06645205616950989, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003227236738894135, + "grad_norm": 7.707161903381348, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8642274141311646, + "num_tokens": 500760571.0, + "step": 13130 + }, + { + "epoch": 1.6703981681719884, + "ewc_loss": 0.06701883673667908, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003283914702478796, + "grad_norm": 7.882181167602539, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8541566133499146, + "num_tokens": 500797858.0, + "step": 13131 + }, + { + "epoch": 1.670525378450579, + "ewc_loss": 0.0666058361530304, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003242614329792559, + "grad_norm": 7.717410087585449, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8685511350631714, + "num_tokens": 500840199.0, + "step": 13132 + }, + { + "epoch": 1.6706525887291694, + "ewc_loss": 0.06698133051395416, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032801643828861415, + "grad_norm": 7.752459526062012, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8566660284996033, + "num_tokens": 500883290.0, + "step": 13133 + }, + { + "epoch": 1.67077979900776, + "ewc_loss": 0.06687033921480179, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032690653461031616, + "grad_norm": 7.812393665313721, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8675273656845093, + "num_tokens": 500923072.0, + "step": 13134 + }, + { + "epoch": 1.6709070092863505, + "ewc_loss": 0.06684231758117676, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032662629382684827, + "grad_norm": 7.765758514404297, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8656121492385864, + "num_tokens": 500958726.0, + "step": 13135 + }, + { + "epoch": 1.6710342195649408, + "ewc_loss": 0.06691090762615204, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000327312242006883, + "grad_norm": 7.843009948730469, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8585513234138489, + "num_tokens": 500996746.0, + "step": 13136 + }, + { + "epoch": 1.6711614298435313, + "ewc_loss": 0.06677491962909698, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032595236552879214, + "grad_norm": 7.802132606506348, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8542139530181885, + "num_tokens": 501034135.0, + "step": 13137 + }, + { + "epoch": 1.6712886401221219, + "ewc_loss": 0.06699327379465103, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032813585130497813, + "grad_norm": 7.795229911804199, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8649553060531616, + "num_tokens": 501069810.0, + "step": 13138 + }, + { + "epoch": 1.6714158504007124, + "ewc_loss": 0.06675240397453308, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003257271309848875, + "grad_norm": 7.802858352661133, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8573185801506042, + "num_tokens": 501110597.0, + "step": 13139 + }, + { + "epoch": 1.671543060679303, + "ewc_loss": 0.06679501384496689, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000326153269270435, + "grad_norm": 7.788849830627441, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8511131405830383, + "num_tokens": 501145927.0, + "step": 13140 + }, + { + "epoch": 1.6716702709578934, + "ewc_loss": 0.06687986105680466, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032700173323974013, + "grad_norm": 7.826790809631348, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8436053991317749, + "num_tokens": 501188473.0, + "step": 13141 + }, + { + "epoch": 1.6717974812364838, + "ewc_loss": 0.06677527725696564, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003259559452999383, + "grad_norm": 7.772014141082764, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8759901523590088, + "num_tokens": 501226143.0, + "step": 13142 + }, + { + "epoch": 1.6719246915150743, + "ewc_loss": 0.06693513691425323, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003275544731877744, + "grad_norm": 7.836230278015137, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8532834053039551, + "num_tokens": 501266980.0, + "step": 13143 + }, + { + "epoch": 1.6720519017936648, + "ewc_loss": 0.06668451428413391, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032504828413948417, + "grad_norm": 7.754207134246826, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8596318960189819, + "num_tokens": 501305414.0, + "step": 13144 + }, + { + "epoch": 1.6721791120722553, + "ewc_loss": 0.06685115396976471, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003267146530561149, + "grad_norm": 7.803031921386719, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8543131947517395, + "num_tokens": 501343009.0, + "step": 13145 + }, + { + "epoch": 1.6723063223508459, + "ewc_loss": 0.06667265295982361, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000324929686030373, + "grad_norm": 7.784090995788574, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8685811758041382, + "num_tokens": 501383571.0, + "step": 13146 + }, + { + "epoch": 1.6724335326294364, + "ewc_loss": 0.06676626205444336, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003258657525293529, + "grad_norm": 7.806573867797852, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8652017116546631, + "num_tokens": 501425444.0, + "step": 13147 + }, + { + "epoch": 1.672560742908027, + "ewc_loss": 0.06676687300205231, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003258718934375793, + "grad_norm": 8.036974906921387, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8756139278411865, + "num_tokens": 501466536.0, + "step": 13148 + }, + { + "epoch": 1.6726879531866174, + "ewc_loss": 0.06633926928043365, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032159584225155413, + "grad_norm": 7.740623474121094, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8656876683235168, + "num_tokens": 501507346.0, + "step": 13149 + }, + { + "epoch": 1.672815163465208, + "ewc_loss": 0.0670209527015686, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032841262873262167, + "grad_norm": 7.883315563201904, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8694802522659302, + "num_tokens": 501545515.0, + "step": 13150 + }, + { + "epoch": 1.6729423737437985, + "ewc_loss": 0.06639809906482697, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003221841179765761, + "grad_norm": 7.673964977264404, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8555672764778137, + "num_tokens": 501588793.0, + "step": 13151 + }, + { + "epoch": 1.673069584022389, + "ewc_loss": 0.0671280026435852, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032948318403214216, + "grad_norm": 7.889004230499268, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8610259890556335, + "num_tokens": 501624384.0, + "step": 13152 + }, + { + "epoch": 1.6731967943009796, + "ewc_loss": 0.06644202768802643, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032262341119349003, + "grad_norm": 7.726813316345215, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8624178171157837, + "num_tokens": 501663150.0, + "step": 13153 + }, + { + "epoch": 1.67332400457957, + "ewc_loss": 0.06711354851722717, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032933856709860265, + "grad_norm": 7.8390374183654785, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8551152348518372, + "num_tokens": 501706177.0, + "step": 13154 + }, + { + "epoch": 1.6734512148581606, + "ewc_loss": 0.06666739284992218, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032487709540873766, + "grad_norm": 7.83769416809082, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.873604416847229, + "num_tokens": 501745267.0, + "step": 13155 + }, + { + "epoch": 1.6735784251367511, + "ewc_loss": 0.06690233945846558, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032722647301852703, + "grad_norm": 7.757449150085449, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8573358058929443, + "num_tokens": 501783591.0, + "step": 13156 + }, + { + "epoch": 1.6737056354153417, + "ewc_loss": 0.06701675802469254, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000328370719216764, + "grad_norm": 7.9338698387146, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8529039621353149, + "num_tokens": 501825190.0, + "step": 13157 + }, + { + "epoch": 1.6738328456939322, + "ewc_loss": 0.06634099781513214, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032405450474470854, + "grad_norm": 7.777392864227295, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.862799882888794, + "num_tokens": 501855813.0, + "step": 13158 + }, + { + "epoch": 1.6739600559725227, + "ewc_loss": 0.06694492697715759, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003300938114989549, + "grad_norm": 8.024663925170898, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8590516448020935, + "num_tokens": 501890106.0, + "step": 13159 + }, + { + "epoch": 1.674087266251113, + "ewc_loss": 0.06612507998943329, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032189529156312346, + "grad_norm": 7.607198238372803, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8483403921127319, + "num_tokens": 501935437.0, + "step": 13160 + }, + { + "epoch": 1.6742144765297036, + "ewc_loss": 0.06711690127849579, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00033181352773681283, + "grad_norm": 7.956592082977295, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8510306477546692, + "num_tokens": 501980134.0, + "step": 13161 + }, + { + "epoch": 1.674341686808294, + "ewc_loss": 0.06647757440805435, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003229788853786886, + "grad_norm": 7.716034412384033, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8578824996948242, + "num_tokens": 502015977.0, + "step": 13162 + }, + { + "epoch": 1.6744688970868846, + "ewc_loss": 0.06731248646974564, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033132798853330314, + "grad_norm": 7.861461639404297, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8639819622039795, + "num_tokens": 502061717.0, + "step": 13163 + }, + { + "epoch": 1.6745961073654752, + "ewc_loss": 0.06645108014345169, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.00032515532802790403, + "grad_norm": 7.785467624664307, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8603336811065674, + "num_tokens": 502101594.0, + "step": 13164 + }, + { + "epoch": 1.6747233176440657, + "ewc_loss": 0.06708946824073792, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000329097849316895, + "grad_norm": 7.85693359375, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8625180721282959, + "num_tokens": 502136521.0, + "step": 13165 + }, + { + "epoch": 1.674850527922656, + "ewc_loss": 0.06666657328605652, + "ewc_loss_diag": 3.3855438232421875e-05, + "ewc_loss_parallel": 0.0003273102920502424, + "grad_norm": 7.745312213897705, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8423546552658081, + "num_tokens": 502174821.0, + "step": 13166 + }, + { + "epoch": 1.6749777382012465, + "ewc_loss": 0.06702962517738342, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032849941635504365, + "grad_norm": 7.906707286834717, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8566451072692871, + "num_tokens": 502210615.0, + "step": 13167 + }, + { + "epoch": 1.675104948479837, + "ewc_loss": 0.06686705350875854, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003268736763857305, + "grad_norm": 7.744755268096924, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8629895448684692, + "num_tokens": 502248220.0, + "step": 13168 + }, + { + "epoch": 1.6752321587584276, + "ewc_loss": 0.0670754611492157, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032895777258090675, + "grad_norm": 7.815268039703369, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8614294528961182, + "num_tokens": 502290686.0, + "step": 13169 + }, + { + "epoch": 1.675359369037018, + "ewc_loss": 0.06700560450553894, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003282591642346233, + "grad_norm": 7.815472602844238, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8496597409248352, + "num_tokens": 502331789.0, + "step": 13170 + }, + { + "epoch": 1.6754865793156086, + "ewc_loss": 0.06708873808383942, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003290905151516199, + "grad_norm": 7.793513774871826, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8691385388374329, + "num_tokens": 502371490.0, + "step": 13171 + }, + { + "epoch": 1.6756137895941992, + "ewc_loss": 0.06700532138347626, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000328256341163069, + "grad_norm": 7.788378715515137, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8609433770179749, + "num_tokens": 502415606.0, + "step": 13172 + }, + { + "epoch": 1.6757409998727897, + "ewc_loss": 0.06715098023414612, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003297129587735981, + "grad_norm": 7.82736349105835, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8616926074028015, + "num_tokens": 502456203.0, + "step": 13173 + }, + { + "epoch": 1.6758682101513802, + "ewc_loss": 0.06705285608768463, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003287317231297493, + "grad_norm": 7.820023059844971, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.878473162651062, + "num_tokens": 502491508.0, + "step": 13174 + }, + { + "epoch": 1.6759954204299707, + "ewc_loss": 0.06714992225170135, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003297023067716509, + "grad_norm": 7.792670726776123, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8510265946388245, + "num_tokens": 502533408.0, + "step": 13175 + }, + { + "epoch": 1.6761226307085613, + "ewc_loss": 0.06707777082920074, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032898085191845894, + "grad_norm": 7.866954803466797, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8547549247741699, + "num_tokens": 502569824.0, + "step": 13176 + }, + { + "epoch": 1.6762498409871518, + "ewc_loss": 0.06696167588233948, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003278198710177094, + "grad_norm": 7.791476249694824, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8700814247131348, + "num_tokens": 502604776.0, + "step": 13177 + }, + { + "epoch": 1.6763770512657423, + "ewc_loss": 0.06705886870622635, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000328791793435812, + "grad_norm": 7.812568187713623, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8443605899810791, + "num_tokens": 502645732.0, + "step": 13178 + }, + { + "epoch": 1.6765042615443329, + "ewc_loss": 0.0670630931854248, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003288340521976352, + "grad_norm": 7.81980037689209, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8654286861419678, + "num_tokens": 502686044.0, + "step": 13179 + }, + { + "epoch": 1.6766314718229234, + "ewc_loss": 0.06707048416137695, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032890791771933436, + "grad_norm": 7.789482116699219, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8532273173332214, + "num_tokens": 502728288.0, + "step": 13180 + }, + { + "epoch": 1.676758682101514, + "ewc_loss": 0.06767922639846802, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033011252526193857, + "grad_norm": 7.956650257110596, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8537448644638062, + "num_tokens": 502760772.0, + "step": 13181 + }, + { + "epoch": 1.6768858923801044, + "ewc_loss": 0.06708641350269318, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032662582816556096, + "grad_norm": 7.725873947143555, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8620826005935669, + "num_tokens": 502791585.0, + "step": 13182 + }, + { + "epoch": 1.677013102658695, + "ewc_loss": 0.06736335158348083, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003318366943858564, + "grad_norm": 7.893989086151123, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8791263103485107, + "num_tokens": 502826339.0, + "step": 13183 + }, + { + "epoch": 1.6771403129372855, + "ewc_loss": 0.0671563595533371, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032732528052292764, + "grad_norm": 7.740658760070801, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8788818120956421, + "num_tokens": 502868716.0, + "step": 13184 + }, + { + "epoch": 1.6772675232158758, + "ewc_loss": 0.06742775440216064, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003300392418168485, + "grad_norm": 7.877102851867676, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8620641231536865, + "num_tokens": 502907992.0, + "step": 13185 + }, + { + "epoch": 1.6773947334944663, + "ewc_loss": 0.06716969609260559, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003274586342740804, + "grad_norm": 7.8302388191223145, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8555414080619812, + "num_tokens": 502947197.0, + "step": 13186 + }, + { + "epoch": 1.6775219437730569, + "ewc_loss": 0.0673394575715065, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032915628980845213, + "grad_norm": 7.836910724639893, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8717389106750488, + "num_tokens": 502985890.0, + "step": 13187 + }, + { + "epoch": 1.6776491540516474, + "ewc_loss": 0.0672873854637146, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032863562228158116, + "grad_norm": 7.77878999710083, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8557811379432678, + "num_tokens": 503033476.0, + "step": 13188 + }, + { + "epoch": 1.677776364330238, + "ewc_loss": 0.06739936023950577, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003297553339507431, + "grad_norm": 7.992249011993408, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8701208829879761, + "num_tokens": 503068002.0, + "step": 13189 + }, + { + "epoch": 1.6779035746088284, + "ewc_loss": 0.06704821437597275, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003262438694946468, + "grad_norm": 7.89695405960083, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8541457056999207, + "num_tokens": 503110059.0, + "step": 13190 + }, + { + "epoch": 1.6780307848874187, + "ewc_loss": 0.06699122488498688, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032811533310450613, + "grad_norm": 7.813819885253906, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8557710647583008, + "num_tokens": 503146617.0, + "step": 13191 + }, + { + "epoch": 1.6781579951660093, + "ewc_loss": 0.0669158324599266, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032736145658418536, + "grad_norm": 7.8290605545043945, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8585668206214905, + "num_tokens": 503186436.0, + "step": 13192 + }, + { + "epoch": 1.6782852054445998, + "ewc_loss": 0.06687811017036438, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003269841836299747, + "grad_norm": 7.782674312591553, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8667219281196594, + "num_tokens": 503227983.0, + "step": 13193 + }, + { + "epoch": 1.6784124157231903, + "ewc_loss": 0.0669732540845871, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003279356169514358, + "grad_norm": 7.840237140655518, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8484717607498169, + "num_tokens": 503269301.0, + "step": 13194 + }, + { + "epoch": 1.6785396260017809, + "ewc_loss": 0.06683699041604996, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032657303381711245, + "grad_norm": 7.827988624572754, + "learning_rate": 1e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8440966606140137, + "num_tokens": 503308919.0, + "step": 13195 + }, + { + "epoch": 1.6786668362803714, + "ewc_loss": 0.06706159561872482, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003288190928287804, + "grad_norm": 7.835122585296631, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8709676265716553, + "num_tokens": 503345030.0, + "step": 13196 + }, + { + "epoch": 1.678794046558962, + "ewc_loss": 0.06692460924386978, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032744923373684287, + "grad_norm": 7.869167327880859, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8549686670303345, + "num_tokens": 503379142.0, + "step": 13197 + }, + { + "epoch": 1.6789212568375524, + "ewc_loss": 0.06693051755428314, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032750831451267004, + "grad_norm": 7.783122539520264, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8616769909858704, + "num_tokens": 503411810.0, + "step": 13198 + }, + { + "epoch": 1.679048467116143, + "ewc_loss": 0.06710660457611084, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003292691253591329, + "grad_norm": 7.867033958435059, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8591616153717041, + "num_tokens": 503450266.0, + "step": 13199 + }, + { + "epoch": 1.6791756773947335, + "ewc_loss": 0.0668773278594017, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003269764129072428, + "grad_norm": 7.828315258026123, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8685404062271118, + "num_tokens": 503491015.0, + "step": 13200 + }, + { + "epoch": 1.679302887673324, + "ewc_loss": 0.06710435450077057, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003292467154096812, + "grad_norm": 7.830462455749512, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8702384829521179, + "num_tokens": 503531964.0, + "step": 13201 + }, + { + "epoch": 1.6794300979519146, + "ewc_loss": 0.06728167831897736, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032857852056622505, + "grad_norm": 7.773394584655762, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8651818037033081, + "num_tokens": 503576069.0, + "step": 13202 + }, + { + "epoch": 1.679557308230505, + "ewc_loss": 0.06734995543956757, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003292612382210791, + "grad_norm": 7.871619701385498, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8630096912384033, + "num_tokens": 503607895.0, + "step": 13203 + }, + { + "epoch": 1.6796845185090956, + "ewc_loss": 0.06730645895004272, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003288263105787337, + "grad_norm": 7.892388820648193, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8570376634597778, + "num_tokens": 503636891.0, + "step": 13204 + }, + { + "epoch": 1.6798117287876861, + "ewc_loss": 0.06720530986785889, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003278148651588708, + "grad_norm": 7.765069007873535, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.876889705657959, + "num_tokens": 503682037.0, + "step": 13205 + }, + { + "epoch": 1.6799389390662767, + "ewc_loss": 0.06745495647192001, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003303112753201276, + "grad_norm": 7.8446149826049805, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8649007081985474, + "num_tokens": 503724008.0, + "step": 13206 + }, + { + "epoch": 1.6800661493448672, + "ewc_loss": 0.06724946200847626, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032825631205923855, + "grad_norm": 7.838178634643555, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8699970245361328, + "num_tokens": 503759008.0, + "step": 13207 + }, + { + "epoch": 1.6801933596234577, + "ewc_loss": 0.06729726493358612, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003287344006821513, + "grad_norm": 7.892343521118164, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8580909967422485, + "num_tokens": 503796599.0, + "step": 13208 + }, + { + "epoch": 1.680320569902048, + "ewc_loss": 0.067114919424057, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003269108710810542, + "grad_norm": 7.800323009490967, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8591228723526001, + "num_tokens": 503835539.0, + "step": 13209 + }, + { + "epoch": 1.6804477801806386, + "ewc_loss": 0.06727181375026703, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032847982947714627, + "grad_norm": 7.895946025848389, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8589481115341187, + "num_tokens": 503875933.0, + "step": 13210 + }, + { + "epoch": 1.680574990459229, + "ewc_loss": 0.06714467704296112, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032720848685130477, + "grad_norm": 7.768174648284912, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8597604036331177, + "num_tokens": 503913740.0, + "step": 13211 + }, + { + "epoch": 1.6807022007378196, + "ewc_loss": 0.06747640669345856, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003305257414467633, + "grad_norm": 7.8500075340271, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8654695153236389, + "num_tokens": 503955105.0, + "step": 13212 + }, + { + "epoch": 1.6808294110164101, + "ewc_loss": 0.06712502241134644, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032701189047656953, + "grad_norm": 7.883905410766602, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8623076677322388, + "num_tokens": 503987641.0, + "step": 13213 + }, + { + "epoch": 1.6809566212950007, + "ewc_loss": 0.06706752628087997, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003288783773314208, + "grad_norm": 7.89640998840332, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8536462783813477, + "num_tokens": 504028699.0, + "step": 13214 + }, + { + "epoch": 1.681083831573591, + "ewc_loss": 0.06721539795398712, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032791568082757294, + "grad_norm": 7.818146705627441, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8554826974868774, + "num_tokens": 504065597.0, + "step": 13215 + }, + { + "epoch": 1.6812110418521815, + "ewc_loss": 0.06699559092521667, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032815898885019124, + "grad_norm": 7.848055839538574, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8477970361709595, + "num_tokens": 504105968.0, + "step": 13216 + }, + { + "epoch": 1.681338252130772, + "ewc_loss": 0.06729736924171448, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032873544842004776, + "grad_norm": 7.887617111206055, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.865454912185669, + "num_tokens": 504143743.0, + "step": 13217 + }, + { + "epoch": 1.6814654624093626, + "ewc_loss": 0.06690694391727448, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032727260258980095, + "grad_norm": 7.896265506744385, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8649705648422241, + "num_tokens": 504177023.0, + "step": 13218 + }, + { + "epoch": 1.681592672687953, + "ewc_loss": 0.06711021065711975, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032686383929103613, + "grad_norm": 7.874050617218018, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8608832955360413, + "num_tokens": 504210492.0, + "step": 13219 + }, + { + "epoch": 1.6817198829665436, + "ewc_loss": 0.06708633899688721, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032662512967363, + "grad_norm": 7.7965617179870605, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8615214824676514, + "num_tokens": 504248322.0, + "step": 13220 + }, + { + "epoch": 1.6818470932451342, + "ewc_loss": 0.06718315929174423, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003275932976976037, + "grad_norm": 7.840190887451172, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8473928570747375, + "num_tokens": 504285915.0, + "step": 13221 + }, + { + "epoch": 1.6819743035237247, + "ewc_loss": 0.06711816042661667, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032694332185201347, + "grad_norm": 7.862639904022217, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8599305748939514, + "num_tokens": 504325578.0, + "step": 13222 + }, + { + "epoch": 1.6821015138023152, + "ewc_loss": 0.06691667437553406, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032736986759118736, + "grad_norm": 7.8465657234191895, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.881966233253479, + "num_tokens": 504362876.0, + "step": 13223 + }, + { + "epoch": 1.6822287240809057, + "ewc_loss": 0.06707213819026947, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003264830738771707, + "grad_norm": 7.859365940093994, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8600420951843262, + "num_tokens": 504402303.0, + "step": 13224 + }, + { + "epoch": 1.6823559343594963, + "ewc_loss": 0.06703203916549683, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032608211040496826, + "grad_norm": 7.856870174407959, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8624058961868286, + "num_tokens": 504440039.0, + "step": 13225 + }, + { + "epoch": 1.6824831446380868, + "ewc_loss": 0.06713088601827621, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032707059290260077, + "grad_norm": 7.839775085449219, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.852213978767395, + "num_tokens": 504485102.0, + "step": 13226 + }, + { + "epoch": 1.6826103549166773, + "ewc_loss": 0.06693176180124283, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003250793379265815, + "grad_norm": 7.82058572769165, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8604299426078796, + "num_tokens": 504522762.0, + "step": 13227 + }, + { + "epoch": 1.6827375651952678, + "ewc_loss": 0.06713651865720749, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032712690881453454, + "grad_norm": 7.830798149108887, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8667615056037903, + "num_tokens": 504562818.0, + "step": 13228 + }, + { + "epoch": 1.6828647754738584, + "ewc_loss": 0.06698719412088394, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032563364948146045, + "grad_norm": 7.81488561630249, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8527170419692993, + "num_tokens": 504602914.0, + "step": 13229 + }, + { + "epoch": 1.682991985752449, + "ewc_loss": 0.06709297001361847, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032669139909558, + "grad_norm": 7.8374481201171875, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8678659200668335, + "num_tokens": 504641376.0, + "step": 13230 + }, + { + "epoch": 1.6831191960310394, + "ewc_loss": 0.06680956482887268, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003262988175265491, + "grad_norm": 7.840911388397217, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8690922260284424, + "num_tokens": 504675031.0, + "step": 13231 + }, + { + "epoch": 1.68324640630963, + "ewc_loss": 0.06708069145679474, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032656858093105257, + "grad_norm": 7.862872123718262, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8679953217506409, + "num_tokens": 504709624.0, + "step": 13232 + }, + { + "epoch": 1.6833736165882205, + "ewc_loss": 0.06673486530780792, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003255517513025552, + "grad_norm": 7.813261032104492, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.865271806716919, + "num_tokens": 504749310.0, + "step": 13233 + }, + { + "epoch": 1.6835008268668108, + "ewc_loss": 0.06681272387504578, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032633039518259466, + "grad_norm": 7.754876136779785, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8635099530220032, + "num_tokens": 504791700.0, + "step": 13234 + }, + { + "epoch": 1.6836280371454013, + "ewc_loss": 0.06696314364671707, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032783456845209, + "grad_norm": 7.859400749206543, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8514667749404907, + "num_tokens": 504833246.0, + "step": 13235 + }, + { + "epoch": 1.6837552474239919, + "ewc_loss": 0.06672200560569763, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032542317057959735, + "grad_norm": 7.795393943786621, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8655552864074707, + "num_tokens": 504872901.0, + "step": 13236 + }, + { + "epoch": 1.6838824577025824, + "ewc_loss": 0.06721317768096924, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032789353281259537, + "grad_norm": 7.862943172454834, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8550251722335815, + "num_tokens": 504906138.0, + "step": 13237 + }, + { + "epoch": 1.684009667981173, + "ewc_loss": 0.06679090857505798, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003261122037656605, + "grad_norm": 7.7973313331604, + "learning_rate": 1e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.8412823677062988, + "num_tokens": 504947147.0, + "step": 13238 + }, + { + "epoch": 1.6841368782597634, + "ewc_loss": 0.06697525829076767, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032795569859445095, + "grad_norm": 7.823345184326172, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8638967275619507, + "num_tokens": 504985332.0, + "step": 13239 + }, + { + "epoch": 1.6842640885383537, + "ewc_loss": 0.0669332891702652, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003275360504630953, + "grad_norm": 7.816239833831787, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8538691997528076, + "num_tokens": 505027995.0, + "step": 13240 + }, + { + "epoch": 1.6843912988169443, + "ewc_loss": 0.06725741922855377, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003283359110355377, + "grad_norm": 7.841128349304199, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.864510715007782, + "num_tokens": 505065311.0, + "step": 13241 + }, + { + "epoch": 1.6845185090955348, + "ewc_loss": 0.0668497160077095, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003267002757638693, + "grad_norm": 7.774538040161133, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.857435405254364, + "num_tokens": 505105810.0, + "step": 13242 + }, + { + "epoch": 1.6846457193741253, + "ewc_loss": 0.06743143498897552, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033007608726620674, + "grad_norm": 7.8380584716796875, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8688973784446716, + "num_tokens": 505141003.0, + "step": 13243 + }, + { + "epoch": 1.6847729296527159, + "ewc_loss": 0.0668720155954361, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003269232693128288, + "grad_norm": 7.728324890136719, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8523339629173279, + "num_tokens": 505188025.0, + "step": 13244 + }, + { + "epoch": 1.6849001399313064, + "ewc_loss": 0.06734126061201096, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033161573810502887, + "grad_norm": 7.867138385772705, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8761841654777527, + "num_tokens": 505226101.0, + "step": 13245 + }, + { + "epoch": 1.685027350209897, + "ewc_loss": 0.06693390011787415, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003275421040598303, + "grad_norm": 7.770802021026611, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.852512001991272, + "num_tokens": 505264880.0, + "step": 13246 + }, + { + "epoch": 1.6851545604884874, + "ewc_loss": 0.06759476661682129, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003317093360237777, + "grad_norm": 7.92115592956543, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.85454261302948, + "num_tokens": 505306208.0, + "step": 13247 + }, + { + "epoch": 1.685281770767078, + "ewc_loss": 0.06695764511823654, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032777959131635725, + "grad_norm": 7.760766983032227, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8838920593261719, + "num_tokens": 505346024.0, + "step": 13248 + }, + { + "epoch": 1.6854089810456685, + "ewc_loss": 0.06745918095111847, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033279487979598343, + "grad_norm": 7.900102615356445, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8843294978141785, + "num_tokens": 505383644.0, + "step": 13249 + }, + { + "epoch": 1.685536191324259, + "ewc_loss": 0.06699280440807343, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032813113648444414, + "grad_norm": 7.874142646789551, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8664308190345764, + "num_tokens": 505420365.0, + "step": 13250 + }, + { + "epoch": 1.6856634016028496, + "ewc_loss": 0.06717441976070404, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032994727371260524, + "grad_norm": 7.833734512329102, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8623796701431274, + "num_tokens": 505458211.0, + "step": 13251 + }, + { + "epoch": 1.68579061188144, + "ewc_loss": 0.06713870167732239, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032959014060907066, + "grad_norm": 7.874687194824219, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8566495180130005, + "num_tokens": 505501649.0, + "step": 13252 + }, + { + "epoch": 1.6859178221600306, + "ewc_loss": 0.06704317033290863, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003286348655819893, + "grad_norm": 7.810075283050537, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8650966882705688, + "num_tokens": 505538763.0, + "step": 13253 + }, + { + "epoch": 1.6860450324386211, + "ewc_loss": 0.06727217882871628, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033092492958530784, + "grad_norm": 7.872400283813477, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8839475512504578, + "num_tokens": 505574562.0, + "step": 13254 + }, + { + "epoch": 1.6861722427172117, + "ewc_loss": 0.06696265190839767, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032782964990474284, + "grad_norm": 7.791655540466309, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8760302662849426, + "num_tokens": 505609718.0, + "step": 13255 + }, + { + "epoch": 1.6862994529958022, + "ewc_loss": 0.067156583070755, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003297689254395664, + "grad_norm": 7.88846492767334, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8760074377059937, + "num_tokens": 505644337.0, + "step": 13256 + }, + { + "epoch": 1.6864266632743927, + "ewc_loss": 0.06693169474601746, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032752007246017456, + "grad_norm": 7.826581954956055, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8723099827766418, + "num_tokens": 505681929.0, + "step": 13257 + }, + { + "epoch": 1.686553873552983, + "ewc_loss": 0.06716059148311615, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032980908872559667, + "grad_norm": 7.884169101715088, + "learning_rate": 1e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.84026700258255, + "num_tokens": 505719373.0, + "step": 13258 + }, + { + "epoch": 1.6866810838315736, + "ewc_loss": 0.066982701420784, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032803008798509836, + "grad_norm": 7.822410583496094, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8470954298973083, + "num_tokens": 505757767.0, + "step": 13259 + }, + { + "epoch": 1.686808294110164, + "ewc_loss": 0.06720027327537537, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003302058612462133, + "grad_norm": 7.969710826873779, + "learning_rate": 1e-06, + "loss": 0.5987, + "mean_token_accuracy": 0.8320069909095764, + "num_tokens": 505794777.0, + "step": 13260 + }, + { + "epoch": 1.6869355043887546, + "ewc_loss": 0.06686054170131683, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032680854201316833, + "grad_norm": 7.770026206970215, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8695814609527588, + "num_tokens": 505832672.0, + "step": 13261 + }, + { + "epoch": 1.6870627146673451, + "ewc_loss": 0.06735877692699432, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003317908849567175, + "grad_norm": 7.892562389373779, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8509067296981812, + "num_tokens": 505873142.0, + "step": 13262 + }, + { + "epoch": 1.6871899249459357, + "ewc_loss": 0.06680567562580109, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032625984749756753, + "grad_norm": 7.769737720489502, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8660249710083008, + "num_tokens": 505911994.0, + "step": 13263 + }, + { + "epoch": 1.687317135224526, + "ewc_loss": 0.06726372241973877, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000330840382957831, + "grad_norm": 7.899068832397461, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8664168119430542, + "num_tokens": 505948874.0, + "step": 13264 + }, + { + "epoch": 1.6874443455031165, + "ewc_loss": 0.06701891869306564, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032839231425896287, + "grad_norm": 7.81766414642334, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8674081563949585, + "num_tokens": 505989679.0, + "step": 13265 + }, + { + "epoch": 1.687571555781707, + "ewc_loss": 0.06748245656490326, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003305862483102828, + "grad_norm": 7.881629467010498, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8699440360069275, + "num_tokens": 506033209.0, + "step": 13266 + }, + { + "epoch": 1.6876987660602976, + "ewc_loss": 0.06687255203723907, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003269286244176328, + "grad_norm": 7.844073295593262, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8711588978767395, + "num_tokens": 506065379.0, + "step": 13267 + }, + { + "epoch": 1.687825976338888, + "ewc_loss": 0.06716504693031311, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003298536466900259, + "grad_norm": 7.884395122528076, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8539597988128662, + "num_tokens": 506099484.0, + "step": 13268 + }, + { + "epoch": 1.6879531866174786, + "ewc_loss": 0.06718732416629791, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003276350034866482, + "grad_norm": 7.881227970123291, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8466581702232361, + "num_tokens": 506136559.0, + "step": 13269 + }, + { + "epoch": 1.6880803968960691, + "ewc_loss": 0.06694142520427704, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003276173665653914, + "grad_norm": 7.827789783477783, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.860289454460144, + "num_tokens": 506172997.0, + "step": 13270 + }, + { + "epoch": 1.6882076071746597, + "ewc_loss": 0.06706906855106354, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032889386056922376, + "grad_norm": 7.833115100860596, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8765987157821655, + "num_tokens": 506209858.0, + "step": 13271 + }, + { + "epoch": 1.6883348174532502, + "ewc_loss": 0.06692285090684891, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032743162591941655, + "grad_norm": 7.8168625831604, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8739041090011597, + "num_tokens": 506242260.0, + "step": 13272 + }, + { + "epoch": 1.6884620277318407, + "ewc_loss": 0.06707429885864258, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032894607284106314, + "grad_norm": 7.914524555206299, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8492108583450317, + "num_tokens": 506276856.0, + "step": 13273 + }, + { + "epoch": 1.6885892380104313, + "ewc_loss": 0.06707413494586945, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.000326503039104864, + "grad_norm": 7.797522068023682, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8808858394622803, + "num_tokens": 506311469.0, + "step": 13274 + }, + { + "epoch": 1.6887164482890218, + "ewc_loss": 0.06715359538793564, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032973906490951777, + "grad_norm": 7.873161792755127, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8716753721237183, + "num_tokens": 506348240.0, + "step": 13275 + }, + { + "epoch": 1.6888436585676123, + "ewc_loss": 0.06714396923780441, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003272014146205038, + "grad_norm": 7.7628326416015625, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8642215728759766, + "num_tokens": 506387203.0, + "step": 13276 + }, + { + "epoch": 1.6889708688462028, + "ewc_loss": 0.06756343692541122, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003313960914965719, + "grad_norm": 7.9275712966918945, + "learning_rate": 1e-06, + "loss": 0.55, + "mean_token_accuracy": 0.839626669883728, + "num_tokens": 506421869.0, + "step": 13277 + }, + { + "epoch": 1.6890980791247934, + "ewc_loss": 0.06682939827442169, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003264971310272813, + "grad_norm": 7.803171157836914, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8630239963531494, + "num_tokens": 506458873.0, + "step": 13278 + }, + { + "epoch": 1.689225289403384, + "ewc_loss": 0.06759051978588104, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033166687353514135, + "grad_norm": 7.908596515655518, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8502429127693176, + "num_tokens": 506495947.0, + "step": 13279 + }, + { + "epoch": 1.6893524996819744, + "ewc_loss": 0.06721064448356628, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003278681542724371, + "grad_norm": 7.864293098449707, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8451342582702637, + "num_tokens": 506533098.0, + "step": 13280 + }, + { + "epoch": 1.689479709960565, + "ewc_loss": 0.06734414398670197, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032920317607931793, + "grad_norm": 7.854737758636475, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8735390901565552, + "num_tokens": 506571346.0, + "step": 13281 + }, + { + "epoch": 1.6896069202391555, + "ewc_loss": 0.06690897047519684, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003272928006481379, + "grad_norm": 7.796002388000488, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8645116686820984, + "num_tokens": 506607912.0, + "step": 13282 + }, + { + "epoch": 1.6897341305177458, + "ewc_loss": 0.06700758635997772, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003282790130469948, + "grad_norm": 7.817296981811523, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8522812128067017, + "num_tokens": 506654258.0, + "step": 13283 + }, + { + "epoch": 1.6898613407963363, + "ewc_loss": 0.06696222722530365, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003278254298493266, + "grad_norm": 7.781100273132324, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8692301511764526, + "num_tokens": 506693436.0, + "step": 13284 + }, + { + "epoch": 1.6899885510749268, + "ewc_loss": 0.06705687195062637, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003287718282081187, + "grad_norm": 7.84055233001709, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8650470972061157, + "num_tokens": 506737383.0, + "step": 13285 + }, + { + "epoch": 1.6901157613535174, + "ewc_loss": 0.06718533486127853, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032761506736278534, + "grad_norm": 7.826590061187744, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8713075518608093, + "num_tokens": 506772282.0, + "step": 13286 + }, + { + "epoch": 1.690242971632108, + "ewc_loss": 0.06747011840343475, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003280215314589441, + "grad_norm": 7.834916591644287, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.873725175857544, + "num_tokens": 506807208.0, + "step": 13287 + }, + { + "epoch": 1.6903701819106984, + "ewc_loss": 0.06711682677268982, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032693002140149474, + "grad_norm": 7.781419277191162, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8696036338806152, + "num_tokens": 506847952.0, + "step": 13288 + }, + { + "epoch": 1.6904973921892887, + "ewc_loss": 0.0672692134976387, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032845386886037886, + "grad_norm": 7.876700401306152, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8652254939079285, + "num_tokens": 506888265.0, + "step": 13289 + }, + { + "epoch": 1.6906246024678793, + "ewc_loss": 0.06682121753692627, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032641534926369786, + "grad_norm": 7.769070148468018, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8662289381027222, + "num_tokens": 506926096.0, + "step": 13290 + }, + { + "epoch": 1.6907518127464698, + "ewc_loss": 0.06699812412261963, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003281843673903495, + "grad_norm": 7.872777462005615, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8550620079040527, + "num_tokens": 506962252.0, + "step": 13291 + }, + { + "epoch": 1.6908790230250603, + "ewc_loss": 0.0667937695980072, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003261408128309995, + "grad_norm": 7.794168949127197, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8675568103790283, + "num_tokens": 507004709.0, + "step": 13292 + }, + { + "epoch": 1.6910062333036509, + "ewc_loss": 0.06702730059623718, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003284761623945087, + "grad_norm": 7.895858287811279, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8785150051116943, + "num_tokens": 507041041.0, + "step": 13293 + }, + { + "epoch": 1.6911334435822414, + "ewc_loss": 0.06702391803264618, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032600085251033306, + "grad_norm": 7.839367866516113, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8567103147506714, + "num_tokens": 507080251.0, + "step": 13294 + }, + { + "epoch": 1.691260653860832, + "ewc_loss": 0.06696279346942902, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003278311050962657, + "grad_norm": 7.800180435180664, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.862435519695282, + "num_tokens": 507120813.0, + "step": 13295 + }, + { + "epoch": 1.6913878641394224, + "ewc_loss": 0.06715715676546097, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032733328407630324, + "grad_norm": 8.559770584106445, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8502953052520752, + "num_tokens": 507160284.0, + "step": 13296 + }, + { + "epoch": 1.691515074418013, + "ewc_loss": 0.06634987890720367, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003217019548173994, + "grad_norm": 7.646123886108398, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8548815846443176, + "num_tokens": 507205069.0, + "step": 13297 + }, + { + "epoch": 1.6916422846966035, + "ewc_loss": 0.06780650466680527, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033626818913035095, + "grad_norm": 8.053330421447754, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.871738076210022, + "num_tokens": 507244689.0, + "step": 13298 + }, + { + "epoch": 1.691769494975194, + "ewc_loss": 0.06638924032449722, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003220955259166658, + "grad_norm": 7.690750598907471, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8635225892066956, + "num_tokens": 507283917.0, + "step": 13299 + }, + { + "epoch": 1.6918967052537845, + "ewc_loss": 0.06779980659484863, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033620119211263955, + "grad_norm": 7.983936786651611, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8651173710823059, + "num_tokens": 507327937.0, + "step": 13300 + }, + { + "epoch": 1.692023915532375, + "ewc_loss": 0.0667174756526947, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032537782681174576, + "grad_norm": 7.696079254150391, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8756904602050781, + "num_tokens": 507369521.0, + "step": 13301 + }, + { + "epoch": 1.6921511258109656, + "ewc_loss": 0.06765348464250565, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003347379679325968, + "grad_norm": 7.989800453186035, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8728476762771606, + "num_tokens": 507408157.0, + "step": 13302 + }, + { + "epoch": 1.6922783360895561, + "ewc_loss": 0.06692758202552795, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032747891964390874, + "grad_norm": 7.775387287139893, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8693524599075317, + "num_tokens": 507448003.0, + "step": 13303 + }, + { + "epoch": 1.6924055463681467, + "ewc_loss": 0.06758207082748413, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000334023789037019, + "grad_norm": 7.9277167320251465, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8713928461074829, + "num_tokens": 507489970.0, + "step": 13304 + }, + { + "epoch": 1.6925327566467372, + "ewc_loss": 0.0670231282711029, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032843442750163376, + "grad_norm": 7.84829568862915, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8648003339767456, + "num_tokens": 507524536.0, + "step": 13305 + }, + { + "epoch": 1.6926599669253277, + "ewc_loss": 0.06743124127388, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033251551212742925, + "grad_norm": 7.917232513427734, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8840310573577881, + "num_tokens": 507561575.0, + "step": 13306 + }, + { + "epoch": 1.692787177203918, + "ewc_loss": 0.06708688288927078, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003290719469077885, + "grad_norm": 7.873452186584473, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8645793199539185, + "num_tokens": 507597212.0, + "step": 13307 + }, + { + "epoch": 1.6929143874825086, + "ewc_loss": 0.06716230511665344, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003298261435702443, + "grad_norm": 7.865381717681885, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8664826154708862, + "num_tokens": 507634251.0, + "step": 13308 + }, + { + "epoch": 1.693041597761099, + "ewc_loss": 0.06718353182077408, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003300384560134262, + "grad_norm": 7.877231121063232, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8691431283950806, + "num_tokens": 507671345.0, + "step": 13309 + }, + { + "epoch": 1.6931688080396896, + "ewc_loss": 0.06719265878200531, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003301297256257385, + "grad_norm": 7.914340019226074, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8520148992538452, + "num_tokens": 507709797.0, + "step": 13310 + }, + { + "epoch": 1.6932960183182801, + "ewc_loss": 0.06715919077396393, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003297950606793165, + "grad_norm": 7.880621910095215, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8623866438865662, + "num_tokens": 507747047.0, + "step": 13311 + }, + { + "epoch": 1.6934232285968707, + "ewc_loss": 0.06717780232429504, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003299811214674264, + "grad_norm": 7.872746467590332, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8627530336380005, + "num_tokens": 507782187.0, + "step": 13312 + }, + { + "epoch": 1.693550438875461, + "ewc_loss": 0.06727489829063416, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033095208345912397, + "grad_norm": 7.896146774291992, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8499516248703003, + "num_tokens": 507826714.0, + "step": 13313 + }, + { + "epoch": 1.6936776491540515, + "ewc_loss": 0.06719242036342621, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003301273100078106, + "grad_norm": 7.8069233894348145, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8545444011688232, + "num_tokens": 507869423.0, + "step": 13314 + }, + { + "epoch": 1.693804859432642, + "ewc_loss": 0.06740915775299072, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033229473046958447, + "grad_norm": 7.937622547149658, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8732191324234009, + "num_tokens": 507904711.0, + "step": 13315 + }, + { + "epoch": 1.6939320697112326, + "ewc_loss": 0.0674620270729065, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033038193942047656, + "grad_norm": 13.919635772705078, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8588730096817017, + "num_tokens": 507947760.0, + "step": 13316 + }, + { + "epoch": 1.694059279989823, + "ewc_loss": 0.07577395439147949, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00041594268986955285, + "grad_norm": 8.821749687194824, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8608336448669434, + "num_tokens": 507984249.0, + "step": 13317 + }, + { + "epoch": 1.6941864902684136, + "ewc_loss": 0.06726573407649994, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032841903157532215, + "grad_norm": 7.976709365844727, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8603765964508057, + "num_tokens": 508017206.0, + "step": 13318 + }, + { + "epoch": 1.6943137005470041, + "ewc_loss": 0.06861705332994461, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00034193226019851863, + "grad_norm": 8.188741683959961, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8568001985549927, + "num_tokens": 508056357.0, + "step": 13319 + }, + { + "epoch": 1.6944409108255947, + "ewc_loss": 0.06819542497396469, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00034015736309811473, + "grad_norm": 7.958642482757568, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8550539016723633, + "num_tokens": 508094870.0, + "step": 13320 + }, + { + "epoch": 1.6945681211041852, + "ewc_loss": 0.0678756907582283, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033696001628413796, + "grad_norm": 8.146039962768555, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.862393856048584, + "num_tokens": 508120323.0, + "step": 13321 + }, + { + "epoch": 1.6946953313827757, + "ewc_loss": 0.06755025684833527, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033370574237778783, + "grad_norm": 7.934054851531982, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8551899194717407, + "num_tokens": 508160018.0, + "step": 13322 + }, + { + "epoch": 1.6948225416613663, + "ewc_loss": 0.06791173666715622, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033732049632817507, + "grad_norm": 8.047784805297852, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8814144730567932, + "num_tokens": 508200947.0, + "step": 13323 + }, + { + "epoch": 1.6949497519399568, + "ewc_loss": 0.06731848418712616, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000331387942424044, + "grad_norm": 7.787720680236816, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8498868942260742, + "num_tokens": 508246457.0, + "step": 13324 + }, + { + "epoch": 1.6950769622185473, + "ewc_loss": 0.06782986223697662, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003365017764735967, + "grad_norm": 8.0051851272583, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8576439619064331, + "num_tokens": 508281951.0, + "step": 13325 + }, + { + "epoch": 1.6952041724971378, + "ewc_loss": 0.06735594570636749, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000331762625137344, + "grad_norm": 7.915966510772705, + "learning_rate": 1e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8445579409599304, + "num_tokens": 508322571.0, + "step": 13326 + }, + { + "epoch": 1.6953313827757284, + "ewc_loss": 0.0677422434091568, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003356255474500358, + "grad_norm": 7.912952423095703, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8693439960479736, + "num_tokens": 508357347.0, + "step": 13327 + }, + { + "epoch": 1.695458593054319, + "ewc_loss": 0.06730052828788757, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033120837179012597, + "grad_norm": 7.800435543060303, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8649972677230835, + "num_tokens": 508401813.0, + "step": 13328 + }, + { + "epoch": 1.6955858033329094, + "ewc_loss": 0.06766864657402039, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003348895988892764, + "grad_norm": 7.977829933166504, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8513711094856262, + "num_tokens": 508440457.0, + "step": 13329 + }, + { + "epoch": 1.6957130136115, + "ewc_loss": 0.06707870215177536, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003289901651442051, + "grad_norm": 7.847049713134766, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8541272878646851, + "num_tokens": 508472272.0, + "step": 13330 + }, + { + "epoch": 1.6958402238900905, + "ewc_loss": 0.06742879003286362, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003324910067021847, + "grad_norm": 7.876218795776367, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8757082223892212, + "num_tokens": 508509352.0, + "step": 13331 + }, + { + "epoch": 1.6959674341686808, + "ewc_loss": 0.06730028986930847, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003312059852760285, + "grad_norm": 7.824703693389893, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8719764351844788, + "num_tokens": 508552515.0, + "step": 13332 + }, + { + "epoch": 1.6960946444472713, + "ewc_loss": 0.06737177073955536, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033192086266353726, + "grad_norm": 7.892906188964844, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8751236796379089, + "num_tokens": 508591399.0, + "step": 13333 + }, + { + "epoch": 1.6962218547258618, + "ewc_loss": 0.06717308610677719, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003299339732620865, + "grad_norm": 7.858828544616699, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8677313327789307, + "num_tokens": 508630427.0, + "step": 13334 + }, + { + "epoch": 1.6963490650044524, + "ewc_loss": 0.06726260483264923, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003308291488792747, + "grad_norm": 7.839757919311523, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.871544361114502, + "num_tokens": 508667469.0, + "step": 13335 + }, + { + "epoch": 1.696476275283043, + "ewc_loss": 0.06732308119535446, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033143392647616565, + "grad_norm": 7.882079601287842, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8660461902618408, + "num_tokens": 508706308.0, + "step": 13336 + }, + { + "epoch": 1.6966034855616334, + "ewc_loss": 0.06713303923606873, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003295334754511714, + "grad_norm": 7.877810001373291, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8509484529495239, + "num_tokens": 508738968.0, + "step": 13337 + }, + { + "epoch": 1.6967306958402237, + "ewc_loss": 0.06730864942073822, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033128962968476117, + "grad_norm": 7.893553733825684, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8707261681556702, + "num_tokens": 508769879.0, + "step": 13338 + }, + { + "epoch": 1.6968579061188143, + "ewc_loss": 0.06741178780794144, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032987960730679333, + "grad_norm": 13.792257308959961, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8485023975372314, + "num_tokens": 508812277.0, + "step": 13339 + }, + { + "epoch": 1.6969851163974048, + "ewc_loss": 0.07617098838090897, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00041747160139493644, + "grad_norm": 8.901355743408203, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8540326356887817, + "num_tokens": 508849293.0, + "step": 13340 + }, + { + "epoch": 1.6971123266759953, + "ewc_loss": 0.06673122942447662, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032551545882597566, + "grad_norm": 7.933258056640625, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8640775680541992, + "num_tokens": 508885798.0, + "step": 13341 + }, + { + "epoch": 1.6972395369545858, + "ewc_loss": 0.06842736899852753, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003424768219701946, + "grad_norm": 8.197033882141113, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8606453537940979, + "num_tokens": 508920491.0, + "step": 13342 + }, + { + "epoch": 1.6973667472331764, + "ewc_loss": 0.06813152134418488, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003395183302927762, + "grad_norm": 8.101563453674316, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8625568151473999, + "num_tokens": 508956161.0, + "step": 13343 + }, + { + "epoch": 1.697493957511767, + "ewc_loss": 0.06760235130786896, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033422658452764153, + "grad_norm": 8.017313003540039, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8596146106719971, + "num_tokens": 508995696.0, + "step": 13344 + }, + { + "epoch": 1.6976211677903574, + "ewc_loss": 0.06759120523929596, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003341151459608227, + "grad_norm": 8.019535064697266, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8681192398071289, + "num_tokens": 509029505.0, + "step": 13345 + }, + { + "epoch": 1.697748378068948, + "ewc_loss": 0.06744493544101715, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003326524456497282, + "grad_norm": 7.9440083503723145, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8701286911964417, + "num_tokens": 509062815.0, + "step": 13346 + }, + { + "epoch": 1.6978755883475385, + "ewc_loss": 0.06758298724889755, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033403298584744334, + "grad_norm": 8.11180591583252, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8698501586914062, + "num_tokens": 509103376.0, + "step": 13347 + }, + { + "epoch": 1.698002798626129, + "ewc_loss": 0.0670529454946518, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032873262534849346, + "grad_norm": 7.789648532867432, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.872418999671936, + "num_tokens": 509142797.0, + "step": 13348 + }, + { + "epoch": 1.6981300089047195, + "ewc_loss": 0.0678725615143776, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033692875877022743, + "grad_norm": 8.357271194458008, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8523824214935303, + "num_tokens": 509182724.0, + "step": 13349 + }, + { + "epoch": 1.69825721918331, + "ewc_loss": 0.06666834652423859, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000324886612361297, + "grad_norm": 7.7315568923950195, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8712458610534668, + "num_tokens": 509217856.0, + "step": 13350 + }, + { + "epoch": 1.6983844294619006, + "ewc_loss": 0.06821063160896301, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003403094888199121, + "grad_norm": 8.226011276245117, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8641068935394287, + "num_tokens": 509251581.0, + "step": 13351 + }, + { + "epoch": 1.6985116397404911, + "ewc_loss": 0.06663285195827484, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032453163294121623, + "grad_norm": 7.7270121574401855, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8795909881591797, + "num_tokens": 509286687.0, + "step": 13352 + }, + { + "epoch": 1.6986388500190817, + "ewc_loss": 0.06812512129545212, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033945433096960187, + "grad_norm": 8.061141014099121, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8581462502479553, + "num_tokens": 509323372.0, + "step": 13353 + }, + { + "epoch": 1.6987660602976722, + "ewc_loss": 0.06703802943229675, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003285833809059113, + "grad_norm": 7.9276227951049805, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8563706278800964, + "num_tokens": 509362472.0, + "step": 13354 + }, + { + "epoch": 1.6988932705762627, + "ewc_loss": 0.06748487055301666, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003330518084112555, + "grad_norm": 7.930886268615723, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8647862672805786, + "num_tokens": 509397676.0, + "step": 13355 + }, + { + "epoch": 1.699020480854853, + "ewc_loss": 0.06720037758350372, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003302069380879402, + "grad_norm": 7.828019142150879, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8662741780281067, + "num_tokens": 509441100.0, + "step": 13356 + }, + { + "epoch": 1.6991476911334435, + "ewc_loss": 0.06770826876163483, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033284444361925125, + "grad_norm": 7.913792133331299, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8591615557670593, + "num_tokens": 509485430.0, + "step": 13357 + }, + { + "epoch": 1.699274901412034, + "ewc_loss": 0.06735280156135559, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033173110568895936, + "grad_norm": 7.962221622467041, + "learning_rate": 1e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.834372878074646, + "num_tokens": 509520102.0, + "step": 13358 + }, + { + "epoch": 1.6994021116906246, + "ewc_loss": 0.06712374091148376, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032944054692052305, + "grad_norm": 7.812828063964844, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.864762008190155, + "num_tokens": 509556525.0, + "step": 13359 + }, + { + "epoch": 1.6995293219692151, + "ewc_loss": 0.06757199764251709, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033392314799129963, + "grad_norm": 7.996913433074951, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8552212119102478, + "num_tokens": 509596899.0, + "step": 13360 + }, + { + "epoch": 1.6996565322478057, + "ewc_loss": 0.06725524365901947, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003283141122665256, + "grad_norm": 7.857953071594238, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8645643591880798, + "num_tokens": 509632125.0, + "step": 13361 + }, + { + "epoch": 1.699783742526396, + "ewc_loss": 0.0674125924706459, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033232904388569295, + "grad_norm": 7.94022274017334, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8576874732971191, + "num_tokens": 509668131.0, + "step": 13362 + }, + { + "epoch": 1.6999109528049865, + "ewc_loss": 0.06702332943677902, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003284364356659353, + "grad_norm": 7.809010982513428, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8762041330337524, + "num_tokens": 509704078.0, + "step": 13363 + }, + { + "epoch": 1.700038163083577, + "ewc_loss": 0.06765235215425491, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033228524262085557, + "grad_norm": 7.940240859985352, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8760178089141846, + "num_tokens": 509739464.0, + "step": 13364 + }, + { + "epoch": 1.7001653733621676, + "ewc_loss": 0.06704212725162506, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003286243590991944, + "grad_norm": 7.839473724365234, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8632408976554871, + "num_tokens": 509777970.0, + "step": 13365 + }, + { + "epoch": 1.700292583640758, + "ewc_loss": 0.06764134764671326, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033217514283023775, + "grad_norm": 7.932994365692139, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8499028086662292, + "num_tokens": 509817010.0, + "step": 13366 + }, + { + "epoch": 1.7004197939193486, + "ewc_loss": 0.067182257771492, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033002570853568614, + "grad_norm": 7.8148393630981445, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8684314489364624, + "num_tokens": 509854192.0, + "step": 13367 + }, + { + "epoch": 1.7005470041979391, + "ewc_loss": 0.06770072132349014, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033276891917921603, + "grad_norm": 7.9678730964660645, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8760422468185425, + "num_tokens": 509888633.0, + "step": 13368 + }, + { + "epoch": 1.7006742144765297, + "ewc_loss": 0.06694890558719635, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003276921634096652, + "grad_norm": 7.758769512176514, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8641228079795837, + "num_tokens": 509928474.0, + "step": 13369 + }, + { + "epoch": 1.7008014247551202, + "ewc_loss": 0.06758126616477966, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033401575637981296, + "grad_norm": 8.01801586151123, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8555206060409546, + "num_tokens": 509968875.0, + "step": 13370 + }, + { + "epoch": 1.7009286350337107, + "ewc_loss": 0.06697411835193634, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003279442898929119, + "grad_norm": 7.830721855163574, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8636184930801392, + "num_tokens": 510006021.0, + "step": 13371 + }, + { + "epoch": 1.7010558453123013, + "ewc_loss": 0.06760574877262115, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000334260577801615, + "grad_norm": 8.025228500366211, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8685311079025269, + "num_tokens": 510042111.0, + "step": 13372 + }, + { + "epoch": 1.7011830555908918, + "ewc_loss": 0.06702516973018646, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003284548001829535, + "grad_norm": 7.860620975494385, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8700460195541382, + "num_tokens": 510074894.0, + "step": 13373 + }, + { + "epoch": 1.7013102658694823, + "ewc_loss": 0.0674261599779129, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033246472594328225, + "grad_norm": 7.955644130706787, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8562227487564087, + "num_tokens": 510111516.0, + "step": 13374 + }, + { + "epoch": 1.7014374761480728, + "ewc_loss": 0.06703805178403854, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032858364284038544, + "grad_norm": 7.831821918487549, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8668176531791687, + "num_tokens": 510151549.0, + "step": 13375 + }, + { + "epoch": 1.7015646864266634, + "ewc_loss": 0.06735733151435852, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033177639124915004, + "grad_norm": 7.932790756225586, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.86322420835495, + "num_tokens": 510189215.0, + "step": 13376 + }, + { + "epoch": 1.701691896705254, + "ewc_loss": 0.06729118525981903, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003286735445726663, + "grad_norm": 7.833670139312744, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8836331367492676, + "num_tokens": 510231189.0, + "step": 13377 + }, + { + "epoch": 1.7018191069838444, + "ewc_loss": 0.06739519536495209, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033215503208339214, + "grad_norm": 7.914827346801758, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8670192956924438, + "num_tokens": 510273956.0, + "step": 13378 + }, + { + "epoch": 1.701946317262435, + "ewc_loss": 0.06729807704687119, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003287424915470183, + "grad_norm": 7.882019519805908, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8801836371421814, + "num_tokens": 510315095.0, + "step": 13379 + }, + { + "epoch": 1.7020735275410255, + "ewc_loss": 0.06716042757034302, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032980742980726063, + "grad_norm": 7.9773173332214355, + "learning_rate": 1e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8413972854614258, + "num_tokens": 510350753.0, + "step": 13380 + }, + { + "epoch": 1.7022007378196158, + "ewc_loss": 0.06689060479402542, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003271091845817864, + "grad_norm": 7.856253623962402, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8688908815383911, + "num_tokens": 510389257.0, + "step": 13381 + }, + { + "epoch": 1.7023279480982063, + "ewc_loss": 0.06726625561714172, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003308656741864979, + "grad_norm": 7.911239147186279, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8652129173278809, + "num_tokens": 510428879.0, + "step": 13382 + }, + { + "epoch": 1.7024551583767968, + "ewc_loss": 0.06711217761039734, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003293249465059489, + "grad_norm": 7.8792572021484375, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8656001687049866, + "num_tokens": 510468409.0, + "step": 13383 + }, + { + "epoch": 1.7025823686553874, + "ewc_loss": 0.06712843477725983, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003294875205028802, + "grad_norm": 7.865545272827148, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8790585994720459, + "num_tokens": 510503988.0, + "step": 13384 + }, + { + "epoch": 1.702709578933978, + "ewc_loss": 0.06745252013206482, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033028688631020486, + "grad_norm": 8.000958442687988, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.86151522397995, + "num_tokens": 510539009.0, + "step": 13385 + }, + { + "epoch": 1.7028367892125684, + "ewc_loss": 0.06705258786678314, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003287290164735168, + "grad_norm": 7.869974136352539, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8778964281082153, + "num_tokens": 510576177.0, + "step": 13386 + }, + { + "epoch": 1.7029639994911587, + "ewc_loss": 0.0674540251493454, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003303019911982119, + "grad_norm": 7.866601467132568, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8639358878135681, + "num_tokens": 510622194.0, + "step": 13387 + }, + { + "epoch": 1.7030912097697493, + "ewc_loss": 0.06710522621870041, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003292553883511573, + "grad_norm": 7.837886810302734, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8813339471817017, + "num_tokens": 510663564.0, + "step": 13388 + }, + { + "epoch": 1.7032184200483398, + "ewc_loss": 0.06722861528396606, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003304893325548619, + "grad_norm": 7.883106231689453, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8739458322525024, + "num_tokens": 510702153.0, + "step": 13389 + }, + { + "epoch": 1.7033456303269303, + "ewc_loss": 0.06721873581409454, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003303904668428004, + "grad_norm": 7.863312244415283, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8593279123306274, + "num_tokens": 510742812.0, + "step": 13390 + }, + { + "epoch": 1.7034728406055208, + "ewc_loss": 0.06739552319049835, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033215832081623375, + "grad_norm": 7.89611291885376, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8581748604774475, + "num_tokens": 510783363.0, + "step": 13391 + }, + { + "epoch": 1.7036000508841114, + "ewc_loss": 0.0671633929014206, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003298370575066656, + "grad_norm": 7.896789073944092, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8481637835502625, + "num_tokens": 510818362.0, + "step": 13392 + }, + { + "epoch": 1.703727261162702, + "ewc_loss": 0.06728380918502808, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033104117028415203, + "grad_norm": 7.862985134124756, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8744409084320068, + "num_tokens": 510854732.0, + "step": 13393 + }, + { + "epoch": 1.7038544714412924, + "ewc_loss": 0.06732980161905289, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033150112722069025, + "grad_norm": 7.893411636352539, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8569927215576172, + "num_tokens": 510895107.0, + "step": 13394 + }, + { + "epoch": 1.703981681719883, + "ewc_loss": 0.0671500414609909, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003297035291325301, + "grad_norm": 7.899702548980713, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8594790697097778, + "num_tokens": 510930737.0, + "step": 13395 + }, + { + "epoch": 1.7041088919984735, + "ewc_loss": 0.06738314032554626, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003320345713291317, + "grad_norm": 7.9353861808776855, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8728499412536621, + "num_tokens": 510964646.0, + "step": 13396 + }, + { + "epoch": 1.704236102277064, + "ewc_loss": 0.06709571182727814, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003291602188255638, + "grad_norm": 7.837226867675781, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.867159366607666, + "num_tokens": 511000583.0, + "step": 13397 + }, + { + "epoch": 1.7043633125556545, + "ewc_loss": 0.06726223975419998, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033082551090046763, + "grad_norm": 7.931264877319336, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.866723358631134, + "num_tokens": 511033729.0, + "step": 13398 + }, + { + "epoch": 1.704490522834245, + "ewc_loss": 0.0670604482293129, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032880762591958046, + "grad_norm": 7.773284435272217, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8690181374549866, + "num_tokens": 511074099.0, + "step": 13399 + }, + { + "epoch": 1.7046177331128356, + "ewc_loss": 0.06751474738121033, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003333505883347243, + "grad_norm": 8.011833190917969, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8521546125411987, + "num_tokens": 511112473.0, + "step": 13400 + }, + { + "epoch": 1.7047449433914261, + "ewc_loss": 0.06697153300046921, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032791844569146633, + "grad_norm": 7.805018901824951, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8629010319709778, + "num_tokens": 511149839.0, + "step": 13401 + }, + { + "epoch": 1.7048721536700167, + "ewc_loss": 0.06787648797035217, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003345265577081591, + "grad_norm": 8.027266502380371, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.852925717830658, + "num_tokens": 511183054.0, + "step": 13402 + }, + { + "epoch": 1.7049993639486072, + "ewc_loss": 0.066926971077919, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032747286604717374, + "grad_norm": 7.815728187561035, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8679232001304626, + "num_tokens": 511218498.0, + "step": 13403 + }, + { + "epoch": 1.7051265742271977, + "ewc_loss": 0.06789910793304443, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033475281088612974, + "grad_norm": 8.028407096862793, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8594282269477844, + "num_tokens": 511259776.0, + "step": 13404 + }, + { + "epoch": 1.705253784505788, + "ewc_loss": 0.06691616028547287, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032736474531702697, + "grad_norm": 7.778036594390869, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8705015182495117, + "num_tokens": 511297331.0, + "step": 13405 + }, + { + "epoch": 1.7053809947843785, + "ewc_loss": 0.06763704121112823, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003345735021866858, + "grad_norm": 7.989919185638428, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.875625729560852, + "num_tokens": 511331055.0, + "step": 13406 + }, + { + "epoch": 1.705508205062969, + "ewc_loss": 0.06698441505432129, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003280472883488983, + "grad_norm": 7.744561672210693, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.880567193031311, + "num_tokens": 511368651.0, + "step": 13407 + }, + { + "epoch": 1.7056354153415596, + "ewc_loss": 0.06763730943202972, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003345761797390878, + "grad_norm": 8.094677925109863, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8681221008300781, + "num_tokens": 511402163.0, + "step": 13408 + }, + { + "epoch": 1.7057626256201501, + "ewc_loss": 0.06695976853370667, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032780083711259067, + "grad_norm": 7.797928333282471, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8569558262825012, + "num_tokens": 511443769.0, + "step": 13409 + }, + { + "epoch": 1.7058898358987407, + "ewc_loss": 0.06764189153909683, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033462204737588763, + "grad_norm": 7.961113929748535, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8607358932495117, + "num_tokens": 511481038.0, + "step": 13410 + }, + { + "epoch": 1.706017046177331, + "ewc_loss": 0.06697527319192886, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032795584411360323, + "grad_norm": 7.801900863647461, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8663341403007507, + "num_tokens": 511522940.0, + "step": 13411 + }, + { + "epoch": 1.7061442564559215, + "ewc_loss": 0.06744692474603653, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033267238177359104, + "grad_norm": 7.977907180786133, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8616502285003662, + "num_tokens": 511562630.0, + "step": 13412 + }, + { + "epoch": 1.706271466734512, + "ewc_loss": 0.06705115735530853, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032871466828510165, + "grad_norm": 7.8274736404418945, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8488467931747437, + "num_tokens": 511598986.0, + "step": 13413 + }, + { + "epoch": 1.7063986770131025, + "ewc_loss": 0.06740879267454147, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033229103428311646, + "grad_norm": 7.962645530700684, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8655108213424683, + "num_tokens": 511633582.0, + "step": 13414 + }, + { + "epoch": 1.706525887291693, + "ewc_loss": 0.06706616282463074, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032886475673876703, + "grad_norm": 7.8525567054748535, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8638215661048889, + "num_tokens": 511673296.0, + "step": 13415 + }, + { + "epoch": 1.7066530975702836, + "ewc_loss": 0.06753384321928024, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033110016374848783, + "grad_norm": 7.926778793334961, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8561282753944397, + "num_tokens": 511707890.0, + "step": 13416 + }, + { + "epoch": 1.7067803078488741, + "ewc_loss": 0.06720149517059326, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003302180557511747, + "grad_norm": 7.893405437469482, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8710229992866516, + "num_tokens": 511741597.0, + "step": 13417 + }, + { + "epoch": 1.7069075181274647, + "ewc_loss": 0.06735270470380783, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003292887704446912, + "grad_norm": 7.8955302238464355, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.866667628288269, + "num_tokens": 511776816.0, + "step": 13418 + }, + { + "epoch": 1.7070347284060552, + "ewc_loss": 0.06738130748271942, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032957480289041996, + "grad_norm": 7.971882343292236, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8566053509712219, + "num_tokens": 511814866.0, + "step": 13419 + }, + { + "epoch": 1.7071619386846457, + "ewc_loss": 0.06718894839286804, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032765118521638215, + "grad_norm": 7.808868885040283, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.864421546459198, + "num_tokens": 511852458.0, + "step": 13420 + }, + { + "epoch": 1.7072891489632362, + "ewc_loss": 0.06771057844161987, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003304261190351099, + "grad_norm": 7.856863498687744, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8587643504142761, + "num_tokens": 511891201.0, + "step": 13421 + }, + { + "epoch": 1.7074163592418268, + "ewc_loss": 0.06729447841644287, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032870651921257377, + "grad_norm": 7.825057506561279, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.863315999507904, + "num_tokens": 511930862.0, + "step": 13422 + }, + { + "epoch": 1.7075435695204173, + "ewc_loss": 0.06750112771987915, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003307729493826628, + "grad_norm": 7.889706134796143, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.871370792388916, + "num_tokens": 511967695.0, + "step": 13423 + }, + { + "epoch": 1.7076707797990078, + "ewc_loss": 0.06706658005714417, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003288688894826919, + "grad_norm": 7.7998366355896, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8781534433364868, + "num_tokens": 512005976.0, + "step": 13424 + }, + { + "epoch": 1.7077979900775984, + "ewc_loss": 0.0673908144235611, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033211128902621567, + "grad_norm": 7.92264461517334, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8651336431503296, + "num_tokens": 512041575.0, + "step": 13425 + }, + { + "epoch": 1.7079252003561889, + "ewc_loss": 0.06701211631298065, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032832424039952457, + "grad_norm": 7.790127754211426, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8598939180374146, + "num_tokens": 512076171.0, + "step": 13426 + }, + { + "epoch": 1.7080524106347794, + "ewc_loss": 0.06753935664892197, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033359669032506645, + "grad_norm": 7.944626331329346, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8601934313774109, + "num_tokens": 512112082.0, + "step": 13427 + }, + { + "epoch": 1.70817962091337, + "ewc_loss": 0.06710314750671387, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003292345500085503, + "grad_norm": 7.885474681854248, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8505500555038452, + "num_tokens": 512149126.0, + "step": 13428 + }, + { + "epoch": 1.7083068311919605, + "ewc_loss": 0.06720633804798126, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003302665427327156, + "grad_norm": 7.858362674713135, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8651478886604309, + "num_tokens": 512182368.0, + "step": 13429 + }, + { + "epoch": 1.7084340414705508, + "ewc_loss": 0.06729784607887268, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003311815671622753, + "grad_norm": 7.905379772186279, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8448296189308167, + "num_tokens": 512218224.0, + "step": 13430 + }, + { + "epoch": 1.7085612517491413, + "ewc_loss": 0.06707002222537994, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003289033193141222, + "grad_norm": 7.797073841094971, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8630501627922058, + "num_tokens": 512255523.0, + "step": 13431 + }, + { + "epoch": 1.7086884620277318, + "ewc_loss": 0.06778068840503693, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003311271721031517, + "grad_norm": 7.822841167449951, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8741905689239502, + "num_tokens": 512290514.0, + "step": 13432 + }, + { + "epoch": 1.7088156723063224, + "ewc_loss": 0.06743040680885315, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033006578451022506, + "grad_norm": 7.815435886383057, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8726904988288879, + "num_tokens": 512328468.0, + "step": 13433 + }, + { + "epoch": 1.708942882584913, + "ewc_loss": 0.0676039308309555, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003318010421935469, + "grad_norm": 7.909591197967529, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8611541390419006, + "num_tokens": 512359786.0, + "step": 13434 + }, + { + "epoch": 1.7090700928635034, + "ewc_loss": 0.06760072708129883, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032932756585069, + "grad_norm": 7.873196125030518, + "learning_rate": 1e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8420326113700867, + "num_tokens": 512395679.0, + "step": 13435 + }, + { + "epoch": 1.7091973031420937, + "ewc_loss": 0.06753654032945633, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033112711389549077, + "grad_norm": 7.834455966949463, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8611273765563965, + "num_tokens": 512433491.0, + "step": 13436 + }, + { + "epoch": 1.7093245134206843, + "ewc_loss": 0.06745891273021698, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033035085652954876, + "grad_norm": 7.790502548217773, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8564348816871643, + "num_tokens": 512470215.0, + "step": 13437 + }, + { + "epoch": 1.7094517236992748, + "ewc_loss": 0.067732073366642, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033064105082303286, + "grad_norm": 7.863070011138916, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8595728874206543, + "num_tokens": 512507055.0, + "step": 13438 + }, + { + "epoch": 1.7095789339778653, + "ewc_loss": 0.0676647275686264, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032996758818626404, + "grad_norm": 7.831106662750244, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8458614945411682, + "num_tokens": 512547877.0, + "step": 13439 + }, + { + "epoch": 1.7097061442564558, + "ewc_loss": 0.06781353056430817, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003314556379336864, + "grad_norm": 7.944728374481201, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8496215343475342, + "num_tokens": 512586558.0, + "step": 13440 + }, + { + "epoch": 1.7098333545350464, + "ewc_loss": 0.06757476925849915, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003290679887868464, + "grad_norm": 7.767706871032715, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.860662043094635, + "num_tokens": 512631181.0, + "step": 13441 + }, + { + "epoch": 1.709960564813637, + "ewc_loss": 0.06792989373207092, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033261923817917705, + "grad_norm": 7.8276519775390625, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8733168840408325, + "num_tokens": 512667925.0, + "step": 13442 + }, + { + "epoch": 1.7100877750922274, + "ewc_loss": 0.06768149137496948, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033013519714586437, + "grad_norm": 7.836043357849121, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.846344530582428, + "num_tokens": 512703136.0, + "step": 13443 + }, + { + "epoch": 1.710214985370818, + "ewc_loss": 0.06780599057674408, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033138025901280344, + "grad_norm": 7.8993682861328125, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8654792308807373, + "num_tokens": 512739001.0, + "step": 13444 + }, + { + "epoch": 1.7103421956494085, + "ewc_loss": 0.06776781380176544, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000330998474964872, + "grad_norm": 7.814608573913574, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8675122857093811, + "num_tokens": 512780517.0, + "step": 13445 + }, + { + "epoch": 1.710469405927999, + "ewc_loss": 0.06777951121330261, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033111541415564716, + "grad_norm": 7.852587699890137, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.870587170124054, + "num_tokens": 512815964.0, + "step": 13446 + }, + { + "epoch": 1.7105966162065895, + "ewc_loss": 0.06777554750442505, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033107580384239554, + "grad_norm": 7.837107181549072, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.86629718542099, + "num_tokens": 512855451.0, + "step": 13447 + }, + { + "epoch": 1.71072382648518, + "ewc_loss": 0.06785373389720917, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033185764914378524, + "grad_norm": 7.863050937652588, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8669995665550232, + "num_tokens": 512895809.0, + "step": 13448 + }, + { + "epoch": 1.7108510367637706, + "ewc_loss": 0.06775569915771484, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003308772575110197, + "grad_norm": 7.843245506286621, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.868147611618042, + "num_tokens": 512936424.0, + "step": 13449 + }, + { + "epoch": 1.7109782470423611, + "ewc_loss": 0.06787465512752533, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033206684747710824, + "grad_norm": 7.9043731689453125, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8778501749038696, + "num_tokens": 512976486.0, + "step": 13450 + }, + { + "epoch": 1.7111054573209517, + "ewc_loss": 0.06772644817829132, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033058479311876, + "grad_norm": 7.916136264801025, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8539659976959229, + "num_tokens": 513014131.0, + "step": 13451 + }, + { + "epoch": 1.7112326675995422, + "ewc_loss": 0.06744097173213959, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003301714896224439, + "grad_norm": 7.875156879425049, + "learning_rate": 1e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8352549076080322, + "num_tokens": 513054706.0, + "step": 13452 + }, + { + "epoch": 1.7113598778781327, + "ewc_loss": 0.06750982999801636, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003308600280433893, + "grad_norm": 7.839777946472168, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8696675300598145, + "num_tokens": 513094089.0, + "step": 13453 + }, + { + "epoch": 1.711487088156723, + "ewc_loss": 0.06774959713220596, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033081628498621285, + "grad_norm": 7.842919826507568, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8603437542915344, + "num_tokens": 513136306.0, + "step": 13454 + }, + { + "epoch": 1.7116142984353135, + "ewc_loss": 0.06728824973106384, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003310856700409204, + "grad_norm": 7.893614768981934, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8588632345199585, + "num_tokens": 513174376.0, + "step": 13455 + }, + { + "epoch": 1.711741508713904, + "ewc_loss": 0.06777197867631912, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033104009344242513, + "grad_norm": 7.835878849029541, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8662868738174438, + "num_tokens": 513211834.0, + "step": 13456 + }, + { + "epoch": 1.7118687189924946, + "ewc_loss": 0.06792078167200089, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000332528114086017, + "grad_norm": 7.966675758361816, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8546808362007141, + "num_tokens": 513241765.0, + "step": 13457 + }, + { + "epoch": 1.7119959292710851, + "ewc_loss": 0.06714960932731628, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032969925086945295, + "grad_norm": 7.846938610076904, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8500667810440063, + "num_tokens": 513283510.0, + "step": 13458 + }, + { + "epoch": 1.7121231395496757, + "ewc_loss": 0.0679701566696167, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003330218605697155, + "grad_norm": 7.924124240875244, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8618299961090088, + "num_tokens": 513320746.0, + "step": 13459 + }, + { + "epoch": 1.712250349828266, + "ewc_loss": 0.06767840683460236, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003301044343970716, + "grad_norm": 7.872740268707275, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8584774732589722, + "num_tokens": 513358036.0, + "step": 13460 + }, + { + "epoch": 1.7123775601068565, + "ewc_loss": 0.06783103942871094, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033163069747388363, + "grad_norm": 7.901170253753662, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8688039779663086, + "num_tokens": 513395156.0, + "step": 13461 + }, + { + "epoch": 1.712504770385447, + "ewc_loss": 0.06766213476657867, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032994162756949663, + "grad_norm": 7.887272834777832, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8587054014205933, + "num_tokens": 513438631.0, + "step": 13462 + }, + { + "epoch": 1.7126319806640375, + "ewc_loss": 0.06778737902641296, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033119411091320217, + "grad_norm": 8.027874946594238, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8527826070785522, + "num_tokens": 513471225.0, + "step": 13463 + }, + { + "epoch": 1.712759190942628, + "ewc_loss": 0.06741976737976074, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032751800608821213, + "grad_norm": 7.7802205085754395, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8550516963005066, + "num_tokens": 513513824.0, + "step": 13464 + }, + { + "epoch": 1.7128864012212186, + "ewc_loss": 0.06789053976535797, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003322256961837411, + "grad_norm": 7.940343856811523, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.856143593788147, + "num_tokens": 513556263.0, + "step": 13465 + }, + { + "epoch": 1.7130136114998091, + "ewc_loss": 0.0674523413181305, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000327843677951023, + "grad_norm": 7.825882911682129, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.866805911064148, + "num_tokens": 513595093.0, + "step": 13466 + }, + { + "epoch": 1.7131408217783997, + "ewc_loss": 0.0678706169128418, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033202648046426475, + "grad_norm": 7.936773777008057, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8544378280639648, + "num_tokens": 513633761.0, + "step": 13467 + }, + { + "epoch": 1.7132680320569902, + "ewc_loss": 0.06729966402053833, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032875838223844767, + "grad_norm": 7.845980644226074, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.859900176525116, + "num_tokens": 513671404.0, + "step": 13468 + }, + { + "epoch": 1.7133952423355807, + "ewc_loss": 0.06759865581989288, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003317482478450984, + "grad_norm": 7.893878936767578, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8616114258766174, + "num_tokens": 513713752.0, + "step": 13469 + }, + { + "epoch": 1.7135224526141712, + "ewc_loss": 0.06746736168861389, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003304353158455342, + "grad_norm": 7.852525234222412, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8585677146911621, + "num_tokens": 513755261.0, + "step": 13470 + }, + { + "epoch": 1.7136496628927618, + "ewc_loss": 0.06872671097517014, + "ewc_loss_diag": 3.5762786865234375e-05, + "ewc_loss_parallel": 0.00033082178561016917, + "grad_norm": 53.819759368896484, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8679546117782593, + "num_tokens": 513799042.0, + "step": 13471 + }, + { + "epoch": 1.7137768731713523, + "ewc_loss": 0.11082159727811813, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0007566534914076328, + "grad_norm": 12.679069519042969, + "learning_rate": 1e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8524036407470703, + "num_tokens": 513838148.0, + "step": 13472 + }, + { + "epoch": 1.7139040834499428, + "ewc_loss": 0.06852176785469055, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000333655159920454, + "grad_norm": 6.846506595611572, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8700969219207764, + "num_tokens": 513878960.0, + "step": 13473 + }, + { + "epoch": 1.7140312937285334, + "ewc_loss": 0.0943414717912674, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0005918521783314645, + "grad_norm": 11.720748901367188, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.864855170249939, + "num_tokens": 513914083.0, + "step": 13474 + }, + { + "epoch": 1.7141585040071239, + "ewc_loss": 0.10059263557195663, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0006543638883158565, + "grad_norm": 11.573266983032227, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8517208099365234, + "num_tokens": 513951591.0, + "step": 13475 + }, + { + "epoch": 1.7142857142857144, + "ewc_loss": 0.07778678834438324, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00042630539974197745, + "grad_norm": 8.406363487243652, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8476387858390808, + "num_tokens": 513986127.0, + "step": 13476 + }, + { + "epoch": 1.714412924564305, + "ewc_loss": 0.08155932277441025, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004640307161025703, + "grad_norm": 9.971508979797363, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8569333553314209, + "num_tokens": 514028248.0, + "step": 13477 + }, + { + "epoch": 1.7145401348428955, + "ewc_loss": 0.08360574394464493, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0004893777659162879, + "grad_norm": 9.665849685668945, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8652265071868896, + "num_tokens": 514062757.0, + "step": 13478 + }, + { + "epoch": 1.7146673451214858, + "ewc_loss": 0.07505503296852112, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0004038706247229129, + "grad_norm": 8.608186721801758, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8500951528549194, + "num_tokens": 514102635.0, + "step": 13479 + }, + { + "epoch": 1.7147945554000763, + "ewc_loss": 0.07585836946964264, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00041434538434259593, + "grad_norm": 9.11561107635498, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8614158630371094, + "num_tokens": 514140883.0, + "step": 13480 + }, + { + "epoch": 1.7149217656786668, + "ewc_loss": 0.07477506995201111, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00040351247298531234, + "grad_norm": 8.61032485961914, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8727418780326843, + "num_tokens": 514176536.0, + "step": 13481 + }, + { + "epoch": 1.7150489759572574, + "ewc_loss": 0.07279255986213684, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00038368735113181174, + "grad_norm": 8.608871459960938, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8564655184745789, + "num_tokens": 514209969.0, + "step": 13482 + }, + { + "epoch": 1.7151761862358479, + "ewc_loss": 0.07225871086120605, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00037834877730347216, + "grad_norm": 8.452573776245117, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8612397909164429, + "num_tokens": 514249679.0, + "step": 13483 + }, + { + "epoch": 1.7153033965144384, + "ewc_loss": 0.07144351303577423, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00037019685260020196, + "grad_norm": 8.360932350158691, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8566409945487976, + "num_tokens": 514292719.0, + "step": 13484 + }, + { + "epoch": 1.7154306067930287, + "ewc_loss": 0.07028943300247192, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003610974526964128, + "grad_norm": 8.282876968383789, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8559989929199219, + "num_tokens": 514323396.0, + "step": 13485 + }, + { + "epoch": 1.7155578170716193, + "ewc_loss": 0.07012459635734558, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003594490699470043, + "grad_norm": 8.199512481689453, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8690330386161804, + "num_tokens": 514360951.0, + "step": 13486 + }, + { + "epoch": 1.7156850273502098, + "ewc_loss": 0.06970838457345963, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003528455563355237, + "grad_norm": 8.19011402130127, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8472323417663574, + "num_tokens": 514403879.0, + "step": 13487 + }, + { + "epoch": 1.7158122376288003, + "ewc_loss": 0.06920365244150162, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003502396575640887, + "grad_norm": 8.028254508972168, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8697266578674316, + "num_tokens": 514446628.0, + "step": 13488 + }, + { + "epoch": 1.7159394479073908, + "ewc_loss": 0.06878650188446045, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003460681182332337, + "grad_norm": 8.090402603149414, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8623191118240356, + "num_tokens": 514486132.0, + "step": 13489 + }, + { + "epoch": 1.7160666581859814, + "ewc_loss": 0.06863055378198624, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00034450864768587053, + "grad_norm": 8.021268844604492, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8507782816886902, + "num_tokens": 514526384.0, + "step": 13490 + }, + { + "epoch": 1.716193868464572, + "ewc_loss": 0.06842059642076492, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00034240909735672176, + "grad_norm": 8.027737617492676, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8608889579772949, + "num_tokens": 514565828.0, + "step": 13491 + }, + { + "epoch": 1.7163210787431624, + "ewc_loss": 0.06823210418224335, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00034052415867336094, + "grad_norm": 8.041308403015137, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8582308292388916, + "num_tokens": 514605260.0, + "step": 13492 + }, + { + "epoch": 1.716448289021753, + "ewc_loss": 0.06808000057935715, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003390031342860311, + "grad_norm": 7.985986232757568, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8646559715270996, + "num_tokens": 514643433.0, + "step": 13493 + }, + { + "epoch": 1.7165754993003435, + "ewc_loss": 0.06780216842889786, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003362248244229704, + "grad_norm": 7.919280052185059, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8654467463493347, + "num_tokens": 514683264.0, + "step": 13494 + }, + { + "epoch": 1.716702709578934, + "ewc_loss": 0.06800619512796402, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033826506114564836, + "grad_norm": 7.968306064605713, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8704595565795898, + "num_tokens": 514719775.0, + "step": 13495 + }, + { + "epoch": 1.7168299198575245, + "ewc_loss": 0.06776688992977142, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003358720277901739, + "grad_norm": 7.911305904388428, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8547625541687012, + "num_tokens": 514766287.0, + "step": 13496 + }, + { + "epoch": 1.716957130136115, + "ewc_loss": 0.06784741580486298, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003366772725712508, + "grad_norm": 7.97510290145874, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8725066184997559, + "num_tokens": 514806725.0, + "step": 13497 + }, + { + "epoch": 1.7170843404147056, + "ewc_loss": 0.06760621070861816, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033426526351831853, + "grad_norm": 7.8877363204956055, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8660603761672974, + "num_tokens": 514844008.0, + "step": 13498 + }, + { + "epoch": 1.7172115506932961, + "ewc_loss": 0.06779548525810242, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003361580311320722, + "grad_norm": 7.999322891235352, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8494960069656372, + "num_tokens": 514880356.0, + "step": 13499 + }, + { + "epoch": 1.7173387609718866, + "ewc_loss": 0.06739538908004761, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003321570111438632, + "grad_norm": 7.849656105041504, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8741726875305176, + "num_tokens": 514915309.0, + "step": 13500 + }, + { + "epoch": 1.7174659712504772, + "ewc_loss": 0.0678013265132904, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003362164134159684, + "grad_norm": 8.022053718566895, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8563024997711182, + "num_tokens": 514956496.0, + "step": 13501 + }, + { + "epoch": 1.7175931815290677, + "ewc_loss": 0.06736090034246445, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003318121307529509, + "grad_norm": 7.915046215057373, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8625954389572144, + "num_tokens": 514992247.0, + "step": 13502 + }, + { + "epoch": 1.717720391807658, + "ewc_loss": 0.06774044781923294, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000335607590386644, + "grad_norm": 7.93308162689209, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.854907214641571, + "num_tokens": 515035443.0, + "step": 13503 + }, + { + "epoch": 1.7178476020862485, + "ewc_loss": 0.0674620270729065, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003328233433421701, + "grad_norm": 7.935290813446045, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8669391870498657, + "num_tokens": 515072279.0, + "step": 13504 + }, + { + "epoch": 1.717974812364839, + "ewc_loss": 0.06752042472362518, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033340739901177585, + "grad_norm": 7.933178901672363, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8603829145431519, + "num_tokens": 515112547.0, + "step": 13505 + }, + { + "epoch": 1.7181020226434296, + "ewc_loss": 0.0675756186246872, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033395926584489644, + "grad_norm": 7.961811065673828, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8499189615249634, + "num_tokens": 515146814.0, + "step": 13506 + }, + { + "epoch": 1.7182292329220201, + "ewc_loss": 0.06741821020841599, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003323852433823049, + "grad_norm": 7.886651992797852, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8643265962600708, + "num_tokens": 515184393.0, + "step": 13507 + }, + { + "epoch": 1.7183564432006107, + "ewc_loss": 0.06761160492897034, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033431919291615486, + "grad_norm": 7.931828022003174, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8552380204200745, + "num_tokens": 515225312.0, + "step": 13508 + }, + { + "epoch": 1.718483653479201, + "ewc_loss": 0.06741306185722351, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033233370049856603, + "grad_norm": 7.978167533874512, + "learning_rate": 1e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8404493927955627, + "num_tokens": 515261330.0, + "step": 13509 + }, + { + "epoch": 1.7186108637577915, + "ewc_loss": 0.06786025315523148, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003343642456457019, + "grad_norm": 7.989020347595215, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.868254542350769, + "num_tokens": 515299325.0, + "step": 13510 + }, + { + "epoch": 1.718738074036382, + "ewc_loss": 0.06765660643577576, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033232782152481377, + "grad_norm": 7.899838447570801, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8626723289489746, + "num_tokens": 515335213.0, + "step": 13511 + }, + { + "epoch": 1.7188652843149725, + "ewc_loss": 0.06767474114894867, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033495048410259187, + "grad_norm": 8.025870323181152, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8731734156608582, + "num_tokens": 515365699.0, + "step": 13512 + }, + { + "epoch": 1.718992494593563, + "ewc_loss": 0.0672970712184906, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003311738546472043, + "grad_norm": 7.9158101081848145, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8709766268730164, + "num_tokens": 515396025.0, + "step": 13513 + }, + { + "epoch": 1.7191197048721536, + "ewc_loss": 0.06765830516815186, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033478622208349407, + "grad_norm": 7.985107421875, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8640477657318115, + "num_tokens": 515429724.0, + "step": 13514 + }, + { + "epoch": 1.7192469151507441, + "ewc_loss": 0.06719572842121124, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003301604010630399, + "grad_norm": 7.893363952636719, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8586572408676147, + "num_tokens": 515465894.0, + "step": 13515 + }, + { + "epoch": 1.7193741254293347, + "ewc_loss": 0.06780391931533813, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003338008827995509, + "grad_norm": 7.926722049713135, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.881577730178833, + "num_tokens": 515498894.0, + "step": 13516 + }, + { + "epoch": 1.7195013357079252, + "ewc_loss": 0.06754930317401886, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033125473419204354, + "grad_norm": 7.882948398590088, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8619931936264038, + "num_tokens": 515538783.0, + "step": 13517 + }, + { + "epoch": 1.7196285459865157, + "ewc_loss": 0.06767807900905609, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033254249137826264, + "grad_norm": 7.929240703582764, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8615727424621582, + "num_tokens": 515573107.0, + "step": 13518 + }, + { + "epoch": 1.7197557562651062, + "ewc_loss": 0.06791737675666809, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003324940335005522, + "grad_norm": 7.900973796844482, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.864549994468689, + "num_tokens": 515614901.0, + "step": 13519 + }, + { + "epoch": 1.7198829665436968, + "ewc_loss": 0.06761054694652557, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033186713699251413, + "grad_norm": 13.846302032470703, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8659825325012207, + "num_tokens": 515654197.0, + "step": 13520 + }, + { + "epoch": 1.7200101768222873, + "ewc_loss": 0.07662808895111084, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0004196012450847775, + "grad_norm": 8.907184600830078, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8806295394897461, + "num_tokens": 515697003.0, + "step": 13521 + }, + { + "epoch": 1.7201373871008778, + "ewc_loss": 0.0673966184258461, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003272864851169288, + "grad_norm": 7.943874835968018, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8580046892166138, + "num_tokens": 515737801.0, + "step": 13522 + }, + { + "epoch": 1.7202645973794684, + "ewc_loss": 0.0690804198384285, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003441245062276721, + "grad_norm": 8.21658992767334, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8541048765182495, + "num_tokens": 515774711.0, + "step": 13523 + }, + { + "epoch": 1.7203918076580589, + "ewc_loss": 0.0684511661529541, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003402733418624848, + "grad_norm": 8.029266357421875, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8547003269195557, + "num_tokens": 515811728.0, + "step": 13524 + }, + { + "epoch": 1.7205190179366494, + "ewc_loss": 0.06836868822574615, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003394486557226628, + "grad_norm": 8.156661987304688, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8688498735427856, + "num_tokens": 515844425.0, + "step": 13525 + }, + { + "epoch": 1.72064622821524, + "ewc_loss": 0.06771311163902283, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003328928432893008, + "grad_norm": 7.908295631408691, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8635549545288086, + "num_tokens": 515882439.0, + "step": 13526 + }, + { + "epoch": 1.7207734384938305, + "ewc_loss": 0.06842827051877975, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033760303631424904, + "grad_norm": 8.04234790802002, + "learning_rate": 1e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.8414309024810791, + "num_tokens": 515922168.0, + "step": 13527 + }, + { + "epoch": 1.7209006487724208, + "ewc_loss": 0.0677194893360138, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003329566097818315, + "grad_norm": 7.94241189956665, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8700771331787109, + "num_tokens": 515961840.0, + "step": 13528 + }, + { + "epoch": 1.7210278590510113, + "ewc_loss": 0.06788049638271332, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033456666278652847, + "grad_norm": 8.069405555725098, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8624333143234253, + "num_tokens": 515997586.0, + "step": 13529 + }, + { + "epoch": 1.7211550693296018, + "ewc_loss": 0.06774729490280151, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003332346968818456, + "grad_norm": 7.9257283210754395, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8690112233161926, + "num_tokens": 516036660.0, + "step": 13530 + }, + { + "epoch": 1.7212822796081924, + "ewc_loss": 0.0677647590637207, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.000333409319864586, + "grad_norm": 8.012177467346191, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8547177314758301, + "num_tokens": 516070846.0, + "step": 13531 + }, + { + "epoch": 1.7214094898867829, + "ewc_loss": 0.06754621863365173, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003312239423394203, + "grad_norm": 7.871624946594238, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8848744630813599, + "num_tokens": 516107027.0, + "step": 13532 + }, + { + "epoch": 1.7215367001653734, + "ewc_loss": 0.0678010806441307, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033377253566868603, + "grad_norm": 7.990931510925293, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8632935285568237, + "num_tokens": 516143971.0, + "step": 13533 + }, + { + "epoch": 1.7216639104439637, + "ewc_loss": 0.06749416142702103, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033070333302021027, + "grad_norm": 7.86680269241333, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8644452095031738, + "num_tokens": 516180617.0, + "step": 13534 + }, + { + "epoch": 1.7217911207225542, + "ewc_loss": 0.06800787895917892, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003333991044200957, + "grad_norm": 7.9888386726379395, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8633708357810974, + "num_tokens": 516220371.0, + "step": 13535 + }, + { + "epoch": 1.7219183310011448, + "ewc_loss": 0.0676942691206932, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033026302116923034, + "grad_norm": 7.868009567260742, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8653318285942078, + "num_tokens": 516258642.0, + "step": 13536 + }, + { + "epoch": 1.7220455412797353, + "ewc_loss": 0.06803825497627258, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003337028610985726, + "grad_norm": 7.977386474609375, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8566316366195679, + "num_tokens": 516298971.0, + "step": 13537 + }, + { + "epoch": 1.7221727515583258, + "ewc_loss": 0.06739403307437897, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00032970207394100726, + "grad_norm": 7.896234035491943, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8752521872520447, + "num_tokens": 516334457.0, + "step": 13538 + }, + { + "epoch": 1.7222999618369164, + "ewc_loss": 0.06797084212303162, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033302875817753375, + "grad_norm": 7.901584148406982, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8878999948501587, + "num_tokens": 516374585.0, + "step": 13539 + }, + { + "epoch": 1.7224271721155069, + "ewc_loss": 0.0675676241517067, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033143797190859914, + "grad_norm": 7.949355602264404, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8491615056991577, + "num_tokens": 516414180.0, + "step": 13540 + }, + { + "epoch": 1.7225543823940974, + "ewc_loss": 0.06752820312976837, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.0003310437605250627, + "grad_norm": 7.920706748962402, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8711138963699341, + "num_tokens": 516450189.0, + "step": 13541 + }, + { + "epoch": 1.722681592672688, + "ewc_loss": 0.0676925778388977, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033268745755776763, + "grad_norm": 7.996232032775879, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8617714047431946, + "num_tokens": 516487549.0, + "step": 13542 + }, + { + "epoch": 1.7228088029512785, + "ewc_loss": 0.06745266914367676, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033028842881321907, + "grad_norm": 7.987461090087891, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8493385314941406, + "num_tokens": 516526649.0, + "step": 13543 + }, + { + "epoch": 1.722936013229869, + "ewc_loss": 0.06756982207298279, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033145988709293306, + "grad_norm": 7.93364143371582, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8681186437606812, + "num_tokens": 516563829.0, + "step": 13544 + }, + { + "epoch": 1.7230632235084595, + "ewc_loss": 0.0678505152463913, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003318254603073001, + "grad_norm": 7.951426982879639, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8454082012176514, + "num_tokens": 516605179.0, + "step": 13545 + }, + { + "epoch": 1.72319043378705, + "ewc_loss": 0.06752052903175354, + "ewc_loss_diag": 3.4332275390625e-05, + "ewc_loss_parallel": 0.00033096704282797873, + "grad_norm": 7.926649570465088, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8611385822296143, + "num_tokens": 516641576.0, + "step": 13546 + }, + { + "epoch": 1.7233176440656406, + "ewc_loss": 0.06776879727840424, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003310082829557359, + "grad_norm": 8.010791778564453, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.859447181224823, + "num_tokens": 516678687.0, + "step": 13547 + }, + { + "epoch": 1.7234448543442311, + "ewc_loss": 0.06758420169353485, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003291622851975262, + "grad_norm": 7.872732162475586, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8583019375801086, + "num_tokens": 516711484.0, + "step": 13548 + }, + { + "epoch": 1.7235720646228216, + "ewc_loss": 0.06798157095909119, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033313603489659727, + "grad_norm": 7.997054100036621, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8580077290534973, + "num_tokens": 516750598.0, + "step": 13549 + }, + { + "epoch": 1.7236992749014122, + "ewc_loss": 0.06755334883928299, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003288538136985153, + "grad_norm": 7.843559741973877, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8708412051200867, + "num_tokens": 516793438.0, + "step": 13550 + }, + { + "epoch": 1.7238264851800027, + "ewc_loss": 0.06791503727436066, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033247063402086496, + "grad_norm": 7.954732894897461, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8768134713172913, + "num_tokens": 516828692.0, + "step": 13551 + }, + { + "epoch": 1.723953695458593, + "ewc_loss": 0.06760310381650925, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032935134368017316, + "grad_norm": 7.890081882476807, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8633847832679749, + "num_tokens": 516867592.0, + "step": 13552 + }, + { + "epoch": 1.7240809057371835, + "ewc_loss": 0.06785859167575836, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000331906252540648, + "grad_norm": 7.946935176849365, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8509587049484253, + "num_tokens": 516907668.0, + "step": 13553 + }, + { + "epoch": 1.724208116015774, + "ewc_loss": 0.06761516630649567, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032947194995358586, + "grad_norm": 7.849014759063721, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8605011701583862, + "num_tokens": 516944628.0, + "step": 13554 + }, + { + "epoch": 1.7243353262943646, + "ewc_loss": 0.06794323027133942, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003327526501379907, + "grad_norm": 7.927402973175049, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8571845293045044, + "num_tokens": 516989027.0, + "step": 13555 + }, + { + "epoch": 1.7244625365729551, + "ewc_loss": 0.06768326461315155, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000330152950482443, + "grad_norm": 7.880805015563965, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8580151200294495, + "num_tokens": 517024464.0, + "step": 13556 + }, + { + "epoch": 1.7245897468515456, + "ewc_loss": 0.06789662688970566, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003322865813970566, + "grad_norm": 7.878881931304932, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.866669237613678, + "num_tokens": 517067714.0, + "step": 13557 + }, + { + "epoch": 1.724716957130136, + "ewc_loss": 0.06785208731889725, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003318411763757467, + "grad_norm": 7.957127094268799, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8661220669746399, + "num_tokens": 517107564.0, + "step": 13558 + }, + { + "epoch": 1.7248441674087265, + "ewc_loss": 0.06715016067028046, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003297047223895788, + "grad_norm": 7.863607406616211, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8670524954795837, + "num_tokens": 517148240.0, + "step": 13559 + }, + { + "epoch": 1.724971377687317, + "ewc_loss": 0.06741432845592499, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003323464188724756, + "grad_norm": 7.912586688995361, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8569002747535706, + "num_tokens": 517187841.0, + "step": 13560 + }, + { + "epoch": 1.7250985879659075, + "ewc_loss": 0.0676543116569519, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032986339647322893, + "grad_norm": 7.891787528991699, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8463851809501648, + "num_tokens": 517225732.0, + "step": 13561 + }, + { + "epoch": 1.725225798244498, + "ewc_loss": 0.06729978322982788, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003312009503133595, + "grad_norm": 7.93457555770874, + "learning_rate": 1e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.846281111240387, + "num_tokens": 517260423.0, + "step": 13562 + }, + { + "epoch": 1.7253530085230886, + "ewc_loss": 0.06726056337356567, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000330808776197955, + "grad_norm": 7.861402988433838, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8501157760620117, + "num_tokens": 517300862.0, + "step": 13563 + }, + { + "epoch": 1.7254802188016791, + "ewc_loss": 0.0678642988204956, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003319633542560041, + "grad_norm": 7.935959815979004, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8673654794692993, + "num_tokens": 517341010.0, + "step": 13564 + }, + { + "epoch": 1.7256074290802697, + "ewc_loss": 0.06774881482124329, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003308085142634809, + "grad_norm": 7.838951587677002, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8457277417182922, + "num_tokens": 517376853.0, + "step": 13565 + }, + { + "epoch": 1.7257346393588602, + "ewc_loss": 0.06749395281076431, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003331426705699414, + "grad_norm": 7.9164323806762695, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8565517067909241, + "num_tokens": 517421854.0, + "step": 13566 + }, + { + "epoch": 1.7258618496374507, + "ewc_loss": 0.0678054541349411, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003313748457003385, + "grad_norm": 7.853369235992432, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8556140065193176, + "num_tokens": 517461711.0, + "step": 13567 + }, + { + "epoch": 1.7259890599160412, + "ewc_loss": 0.0680747926235199, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003340682596899569, + "grad_norm": 7.993476390838623, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8692805767059326, + "num_tokens": 517494755.0, + "step": 13568 + }, + { + "epoch": 1.7261162701946318, + "ewc_loss": 0.06774767488241196, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033079704735428095, + "grad_norm": 7.848400592803955, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8538856506347656, + "num_tokens": 517539061.0, + "step": 13569 + }, + { + "epoch": 1.7262434804732223, + "ewc_loss": 0.06811435520648956, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033446380984969437, + "grad_norm": 8.011444091796875, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8586980700492859, + "num_tokens": 517579768.0, + "step": 13570 + }, + { + "epoch": 1.7263706907518128, + "ewc_loss": 0.06774508953094482, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003307711740490049, + "grad_norm": 7.862654209136963, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8744784593582153, + "num_tokens": 517612006.0, + "step": 13571 + }, + { + "epoch": 1.7264979010304033, + "ewc_loss": 0.06815171241760254, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003348374448250979, + "grad_norm": 7.97428035736084, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8613922595977783, + "num_tokens": 517647547.0, + "step": 13572 + }, + { + "epoch": 1.7266251113089939, + "ewc_loss": 0.06779737770557404, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003312940534669906, + "grad_norm": 7.844228744506836, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8529621362686157, + "num_tokens": 517689067.0, + "step": 13573 + }, + { + "epoch": 1.7267523215875844, + "ewc_loss": 0.06806156039237976, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033393592457287014, + "grad_norm": 7.952097415924072, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8669883012771606, + "num_tokens": 517729967.0, + "step": 13574 + }, + { + "epoch": 1.726879531866175, + "ewc_loss": 0.06775146722793579, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000330835027853027, + "grad_norm": 7.858894348144531, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8615995645523071, + "num_tokens": 517770427.0, + "step": 13575 + }, + { + "epoch": 1.7270067421447655, + "ewc_loss": 0.06810735166072845, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003343938151374459, + "grad_norm": 8.038684844970703, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8587993383407593, + "num_tokens": 517804202.0, + "step": 13576 + }, + { + "epoch": 1.7271339524233558, + "ewc_loss": 0.06778550148010254, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033117528073489666, + "grad_norm": 7.925358295440674, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8547998070716858, + "num_tokens": 517834825.0, + "step": 13577 + }, + { + "epoch": 1.7272611627019463, + "ewc_loss": 0.06802226603031158, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003335429937578738, + "grad_norm": 7.986289978027344, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8550837635993958, + "num_tokens": 517874600.0, + "step": 13578 + }, + { + "epoch": 1.7273883729805368, + "ewc_loss": 0.06776192784309387, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003309395688120276, + "grad_norm": 7.916263103485107, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8527878522872925, + "num_tokens": 517913488.0, + "step": 13579 + }, + { + "epoch": 1.7275155832591274, + "ewc_loss": 0.06798180937767029, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003331384214106947, + "grad_norm": 7.9700117111206055, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8767048716545105, + "num_tokens": 517951094.0, + "step": 13580 + }, + { + "epoch": 1.7276427935377179, + "ewc_loss": 0.06776759773492813, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033099629217758775, + "grad_norm": 7.883157253265381, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8512781858444214, + "num_tokens": 517993811.0, + "step": 13581 + }, + { + "epoch": 1.7277700038163084, + "ewc_loss": 0.0678924098610878, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033224440994672477, + "grad_norm": 7.9655985832214355, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8638483285903931, + "num_tokens": 518032860.0, + "step": 13582 + }, + { + "epoch": 1.7278972140948987, + "ewc_loss": 0.0676744282245636, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033006464946083724, + "grad_norm": 7.887602806091309, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8571305871009827, + "num_tokens": 518070256.0, + "step": 13583 + }, + { + "epoch": 1.7280244243734892, + "ewc_loss": 0.06789884716272354, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033230878761969507, + "grad_norm": 7.923738479614258, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.868182897567749, + "num_tokens": 518102828.0, + "step": 13584 + }, + { + "epoch": 1.7281516346520798, + "ewc_loss": 0.06775087118148804, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003308290324639529, + "grad_norm": 7.856184005737305, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8740692138671875, + "num_tokens": 518146859.0, + "step": 13585 + }, + { + "epoch": 1.7282788449306703, + "ewc_loss": 0.06749362498521805, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003331393818370998, + "grad_norm": 7.916895389556885, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8695234656333923, + "num_tokens": 518186173.0, + "step": 13586 + }, + { + "epoch": 1.7284060552092608, + "ewc_loss": 0.06775900721549988, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033091037767007947, + "grad_norm": 7.86457633972168, + "learning_rate": 1e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8460870981216431, + "num_tokens": 518221903.0, + "step": 13587 + }, + { + "epoch": 1.7285332654878514, + "ewc_loss": 0.0675237700343132, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003334408102091402, + "grad_norm": 8.011486053466797, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.852184534072876, + "num_tokens": 518258048.0, + "step": 13588 + }, + { + "epoch": 1.7286604757664419, + "ewc_loss": 0.06714898347854614, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00032969299354590476, + "grad_norm": 7.847193241119385, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8539934158325195, + "num_tokens": 518299127.0, + "step": 13589 + }, + { + "epoch": 1.7287876860450324, + "ewc_loss": 0.06807848066091537, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033410510513931513, + "grad_norm": 8.004279136657715, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8726342916488647, + "num_tokens": 518336520.0, + "step": 13590 + }, + { + "epoch": 1.728914896323623, + "ewc_loss": 0.06763765960931778, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032969689345918596, + "grad_norm": 7.925123691558838, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8583550453186035, + "num_tokens": 518368484.0, + "step": 13591 + }, + { + "epoch": 1.7290421066022135, + "ewc_loss": 0.0679338276386261, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003326585574541241, + "grad_norm": 7.960103988647461, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8591631650924683, + "num_tokens": 518408439.0, + "step": 13592 + }, + { + "epoch": 1.729169316880804, + "ewc_loss": 0.06769124418497086, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033023275318555534, + "grad_norm": 7.94433069229126, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8416029810905457, + "num_tokens": 518447414.0, + "step": 13593 + }, + { + "epoch": 1.7292965271593945, + "ewc_loss": 0.06792224943637848, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003325428406242281, + "grad_norm": 7.960083961486816, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8571112751960754, + "num_tokens": 518486455.0, + "step": 13594 + }, + { + "epoch": 1.729423737437985, + "ewc_loss": 0.06769432127475739, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033026348683051765, + "grad_norm": 7.888777256011963, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8782473802566528, + "num_tokens": 518527023.0, + "step": 13595 + }, + { + "epoch": 1.7295509477165756, + "ewc_loss": 0.06784255057573318, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033174583222717047, + "grad_norm": 7.959120273590088, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8553997278213501, + "num_tokens": 518563285.0, + "step": 13596 + }, + { + "epoch": 1.729678157995166, + "ewc_loss": 0.06769789010286331, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003302992263343185, + "grad_norm": 7.907778263092041, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8623561859130859, + "num_tokens": 518607610.0, + "step": 13597 + }, + { + "epoch": 1.7298053682737566, + "ewc_loss": 0.06782038509845734, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003315241483505815, + "grad_norm": 7.937204360961914, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8714559674263, + "num_tokens": 518649434.0, + "step": 13598 + }, + { + "epoch": 1.7299325785523472, + "ewc_loss": 0.06775382906198502, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033085860195569694, + "grad_norm": 7.914804458618164, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8692257404327393, + "num_tokens": 518684508.0, + "step": 13599 + }, + { + "epoch": 1.7300597888309377, + "ewc_loss": 0.06784909963607788, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003318113158456981, + "grad_norm": 7.916926383972168, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8672794699668884, + "num_tokens": 518717883.0, + "step": 13600 + }, + { + "epoch": 1.730186999109528, + "ewc_loss": 0.06732690334320068, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003314721107017249, + "grad_norm": 7.882257461547852, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8675785660743713, + "num_tokens": 518756158.0, + "step": 13601 + }, + { + "epoch": 1.7303142093881185, + "ewc_loss": 0.06799735873937607, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003332938940729946, + "grad_norm": 8.15347671508789, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.861717939376831, + "num_tokens": 518793003.0, + "step": 13602 + }, + { + "epoch": 1.730441419666709, + "ewc_loss": 0.0674862265586853, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003281826211605221, + "grad_norm": 7.826088905334473, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8558328747749329, + "num_tokens": 518838485.0, + "step": 13603 + }, + { + "epoch": 1.7305686299452996, + "ewc_loss": 0.06764522939920425, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033465540036559105, + "grad_norm": 7.999837398529053, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8550834655761719, + "num_tokens": 518885936.0, + "step": 13604 + }, + { + "epoch": 1.7306958402238901, + "ewc_loss": 0.06713084131479263, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000329511531163007, + "grad_norm": 7.833184242248535, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8490359783172607, + "num_tokens": 518925684.0, + "step": 13605 + }, + { + "epoch": 1.7308230505024806, + "ewc_loss": 0.06776063144207001, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033580942545086145, + "grad_norm": 8.00475788116455, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8634806871414185, + "num_tokens": 518966438.0, + "step": 13606 + }, + { + "epoch": 1.730950260781071, + "ewc_loss": 0.06726591289043427, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033086223993450403, + "grad_norm": 7.796891689300537, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8646918535232544, + "num_tokens": 519013576.0, + "step": 13607 + }, + { + "epoch": 1.7310774710596615, + "ewc_loss": 0.06790201365947723, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003372232604306191, + "grad_norm": 8.050187110900879, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8586543798446655, + "num_tokens": 519045342.0, + "step": 13608 + }, + { + "epoch": 1.731204681338252, + "ewc_loss": 0.06734582781791687, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000331661372911185, + "grad_norm": 7.84548282623291, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8483065366744995, + "num_tokens": 519088566.0, + "step": 13609 + }, + { + "epoch": 1.7313318916168425, + "ewc_loss": 0.06789924949407578, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003371956117916852, + "grad_norm": 8.002106666564941, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8650194406509399, + "num_tokens": 519126516.0, + "step": 13610 + }, + { + "epoch": 1.731459101895433, + "ewc_loss": 0.06744492799043655, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033265238744206727, + "grad_norm": 7.830037593841553, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8706790208816528, + "num_tokens": 519165431.0, + "step": 13611 + }, + { + "epoch": 1.7315863121740236, + "ewc_loss": 0.06787393987178802, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033694252488203347, + "grad_norm": 8.097475051879883, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8778660297393799, + "num_tokens": 519200991.0, + "step": 13612 + }, + { + "epoch": 1.7317135224526141, + "ewc_loss": 0.06782159209251404, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033153625554405153, + "grad_norm": 8.036819458007812, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8687160015106201, + "num_tokens": 519245988.0, + "step": 13613 + }, + { + "epoch": 1.7318407327312046, + "ewc_loss": 0.06747886538505554, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003329917381051928, + "grad_norm": 7.919930458068848, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8671596646308899, + "num_tokens": 519282595.0, + "step": 13614 + }, + { + "epoch": 1.7319679430097952, + "ewc_loss": 0.06742937117815018, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.000332496827468276, + "grad_norm": 7.914321422576904, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8547364473342896, + "num_tokens": 519319061.0, + "step": 13615 + }, + { + "epoch": 1.7320951532883857, + "ewc_loss": 0.06741073727607727, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033231053384952247, + "grad_norm": 7.904386043548584, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8584786653518677, + "num_tokens": 519358617.0, + "step": 13616 + }, + { + "epoch": 1.7322223635669762, + "ewc_loss": 0.06804710626602173, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033379142405465245, + "grad_norm": 9.029084205627441, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8605930209159851, + "num_tokens": 519392461.0, + "step": 13617 + }, + { + "epoch": 1.7323495738455668, + "ewc_loss": 0.06723810732364655, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003257013449911028, + "grad_norm": 7.683701038360596, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8692148923873901, + "num_tokens": 519432893.0, + "step": 13618 + }, + { + "epoch": 1.7324767841241573, + "ewc_loss": 0.0689387246966362, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003475903649814427, + "grad_norm": 8.246820449829102, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8625595569610596, + "num_tokens": 519473445.0, + "step": 13619 + }, + { + "epoch": 1.7326039944027478, + "ewc_loss": 0.06674951314926147, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003256982017774135, + "grad_norm": 7.675911903381348, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8629867434501648, + "num_tokens": 519518883.0, + "step": 13620 + }, + { + "epoch": 1.7327312046813383, + "ewc_loss": 0.06905664503574371, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003487695939838886, + "grad_norm": 8.331869125366211, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8656376600265503, + "num_tokens": 519550123.0, + "step": 13621 + }, + { + "epoch": 1.7328584149599289, + "ewc_loss": 0.06722797453403473, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003304828715045005, + "grad_norm": 7.768800258636475, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8529712557792664, + "num_tokens": 519592538.0, + "step": 13622 + }, + { + "epoch": 1.7329856252385194, + "ewc_loss": 0.0687810406088829, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003460135485511273, + "grad_norm": 8.173563003540039, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8619428277015686, + "num_tokens": 519633466.0, + "step": 13623 + }, + { + "epoch": 1.73311283551711, + "ewc_loss": 0.0673864334821701, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003320674295537174, + "grad_norm": 7.814929485321045, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8582260608673096, + "num_tokens": 519672569.0, + "step": 13624 + }, + { + "epoch": 1.7332400457957005, + "ewc_loss": 0.0685555562376976, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003437587001826614, + "grad_norm": 8.17354679107666, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8681399822235107, + "num_tokens": 519707383.0, + "step": 13625 + }, + { + "epoch": 1.7333672560742908, + "ewc_loss": 0.06751781702041626, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033338129287585616, + "grad_norm": 7.873055458068848, + "learning_rate": 1e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8441010117530823, + "num_tokens": 519748187.0, + "step": 13626 + }, + { + "epoch": 1.7334944663528813, + "ewc_loss": 0.06875905394554138, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034091089037247, + "grad_norm": 9.20872688293457, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8597812056541443, + "num_tokens": 519790215.0, + "step": 13627 + }, + { + "epoch": 1.7336216766314718, + "ewc_loss": 0.06751127541065216, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033331583836115897, + "grad_norm": 7.786603927612305, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8532564640045166, + "num_tokens": 519827278.0, + "step": 13628 + }, + { + "epoch": 1.7337488869100623, + "ewc_loss": 0.06908811628818512, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00034908432280644774, + "grad_norm": 8.255292892456055, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8690925240516663, + "num_tokens": 519860216.0, + "step": 13629 + }, + { + "epoch": 1.7338760971886529, + "ewc_loss": 0.06734641641378403, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.00033166728098876774, + "grad_norm": 7.888800144195557, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8605263233184814, + "num_tokens": 519894442.0, + "step": 13630 + }, + { + "epoch": 1.7340033074672434, + "ewc_loss": 0.06852851808071136, + "ewc_loss_diag": 3.409385681152344e-05, + "ewc_loss_parallel": 0.0003434882964938879, + "grad_norm": 8.156238555908203, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8614283800125122, + "num_tokens": 519932524.0, + "step": 13631 + }, + { + "epoch": 1.7341305177458337, + "ewc_loss": 0.06818296015262604, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003351499035488814, + "grad_norm": 7.915552616119385, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8599357008934021, + "num_tokens": 519974366.0, + "step": 13632 + }, + { + "epoch": 1.7342577280244242, + "ewc_loss": 0.06852410733699799, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003385614254511893, + "grad_norm": 8.0375337600708, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8716346025466919, + "num_tokens": 520013983.0, + "step": 13633 + }, + { + "epoch": 1.7343849383030148, + "ewc_loss": 0.06829537451267242, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033627409720793366, + "grad_norm": 8.002740859985352, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8647897243499756, + "num_tokens": 520047671.0, + "step": 13634 + }, + { + "epoch": 1.7345121485816053, + "ewc_loss": 0.06819559633731842, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033527633058838546, + "grad_norm": 7.957552909851074, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8577553629875183, + "num_tokens": 520086574.0, + "step": 13635 + }, + { + "epoch": 1.7346393588601958, + "ewc_loss": 0.06841005384922028, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003374209045432508, + "grad_norm": 7.978569507598877, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8475194573402405, + "num_tokens": 520130613.0, + "step": 13636 + }, + { + "epoch": 1.7347665691387864, + "ewc_loss": 0.06812354922294617, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033455577795393765, + "grad_norm": 7.994806289672852, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8797190189361572, + "num_tokens": 520163763.0, + "step": 13637 + }, + { + "epoch": 1.7348937794173769, + "ewc_loss": 0.0682511180639267, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003358314570505172, + "grad_norm": 8.0093355178833, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8587526082992554, + "num_tokens": 520198663.0, + "step": 13638 + }, + { + "epoch": 1.7350209896959674, + "ewc_loss": 0.06813707202672958, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003346910234540701, + "grad_norm": 8.00648307800293, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8654130697250366, + "num_tokens": 520231737.0, + "step": 13639 + }, + { + "epoch": 1.735148199974558, + "ewc_loss": 0.06812787801027298, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033459908445365727, + "grad_norm": 7.891668319702148, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8664904832839966, + "num_tokens": 520275664.0, + "step": 13640 + }, + { + "epoch": 1.7352754102531485, + "ewc_loss": 0.06825503706932068, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003358706599101424, + "grad_norm": 7.991216659545898, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.851212739944458, + "num_tokens": 520316100.0, + "step": 13641 + }, + { + "epoch": 1.735402620531739, + "ewc_loss": 0.0679992288351059, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003333125787321478, + "grad_norm": 7.999340534210205, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8589190244674683, + "num_tokens": 520348167.0, + "step": 13642 + }, + { + "epoch": 1.7355298308103295, + "ewc_loss": 0.0681086927652359, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000334407202899456, + "grad_norm": 7.934449672698975, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8480833768844604, + "num_tokens": 520385234.0, + "step": 13643 + }, + { + "epoch": 1.73565704108892, + "ewc_loss": 0.06814675033092499, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033478785189799964, + "grad_norm": 7.902986526489258, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8450927734375, + "num_tokens": 520423051.0, + "step": 13644 + }, + { + "epoch": 1.7357842513675106, + "ewc_loss": 0.06814125180244446, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003347328747622669, + "grad_norm": 7.888685703277588, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8530176877975464, + "num_tokens": 520467202.0, + "step": 13645 + }, + { + "epoch": 1.735911461646101, + "ewc_loss": 0.06817461550235748, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003350664919707924, + "grad_norm": 7.916263580322266, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8598959445953369, + "num_tokens": 520501434.0, + "step": 13646 + }, + { + "epoch": 1.7360386719246916, + "ewc_loss": 0.06813859939575195, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033470633206889033, + "grad_norm": 7.9440131187438965, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8579628467559814, + "num_tokens": 520542757.0, + "step": 13647 + }, + { + "epoch": 1.7361658822032822, + "ewc_loss": 0.06819356977939606, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003352560452185571, + "grad_norm": 7.939028739929199, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.851604163646698, + "num_tokens": 520585216.0, + "step": 13648 + }, + { + "epoch": 1.7362930924818727, + "ewc_loss": 0.06807580590248108, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003340783587191254, + "grad_norm": 7.900577545166016, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8621535301208496, + "num_tokens": 520620066.0, + "step": 13649 + }, + { + "epoch": 1.736420302760463, + "ewc_loss": 0.06815040111541748, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003348242898937315, + "grad_norm": 7.925028324127197, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8681952953338623, + "num_tokens": 520660917.0, + "step": 13650 + }, + { + "epoch": 1.7365475130390535, + "ewc_loss": 0.06811212003231049, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003344414581079036, + "grad_norm": 7.910938739776611, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8527023792266846, + "num_tokens": 520704255.0, + "step": 13651 + }, + { + "epoch": 1.736674723317644, + "ewc_loss": 0.06824515014886856, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033577182330191135, + "grad_norm": 7.958189964294434, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8482199311256409, + "num_tokens": 520743036.0, + "step": 13652 + }, + { + "epoch": 1.7368019335962346, + "ewc_loss": 0.06808655709028244, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000334185897372663, + "grad_norm": 7.945366859436035, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.854487419128418, + "num_tokens": 520778325.0, + "step": 13653 + }, + { + "epoch": 1.736929143874825, + "ewc_loss": 0.06823792308568954, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003356995584908873, + "grad_norm": 7.965155601501465, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8746799230575562, + "num_tokens": 520815226.0, + "step": 13654 + }, + { + "epoch": 1.7370563541534156, + "ewc_loss": 0.06816999614238739, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033502024598419666, + "grad_norm": 8.074446678161621, + "learning_rate": 1e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8411053419113159, + "num_tokens": 520856606.0, + "step": 13655 + }, + { + "epoch": 1.737183564432006, + "ewc_loss": 0.06811736524105072, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003344939323142171, + "grad_norm": 7.982696056365967, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.852529764175415, + "num_tokens": 520893493.0, + "step": 13656 + }, + { + "epoch": 1.7373107747105965, + "ewc_loss": 0.06815468519926071, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033486715983599424, + "grad_norm": 7.936376094818115, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8681957721710205, + "num_tokens": 520930391.0, + "step": 13657 + }, + { + "epoch": 1.737437984989187, + "ewc_loss": 0.0681258887052536, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003345792356412858, + "grad_norm": 8.001684188842773, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8620024919509888, + "num_tokens": 520970512.0, + "step": 13658 + }, + { + "epoch": 1.7375651952677775, + "ewc_loss": 0.06799712777137756, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033329156576655805, + "grad_norm": 7.915589332580566, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8534813523292542, + "num_tokens": 521011787.0, + "step": 13659 + }, + { + "epoch": 1.737692405546368, + "ewc_loss": 0.06823718547821045, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003356921370141208, + "grad_norm": 7.984888553619385, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8640544414520264, + "num_tokens": 521053494.0, + "step": 13660 + }, + { + "epoch": 1.7378196158249586, + "ewc_loss": 0.06801630556583405, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003334833600092679, + "grad_norm": 7.942476749420166, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8498942852020264, + "num_tokens": 521089291.0, + "step": 13661 + }, + { + "epoch": 1.7379468261035491, + "ewc_loss": 0.06816136091947556, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000334933924023062, + "grad_norm": 7.893096446990967, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8736751079559326, + "num_tokens": 521128910.0, + "step": 13662 + }, + { + "epoch": 1.7380740363821396, + "ewc_loss": 0.06813442707061768, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033466462627984583, + "grad_norm": 7.934262752532959, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8569639325141907, + "num_tokens": 521168175.0, + "step": 13663 + }, + { + "epoch": 1.7382012466607302, + "ewc_loss": 0.06810936331748962, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033441392588429153, + "grad_norm": 7.9619903564453125, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8587503433227539, + "num_tokens": 521208020.0, + "step": 13664 + }, + { + "epoch": 1.7383284569393207, + "ewc_loss": 0.06805485486984253, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003338688693474978, + "grad_norm": 7.897389888763428, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8726791739463806, + "num_tokens": 521249828.0, + "step": 13665 + }, + { + "epoch": 1.7384556672179112, + "ewc_loss": 0.06812888383865356, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033460918348282576, + "grad_norm": 7.883823394775391, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8645243644714355, + "num_tokens": 521289940.0, + "step": 13666 + }, + { + "epoch": 1.7385828774965018, + "ewc_loss": 0.0680706724524498, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033402704866603017, + "grad_norm": 7.910036563873291, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8803671002388, + "num_tokens": 521327426.0, + "step": 13667 + }, + { + "epoch": 1.7387100877750923, + "ewc_loss": 0.06819376349449158, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033525796607136726, + "grad_norm": 7.9303765296936035, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8524429202079773, + "num_tokens": 521372112.0, + "step": 13668 + }, + { + "epoch": 1.7388372980536828, + "ewc_loss": 0.06813229620456696, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003346432640682906, + "grad_norm": 7.964221000671387, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.861223578453064, + "num_tokens": 521410842.0, + "step": 13669 + }, + { + "epoch": 1.7389645083322733, + "ewc_loss": 0.06806036084890366, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033392393379472196, + "grad_norm": 7.888413906097412, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8709644079208374, + "num_tokens": 521446854.0, + "step": 13670 + }, + { + "epoch": 1.7390917186108639, + "ewc_loss": 0.06830087304115295, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000336329045239836, + "grad_norm": 7.997694492340088, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8517192602157593, + "num_tokens": 521486284.0, + "step": 13671 + }, + { + "epoch": 1.7392189288894544, + "ewc_loss": 0.06804423779249191, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033376269857399166, + "grad_norm": 7.968803882598877, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8484683036804199, + "num_tokens": 521522716.0, + "step": 13672 + }, + { + "epoch": 1.739346139168045, + "ewc_loss": 0.06819018721580505, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003352222265675664, + "grad_norm": 8.018110275268555, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8564177751541138, + "num_tokens": 521551549.0, + "step": 13673 + }, + { + "epoch": 1.7394733494466355, + "ewc_loss": 0.06801086664199829, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033342899405397475, + "grad_norm": 7.89274787902832, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8644769787788391, + "num_tokens": 521589552.0, + "step": 13674 + }, + { + "epoch": 1.7396005597252258, + "ewc_loss": 0.06809277832508087, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033424815046601, + "grad_norm": 7.918868541717529, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8730760812759399, + "num_tokens": 521627906.0, + "step": 13675 + }, + { + "epoch": 1.7397277700038163, + "ewc_loss": 0.06804545968770981, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003337749221827835, + "grad_norm": 7.9638566970825195, + "learning_rate": 1e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8453569412231445, + "num_tokens": 521666615.0, + "step": 13676 + }, + { + "epoch": 1.7398549802824068, + "ewc_loss": 0.06802297383546829, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003335500368848443, + "grad_norm": 7.938999176025391, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8588700294494629, + "num_tokens": 521707742.0, + "step": 13677 + }, + { + "epoch": 1.7399821905609973, + "ewc_loss": 0.06796564161777496, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000332976778736338, + "grad_norm": 7.9506306648254395, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8505094647407532, + "num_tokens": 521746848.0, + "step": 13678 + }, + { + "epoch": 1.7401094008395879, + "ewc_loss": 0.06802709400653839, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033359130611643195, + "grad_norm": 7.917013168334961, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8619868755340576, + "num_tokens": 521782917.0, + "step": 13679 + }, + { + "epoch": 1.7402366111181784, + "ewc_loss": 0.0681806206703186, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003351265622768551, + "grad_norm": 7.96044397354126, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8591096997261047, + "num_tokens": 521825609.0, + "step": 13680 + }, + { + "epoch": 1.7403638213967687, + "ewc_loss": 0.06800410896539688, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003333614149596542, + "grad_norm": 7.878824234008789, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8748502135276794, + "num_tokens": 521862277.0, + "step": 13681 + }, + { + "epoch": 1.7404910316753592, + "ewc_loss": 0.06824605166912079, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003357808745931834, + "grad_norm": 8.030501365661621, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8795082569122314, + "num_tokens": 521894951.0, + "step": 13682 + }, + { + "epoch": 1.7406182419539498, + "ewc_loss": 0.06784556061029434, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033177592558786273, + "grad_norm": 7.858998775482178, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.861459493637085, + "num_tokens": 521932158.0, + "step": 13683 + }, + { + "epoch": 1.7407454522325403, + "ewc_loss": 0.06845077872276306, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003378280671313405, + "grad_norm": 8.014233589172363, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8670939207077026, + "num_tokens": 521971050.0, + "step": 13684 + }, + { + "epoch": 1.7408726625111308, + "ewc_loss": 0.06780947744846344, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003314151254016906, + "grad_norm": 7.9738874435424805, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8710623383522034, + "num_tokens": 522007617.0, + "step": 13685 + }, + { + "epoch": 1.7409998727897213, + "ewc_loss": 0.06817777454853058, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003350980405230075, + "grad_norm": 7.977248668670654, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8497675061225891, + "num_tokens": 522053484.0, + "step": 13686 + }, + { + "epoch": 1.7411270830683119, + "ewc_loss": 0.06798841059207916, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003332043706905097, + "grad_norm": 7.9221110343933105, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8568905591964722, + "num_tokens": 522088047.0, + "step": 13687 + }, + { + "epoch": 1.7412542933469024, + "ewc_loss": 0.06812496483325958, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000334570009727031, + "grad_norm": 7.952192306518555, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8606365919113159, + "num_tokens": 522129865.0, + "step": 13688 + }, + { + "epoch": 1.741381503625493, + "ewc_loss": 0.0680990219116211, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033431051997467875, + "grad_norm": 7.944448471069336, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8569662570953369, + "num_tokens": 522170154.0, + "step": 13689 + }, + { + "epoch": 1.7415087139040835, + "ewc_loss": 0.06869517266750336, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033538922434672713, + "grad_norm": 7.988913536071777, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8859010934829712, + "num_tokens": 522204586.0, + "step": 13690 + }, + { + "epoch": 1.741635924182674, + "ewc_loss": 0.06809365004301071, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003342567943036556, + "grad_norm": 7.97291374206543, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8658039569854736, + "num_tokens": 522241942.0, + "step": 13691 + }, + { + "epoch": 1.7417631344612645, + "ewc_loss": 0.0681314468383789, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003346347948536277, + "grad_norm": 7.971748352050781, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8584681749343872, + "num_tokens": 522275800.0, + "step": 13692 + }, + { + "epoch": 1.741890344739855, + "ewc_loss": 0.0680968165397644, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000334288488375023, + "grad_norm": 7.932109355926514, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8549716472625732, + "num_tokens": 522317024.0, + "step": 13693 + }, + { + "epoch": 1.7420175550184456, + "ewc_loss": 0.06804642826318741, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003337846137583256, + "grad_norm": 7.888994216918945, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.872455894947052, + "num_tokens": 522358510.0, + "step": 13694 + }, + { + "epoch": 1.742144765297036, + "ewc_loss": 0.06824427098035812, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003357630339451134, + "grad_norm": 7.996611595153809, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8543069362640381, + "num_tokens": 522395960.0, + "step": 13695 + }, + { + "epoch": 1.7422719755756266, + "ewc_loss": 0.06813464313745499, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003346667508594692, + "grad_norm": 7.935056686401367, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8701031804084778, + "num_tokens": 522428807.0, + "step": 13696 + }, + { + "epoch": 1.7423991858542172, + "ewc_loss": 0.06870394945144653, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033547694329172373, + "grad_norm": 7.973109722137451, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8620776534080505, + "num_tokens": 522464472.0, + "step": 13697 + }, + { + "epoch": 1.7425263961328077, + "ewc_loss": 0.06817597150802612, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033508005435578525, + "grad_norm": 7.910399436950684, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8757222890853882, + "num_tokens": 522509522.0, + "step": 13698 + }, + { + "epoch": 1.742653606411398, + "ewc_loss": 0.06837907433509827, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033711109426803887, + "grad_norm": 8.006246566772461, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8462462425231934, + "num_tokens": 522545581.0, + "step": 13699 + }, + { + "epoch": 1.7427808166899885, + "ewc_loss": 0.06800161302089691, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033333644387312233, + "grad_norm": 7.929975986480713, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8647761344909668, + "num_tokens": 522582072.0, + "step": 13700 + }, + { + "epoch": 1.742908026968579, + "ewc_loss": 0.06835043430328369, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003368246543686837, + "grad_norm": 8.012909889221191, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8650493025779724, + "num_tokens": 522618183.0, + "step": 13701 + }, + { + "epoch": 1.7430352372471696, + "ewc_loss": 0.0681169331073761, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033448959584347904, + "grad_norm": 7.873517036437988, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8513628244400024, + "num_tokens": 522660578.0, + "step": 13702 + }, + { + "epoch": 1.74316244752576, + "ewc_loss": 0.06834933161735535, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033681365312077105, + "grad_norm": 7.954216003417969, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8735915422439575, + "num_tokens": 522701934.0, + "step": 13703 + }, + { + "epoch": 1.7432896578043506, + "ewc_loss": 0.06812804937362671, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003346008015796542, + "grad_norm": 7.8865485191345215, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8799912333488464, + "num_tokens": 522737404.0, + "step": 13704 + }, + { + "epoch": 1.743416868082941, + "ewc_loss": 0.06853939592838287, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003387142496649176, + "grad_norm": 8.04779052734375, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8568044900894165, + "num_tokens": 522774188.0, + "step": 13705 + }, + { + "epoch": 1.7435440783615315, + "ewc_loss": 0.0681300014257431, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033462035935372114, + "grad_norm": 7.931306838989258, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8710125684738159, + "num_tokens": 522810467.0, + "step": 13706 + }, + { + "epoch": 1.743671288640122, + "ewc_loss": 0.068403460085392, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003373549261596054, + "grad_norm": 8.004547119140625, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8497172594070435, + "num_tokens": 522845246.0, + "step": 13707 + }, + { + "epoch": 1.7437984989187125, + "ewc_loss": 0.06815437227487564, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003348640457261354, + "grad_norm": 7.986642360687256, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8721905946731567, + "num_tokens": 522877150.0, + "step": 13708 + }, + { + "epoch": 1.743925709197303, + "ewc_loss": 0.06826020777225494, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033592243562452495, + "grad_norm": 8.054709434509277, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.876638650894165, + "num_tokens": 522913161.0, + "step": 13709 + }, + { + "epoch": 1.7440529194758936, + "ewc_loss": 0.06799952685832977, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033331557642668486, + "grad_norm": 7.926726818084717, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.848465085029602, + "num_tokens": 522953464.0, + "step": 13710 + }, + { + "epoch": 1.744180129754484, + "ewc_loss": 0.0682569071650505, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033588940277695656, + "grad_norm": 7.973668575286865, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.866696834564209, + "num_tokens": 522996874.0, + "step": 13711 + }, + { + "epoch": 1.7443073400330746, + "ewc_loss": 0.06855780631303787, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033401555265299976, + "grad_norm": 13.527118682861328, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8645411729812622, + "num_tokens": 523031349.0, + "step": 13712 + }, + { + "epoch": 1.7444345503116652, + "ewc_loss": 0.0757681131362915, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00041100141243077815, + "grad_norm": 8.759726524353027, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8547343015670776, + "num_tokens": 523068910.0, + "step": 13713 + }, + { + "epoch": 1.7445617605902557, + "ewc_loss": 0.06844593584537506, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033777972566895187, + "grad_norm": 8.093668937683105, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8764994144439697, + "num_tokens": 523106179.0, + "step": 13714 + }, + { + "epoch": 1.7446889708688462, + "ewc_loss": 0.06901376694440842, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034345799940638244, + "grad_norm": 8.215763092041016, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8641880750656128, + "num_tokens": 523140827.0, + "step": 13715 + }, + { + "epoch": 1.7448161811474368, + "ewc_loss": 0.06927307695150375, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034605106338858604, + "grad_norm": 8.058012962341309, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8755139708518982, + "num_tokens": 523182880.0, + "step": 13716 + }, + { + "epoch": 1.7449433914260273, + "ewc_loss": 0.06868050992488861, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034012540709227324, + "grad_norm": 8.202152252197266, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8653925657272339, + "num_tokens": 523213059.0, + "step": 13717 + }, + { + "epoch": 1.7450706017046178, + "ewc_loss": 0.06853263080120087, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003386466414667666, + "grad_norm": 8.047586441040039, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8604146242141724, + "num_tokens": 523254398.0, + "step": 13718 + }, + { + "epoch": 1.7451978119832083, + "ewc_loss": 0.06872246414422989, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003405449679121375, + "grad_norm": 8.094002723693848, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8744747638702393, + "num_tokens": 523293325.0, + "step": 13719 + }, + { + "epoch": 1.7453250222617989, + "ewc_loss": 0.06825847178697586, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003359050315339118, + "grad_norm": 7.941596508026123, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8517643809318542, + "num_tokens": 523334402.0, + "step": 13720 + }, + { + "epoch": 1.7454522325403894, + "ewc_loss": 0.06857015192508698, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003390217898413539, + "grad_norm": 8.102378845214844, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8664872050285339, + "num_tokens": 523376238.0, + "step": 13721 + }, + { + "epoch": 1.74557944281898, + "ewc_loss": 0.06820252537727356, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003353455394972116, + "grad_norm": 7.933634281158447, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8844060897827148, + "num_tokens": 523417074.0, + "step": 13722 + }, + { + "epoch": 1.7457066530975704, + "ewc_loss": 0.06864983588457108, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003398186818230897, + "grad_norm": 8.013648986816406, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8701821565628052, + "num_tokens": 523457984.0, + "step": 13723 + }, + { + "epoch": 1.7458338633761608, + "ewc_loss": 0.0682205781340599, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033552609966136515, + "grad_norm": 7.991035461425781, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8579466938972473, + "num_tokens": 523490071.0, + "step": 13724 + }, + { + "epoch": 1.7459610736547513, + "ewc_loss": 0.06848150491714478, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003381354035809636, + "grad_norm": 8.015053749084473, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8565786480903625, + "num_tokens": 523530105.0, + "step": 13725 + }, + { + "epoch": 1.7460882839333418, + "ewc_loss": 0.06829094886779785, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033622983028180897, + "grad_norm": 7.9378662109375, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8767576217651367, + "num_tokens": 523576698.0, + "step": 13726 + }, + { + "epoch": 1.7462154942119323, + "ewc_loss": 0.06833188235759735, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000336639117449522, + "grad_norm": 8.013850212097168, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.863714873790741, + "num_tokens": 523614387.0, + "step": 13727 + }, + { + "epoch": 1.7463427044905229, + "ewc_loss": 0.06821297854185104, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003354501095600426, + "grad_norm": 8.019695281982422, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8581381440162659, + "num_tokens": 523651149.0, + "step": 13728 + }, + { + "epoch": 1.7464699147691134, + "ewc_loss": 0.06812258064746857, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000334546115482226, + "grad_norm": 7.9893975257873535, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8754466772079468, + "num_tokens": 523688448.0, + "step": 13729 + }, + { + "epoch": 1.7465971250477037, + "ewc_loss": 0.06829176843166351, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033623797935433686, + "grad_norm": 8.016435623168945, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8664889335632324, + "num_tokens": 523725805.0, + "step": 13730 + }, + { + "epoch": 1.7467243353262942, + "ewc_loss": 0.0680791437625885, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003341117990203202, + "grad_norm": 7.991450309753418, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8625431060791016, + "num_tokens": 523763564.0, + "step": 13731 + }, + { + "epoch": 1.7468515456048848, + "ewc_loss": 0.068174809217453, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033506835461594164, + "grad_norm": 7.9741339683532715, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8582591414451599, + "num_tokens": 523804422.0, + "step": 13732 + }, + { + "epoch": 1.7469787558834753, + "ewc_loss": 0.06818310171365738, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003351513296365738, + "grad_norm": 8.033084869384766, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8721185326576233, + "num_tokens": 523845889.0, + "step": 13733 + }, + { + "epoch": 1.7471059661620658, + "ewc_loss": 0.06811273097991943, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033444762811996043, + "grad_norm": 7.989363193511963, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8737711906433105, + "num_tokens": 523883623.0, + "step": 13734 + }, + { + "epoch": 1.7472331764406563, + "ewc_loss": 0.06813094764947891, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003346297889947891, + "grad_norm": 8.051589012145996, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.868992805480957, + "num_tokens": 523917273.0, + "step": 13735 + }, + { + "epoch": 1.7473603867192469, + "ewc_loss": 0.06805124878883362, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033383278059773147, + "grad_norm": 7.961607933044434, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8598847389221191, + "num_tokens": 523956149.0, + "step": 13736 + }, + { + "epoch": 1.7474875969978374, + "ewc_loss": 0.06815202534198761, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033484052983112633, + "grad_norm": 7.951257705688477, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8598556518554688, + "num_tokens": 523997229.0, + "step": 13737 + }, + { + "epoch": 1.747614807276428, + "ewc_loss": 0.06806646287441254, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003339849354233593, + "grad_norm": 7.967581272125244, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8679007887840271, + "num_tokens": 524032921.0, + "step": 13738 + }, + { + "epoch": 1.7477420175550185, + "ewc_loss": 0.06822612881660461, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003355816297698766, + "grad_norm": 8.061031341552734, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8649052381515503, + "num_tokens": 524065016.0, + "step": 13739 + }, + { + "epoch": 1.747869227833609, + "ewc_loss": 0.06796035915613174, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033292389707639813, + "grad_norm": 7.9434027671813965, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8512492179870605, + "num_tokens": 524107001.0, + "step": 13740 + }, + { + "epoch": 1.7479964381121995, + "ewc_loss": 0.06822952628135681, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033561556483618915, + "grad_norm": 8.07083797454834, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8575664758682251, + "num_tokens": 524149283.0, + "step": 13741 + }, + { + "epoch": 1.74812364839079, + "ewc_loss": 0.06801880896091461, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033350844751112163, + "grad_norm": 7.928232192993164, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8621677160263062, + "num_tokens": 524187240.0, + "step": 13742 + }, + { + "epoch": 1.7482508586693806, + "ewc_loss": 0.068269744515419, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003360177215654403, + "grad_norm": 8.04213809967041, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8720390796661377, + "num_tokens": 524216793.0, + "step": 13743 + }, + { + "epoch": 1.748378068947971, + "ewc_loss": 0.06795742362737656, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003328945313114673, + "grad_norm": 7.950004577636719, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8680921792984009, + "num_tokens": 524257629.0, + "step": 13744 + }, + { + "epoch": 1.7485052792265616, + "ewc_loss": 0.06841367483139038, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033745705150067806, + "grad_norm": 8.043861389160156, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8730068206787109, + "num_tokens": 524291090.0, + "step": 13745 + }, + { + "epoch": 1.7486324895051522, + "ewc_loss": 0.06796862185001373, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033300649374723434, + "grad_norm": 7.915101528167725, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8758898973464966, + "num_tokens": 524324882.0, + "step": 13746 + }, + { + "epoch": 1.7487596997837427, + "ewc_loss": 0.06838870048522949, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033720736973918974, + "grad_norm": 8.092147827148438, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8460281491279602, + "num_tokens": 524359422.0, + "step": 13747 + }, + { + "epoch": 1.748886910062333, + "ewc_loss": 0.0680418312549591, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033373862970620394, + "grad_norm": 7.9817328453063965, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8685160875320435, + "num_tokens": 524398383.0, + "step": 13748 + }, + { + "epoch": 1.7490141203409235, + "ewc_loss": 0.0683218389749527, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003365387092344463, + "grad_norm": 8.01023006439209, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8597181439399719, + "num_tokens": 524437984.0, + "step": 13749 + }, + { + "epoch": 1.749141330619514, + "ewc_loss": 0.06814990937709808, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003348194295540452, + "grad_norm": 8.060179710388184, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8679497241973877, + "num_tokens": 524471324.0, + "step": 13750 + }, + { + "epoch": 1.7492685408981046, + "ewc_loss": 0.06814448535442352, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033476512180641294, + "grad_norm": 7.973801612854004, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8538169860839844, + "num_tokens": 524511471.0, + "step": 13751 + }, + { + "epoch": 1.749395751176695, + "ewc_loss": 0.06826120615005493, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003359323600307107, + "grad_norm": 8.051359176635742, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8654739856719971, + "num_tokens": 524541743.0, + "step": 13752 + }, + { + "epoch": 1.7495229614552856, + "ewc_loss": 0.06801867485046387, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003335071087349206, + "grad_norm": 7.956137180328369, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8633531928062439, + "num_tokens": 524581470.0, + "step": 13753 + }, + { + "epoch": 1.749650171733876, + "ewc_loss": 0.06828809529542923, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033620125032030046, + "grad_norm": 8.003279685974121, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8635052442550659, + "num_tokens": 524612722.0, + "step": 13754 + }, + { + "epoch": 1.7497773820124665, + "ewc_loss": 0.06815239787101746, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003348443133290857, + "grad_norm": 7.965972423553467, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8710959553718567, + "num_tokens": 524647640.0, + "step": 13755 + }, + { + "epoch": 1.749904592291057, + "ewc_loss": 0.068316251039505, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033648282987996936, + "grad_norm": 8.109136581420898, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8567054867744446, + "num_tokens": 524682366.0, + "step": 13756 + }, + { + "epoch": 1.7500318025696475, + "ewc_loss": 0.06789734959602356, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033229379914700985, + "grad_norm": 7.970605850219727, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8629010915756226, + "num_tokens": 524724242.0, + "step": 13757 + }, + { + "epoch": 1.750159012848238, + "ewc_loss": 0.06829903274774551, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003363106516189873, + "grad_norm": 8.090508460998535, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.849125862121582, + "num_tokens": 524757373.0, + "step": 13758 + }, + { + "epoch": 1.7502862231268286, + "ewc_loss": 0.06790791451931, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003323994460515678, + "grad_norm": 7.9601263999938965, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8522981405258179, + "num_tokens": 524797704.0, + "step": 13759 + }, + { + "epoch": 1.750413433405419, + "ewc_loss": 0.06819648295640945, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033528514904901385, + "grad_norm": 7.9799604415893555, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8654232621192932, + "num_tokens": 524836710.0, + "step": 13760 + }, + { + "epoch": 1.7505406436840096, + "ewc_loss": 0.06801652908325195, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033348563010804355, + "grad_norm": 7.923436641693115, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8602663278579712, + "num_tokens": 524870224.0, + "step": 13761 + }, + { + "epoch": 1.7506678539626002, + "ewc_loss": 0.06827504932880402, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033607083605602384, + "grad_norm": 8.043304443359375, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8663219809532166, + "num_tokens": 524905345.0, + "step": 13762 + }, + { + "epoch": 1.7507950642411907, + "ewc_loss": 0.0679856389760971, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003331767220515758, + "grad_norm": 7.932528018951416, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8468301296234131, + "num_tokens": 524943893.0, + "step": 13763 + }, + { + "epoch": 1.7509222745197812, + "ewc_loss": 0.06829814612865448, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003363018040545285, + "grad_norm": 7.964282512664795, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8606802225112915, + "num_tokens": 524982924.0, + "step": 13764 + }, + { + "epoch": 1.7510494847983717, + "ewc_loss": 0.06810206174850464, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003343409625813365, + "grad_norm": 8.0156888961792, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8611024618148804, + "num_tokens": 525016749.0, + "step": 13765 + }, + { + "epoch": 1.7511766950769623, + "ewc_loss": 0.06810063123703003, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003343266434967518, + "grad_norm": 7.962557792663574, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8657047748565674, + "num_tokens": 525049178.0, + "step": 13766 + }, + { + "epoch": 1.7513039053555528, + "ewc_loss": 0.06831282377243042, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033644860377535224, + "grad_norm": 8.03620433807373, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8479442596435547, + "num_tokens": 525083456.0, + "step": 13767 + }, + { + "epoch": 1.7514311156341433, + "ewc_loss": 0.06803379952907562, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033365836134180427, + "grad_norm": 7.914229869842529, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8662394881248474, + "num_tokens": 525120159.0, + "step": 13768 + }, + { + "epoch": 1.7515583259127339, + "ewc_loss": 0.06832675635814667, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003365879238117486, + "grad_norm": 7.995608806610107, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8671959042549133, + "num_tokens": 525159174.0, + "step": 13769 + }, + { + "epoch": 1.7516855361913244, + "ewc_loss": 0.06817510724067688, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003350714105181396, + "grad_norm": 7.946657657623291, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8612828850746155, + "num_tokens": 525198373.0, + "step": 13770 + }, + { + "epoch": 1.751812746469915, + "ewc_loss": 0.06822916865348816, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033561198506504297, + "grad_norm": 7.938869953155518, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8690053820610046, + "num_tokens": 525240618.0, + "step": 13771 + }, + { + "epoch": 1.7519399567485054, + "ewc_loss": 0.06867614388465881, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000335198943503201, + "grad_norm": 7.962813377380371, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8785767555236816, + "num_tokens": 525274899.0, + "step": 13772 + }, + { + "epoch": 1.7520671670270958, + "ewc_loss": 0.06811368465423584, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003344572032801807, + "grad_norm": 7.9638471603393555, + "learning_rate": 1e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.8419812917709351, + "num_tokens": 525310144.0, + "step": 13773 + }, + { + "epoch": 1.7521943773056863, + "ewc_loss": 0.06815223395824432, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033484268351458013, + "grad_norm": 7.957244396209717, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8717966675758362, + "num_tokens": 525348565.0, + "step": 13774 + }, + { + "epoch": 1.7523215875842768, + "ewc_loss": 0.06866830587387085, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000335120566887781, + "grad_norm": 7.9915690422058105, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8694930076599121, + "num_tokens": 525383709.0, + "step": 13775 + }, + { + "epoch": 1.7524487978628673, + "ewc_loss": 0.06862461566925049, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000334683689288795, + "grad_norm": 7.9006123542785645, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8571640849113464, + "num_tokens": 525423983.0, + "step": 13776 + }, + { + "epoch": 1.7525760081414579, + "ewc_loss": 0.06878965348005295, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033633405109867454, + "grad_norm": 8.001405715942383, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8693684339523315, + "num_tokens": 525457357.0, + "step": 13777 + }, + { + "epoch": 1.7527032184200484, + "ewc_loss": 0.06808564066886902, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003341767005622387, + "grad_norm": 7.895598411560059, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8609554767608643, + "num_tokens": 525497021.0, + "step": 13778 + }, + { + "epoch": 1.7528304286986387, + "ewc_loss": 0.06848268955945969, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033814721973612905, + "grad_norm": 7.979795932769775, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8787080645561218, + "num_tokens": 525533592.0, + "step": 13779 + }, + { + "epoch": 1.7529576389772292, + "ewc_loss": 0.06808462738990784, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003341665433254093, + "grad_norm": 7.916224002838135, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8551379442214966, + "num_tokens": 525572747.0, + "step": 13780 + }, + { + "epoch": 1.7530848492558198, + "ewc_loss": 0.06834906339645386, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003368109464645386, + "grad_norm": 7.98922872543335, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8682752251625061, + "num_tokens": 525613578.0, + "step": 13781 + }, + { + "epoch": 1.7532120595344103, + "ewc_loss": 0.06863449513912201, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033478240948170424, + "grad_norm": 7.9074506759643555, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8695825338363647, + "num_tokens": 525655047.0, + "step": 13782 + }, + { + "epoch": 1.7533392698130008, + "ewc_loss": 0.06830623745918274, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003363827127031982, + "grad_norm": 8.028910636901855, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8598291277885437, + "num_tokens": 525689561.0, + "step": 13783 + }, + { + "epoch": 1.7534664800915913, + "ewc_loss": 0.0680931806564331, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033425213769078255, + "grad_norm": 7.89022970199585, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8603804111480713, + "num_tokens": 525726946.0, + "step": 13784 + }, + { + "epoch": 1.7535936903701819, + "ewc_loss": 0.06849440932273865, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033826444996520877, + "grad_norm": 8.084152221679688, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8546556234359741, + "num_tokens": 525762089.0, + "step": 13785 + }, + { + "epoch": 1.7537209006487724, + "ewc_loss": 0.06798883527517319, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003332086780574173, + "grad_norm": 7.904906749725342, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8679696917533875, + "num_tokens": 525796081.0, + "step": 13786 + }, + { + "epoch": 1.753848110927363, + "ewc_loss": 0.06846390664577484, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033795935451053083, + "grad_norm": 8.092653274536133, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8523939251899719, + "num_tokens": 525829945.0, + "step": 13787 + }, + { + "epoch": 1.7539753212059535, + "ewc_loss": 0.06857751309871674, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033421264379285276, + "grad_norm": 7.911595821380615, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8720373511314392, + "num_tokens": 525871289.0, + "step": 13788 + }, + { + "epoch": 1.754102531484544, + "ewc_loss": 0.06884714961051941, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033690902637317777, + "grad_norm": 8.058528900146484, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8640938401222229, + "num_tokens": 525908455.0, + "step": 13789 + }, + { + "epoch": 1.7542297417631345, + "ewc_loss": 0.0683813989162445, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033225154038518667, + "grad_norm": 7.873979568481445, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8841903805732727, + "num_tokens": 525943734.0, + "step": 13790 + }, + { + "epoch": 1.754356952041725, + "ewc_loss": 0.06897011399269104, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003381386341061443, + "grad_norm": 8.01666259765625, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8869052529335022, + "num_tokens": 525988070.0, + "step": 13791 + }, + { + "epoch": 1.7544841623203156, + "ewc_loss": 0.0683993324637413, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033243081998080015, + "grad_norm": 7.840904235839844, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8777869343757629, + "num_tokens": 526027940.0, + "step": 13792 + }, + { + "epoch": 1.754611372598906, + "ewc_loss": 0.06844091415405273, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033772949245758355, + "grad_norm": 8.03566837310791, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8736429810523987, + "num_tokens": 526061357.0, + "step": 13793 + }, + { + "epoch": 1.7547385828774966, + "ewc_loss": 0.06803934276103973, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003337137750349939, + "grad_norm": 7.879948139190674, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8658838272094727, + "num_tokens": 526102157.0, + "step": 13794 + }, + { + "epoch": 1.7548657931560872, + "ewc_loss": 0.06865131855010986, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033983352477662265, + "grad_norm": 8.075189590454102, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8545961976051331, + "num_tokens": 526137241.0, + "step": 13795 + }, + { + "epoch": 1.7549930034346777, + "ewc_loss": 0.06794237345457077, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003327440354041755, + "grad_norm": 7.955957889556885, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8551201224327087, + "num_tokens": 526169079.0, + "step": 13796 + }, + { + "epoch": 1.755120213713268, + "ewc_loss": 0.06846769154071808, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033799727680161595, + "grad_norm": 7.988373756408691, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.868182897567749, + "num_tokens": 526212212.0, + "step": 13797 + }, + { + "epoch": 1.7552474239918585, + "ewc_loss": 0.06823426485061646, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033566297497600317, + "grad_norm": 7.945976257324219, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8706904649734497, + "num_tokens": 526250380.0, + "step": 13798 + }, + { + "epoch": 1.755374634270449, + "ewc_loss": 0.06842601299285889, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033758048084564507, + "grad_norm": 8.010517120361328, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8699047565460205, + "num_tokens": 526292525.0, + "step": 13799 + }, + { + "epoch": 1.7555018445490396, + "ewc_loss": 0.06822845339775085, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033560485462658107, + "grad_norm": 7.948920249938965, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8647768497467041, + "num_tokens": 526330427.0, + "step": 13800 + }, + { + "epoch": 1.75562905482763, + "ewc_loss": 0.06841958314180374, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033751613227650523, + "grad_norm": 8.059297561645508, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8645247220993042, + "num_tokens": 526362954.0, + "step": 13801 + }, + { + "epoch": 1.7557562651062206, + "ewc_loss": 0.0679919496178627, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000333239819156006, + "grad_norm": 7.989107131958008, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8568789958953857, + "num_tokens": 526394310.0, + "step": 13802 + }, + { + "epoch": 1.755883475384811, + "ewc_loss": 0.06836866587400436, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003370069607626647, + "grad_norm": 8.002802848815918, + "learning_rate": 1e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8439611196517944, + "num_tokens": 526438507.0, + "step": 13803 + }, + { + "epoch": 1.7560106856634015, + "ewc_loss": 0.0682186484336853, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003355067747179419, + "grad_norm": 8.042549133300781, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8600057363510132, + "num_tokens": 526475124.0, + "step": 13804 + }, + { + "epoch": 1.756137895941992, + "ewc_loss": 0.06820762157440186, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003353965003043413, + "grad_norm": 7.944341659545898, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8715090751647949, + "num_tokens": 526514936.0, + "step": 13805 + }, + { + "epoch": 1.7562651062205825, + "ewc_loss": 0.06885029375553131, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003369404294062406, + "grad_norm": 8.07859992980957, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.857246994972229, + "num_tokens": 526548649.0, + "step": 13806 + }, + { + "epoch": 1.756392316499173, + "ewc_loss": 0.06850762665271759, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033351380261592567, + "grad_norm": 8.024014472961426, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8480456471443176, + "num_tokens": 526589028.0, + "step": 13807 + }, + { + "epoch": 1.7565195267777636, + "ewc_loss": 0.06822160631418228, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003355363733135164, + "grad_norm": 7.975109577178955, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8642427921295166, + "num_tokens": 526626327.0, + "step": 13808 + }, + { + "epoch": 1.756646737056354, + "ewc_loss": 0.06809563934803009, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003342766722198576, + "grad_norm": 7.970461368560791, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8603787422180176, + "num_tokens": 526672264.0, + "step": 13809 + }, + { + "epoch": 1.7567739473349446, + "ewc_loss": 0.0681937038898468, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033525735489092767, + "grad_norm": 8.003488540649414, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8575707077980042, + "num_tokens": 526709979.0, + "step": 13810 + }, + { + "epoch": 1.7569011576135352, + "ewc_loss": 0.0679941475391388, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033326176344417036, + "grad_norm": 7.930032730102539, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8762438297271729, + "num_tokens": 526745002.0, + "step": 13811 + }, + { + "epoch": 1.7570283678921257, + "ewc_loss": 0.06828384101390839, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033615875872783363, + "grad_norm": 8.001049041748047, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8795034885406494, + "num_tokens": 526785684.0, + "step": 13812 + }, + { + "epoch": 1.7571555781707162, + "ewc_loss": 0.0680827870965004, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003341481788083911, + "grad_norm": 7.933685779571533, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8833713531494141, + "num_tokens": 526820246.0, + "step": 13813 + }, + { + "epoch": 1.7572827884493067, + "ewc_loss": 0.06819760799407959, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003352963540237397, + "grad_norm": 7.963314533233643, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8667504787445068, + "num_tokens": 526854815.0, + "step": 13814 + }, + { + "epoch": 1.7574099987278973, + "ewc_loss": 0.06812866777181625, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033460697159171104, + "grad_norm": 7.9272332191467285, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8692132830619812, + "num_tokens": 526896336.0, + "step": 13815 + }, + { + "epoch": 1.7575372090064878, + "ewc_loss": 0.06825262308120728, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033584656193852425, + "grad_norm": 7.954599857330322, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8752202987670898, + "num_tokens": 526939586.0, + "step": 13816 + }, + { + "epoch": 1.7576644192850783, + "ewc_loss": 0.06819495558738708, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033526986953802407, + "grad_norm": 7.961339473724365, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8687447905540466, + "num_tokens": 526975027.0, + "step": 13817 + }, + { + "epoch": 1.7577916295636689, + "ewc_loss": 0.06821861863136292, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033550645457580686, + "grad_norm": 7.951794147491455, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8663439750671387, + "num_tokens": 527017820.0, + "step": 13818 + }, + { + "epoch": 1.7579188398422594, + "ewc_loss": 0.06818237900733948, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003351440536789596, + "grad_norm": 8.055076599121094, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8749419450759888, + "num_tokens": 527051647.0, + "step": 13819 + }, + { + "epoch": 1.75804605012085, + "ewc_loss": 0.0680820494890213, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003341407864354551, + "grad_norm": 7.939160346984863, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8681043386459351, + "num_tokens": 527089590.0, + "step": 13820 + }, + { + "epoch": 1.7581732603994404, + "ewc_loss": 0.06822311878204346, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003355515073053539, + "grad_norm": 8.007943153381348, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.861856997013092, + "num_tokens": 527126696.0, + "step": 13821 + }, + { + "epoch": 1.7583004706780307, + "ewc_loss": 0.06805676221847534, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000333887932356447, + "grad_norm": 7.97256326675415, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8545008897781372, + "num_tokens": 527164823.0, + "step": 13822 + }, + { + "epoch": 1.7584276809566213, + "ewc_loss": 0.06824490427970886, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003357693203724921, + "grad_norm": 7.977449417114258, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8466009497642517, + "num_tokens": 527208032.0, + "step": 13823 + }, + { + "epoch": 1.7585548912352118, + "ewc_loss": 0.0681452825665474, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033477312535978854, + "grad_norm": 8.088543891906738, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8659050464630127, + "num_tokens": 527242773.0, + "step": 13824 + }, + { + "epoch": 1.7586821015138023, + "ewc_loss": 0.06798334419727325, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003331537882331759, + "grad_norm": 7.945946216583252, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8674962520599365, + "num_tokens": 527277968.0, + "step": 13825 + }, + { + "epoch": 1.7588093117923929, + "ewc_loss": 0.06817373633384705, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033505764440633357, + "grad_norm": 8.008708953857422, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8780645132064819, + "num_tokens": 527311077.0, + "step": 13826 + }, + { + "epoch": 1.7589365220709834, + "ewc_loss": 0.0679684579372406, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033300486393272877, + "grad_norm": 8.134843826293945, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8707958459854126, + "num_tokens": 527340543.0, + "step": 13827 + }, + { + "epoch": 1.7590637323495737, + "ewc_loss": 0.0678725317120552, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003320456307847053, + "grad_norm": 7.984471797943115, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8619688153266907, + "num_tokens": 527379596.0, + "step": 13828 + }, + { + "epoch": 1.7591909426281642, + "ewc_loss": 0.06816790997982025, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033499940764158964, + "grad_norm": 8.044739723205566, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8645059466362, + "num_tokens": 527417873.0, + "step": 13829 + }, + { + "epoch": 1.7593181529067548, + "ewc_loss": 0.06787043809890747, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033202467602677643, + "grad_norm": 7.886444568634033, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.867188572883606, + "num_tokens": 527458607.0, + "step": 13830 + }, + { + "epoch": 1.7594453631853453, + "ewc_loss": 0.06840389966964722, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003373592917341739, + "grad_norm": 8.114080429077148, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8562740087509155, + "num_tokens": 527493299.0, + "step": 13831 + }, + { + "epoch": 1.7595725734639358, + "ewc_loss": 0.06781108677387238, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003314311907161027, + "grad_norm": 7.866923809051514, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8511336445808411, + "num_tokens": 527536065.0, + "step": 13832 + }, + { + "epoch": 1.7596997837425263, + "ewc_loss": 0.06863280385732651, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033964833710342646, + "grad_norm": 8.093994140625, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8487613797187805, + "num_tokens": 527576477.0, + "step": 13833 + }, + { + "epoch": 1.7598269940211169, + "ewc_loss": 0.06778834760189056, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003312037733849138, + "grad_norm": 7.836160659790039, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8615133762359619, + "num_tokens": 527611777.0, + "step": 13834 + }, + { + "epoch": 1.7599542042997074, + "ewc_loss": 0.06867466866970062, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034006699570454657, + "grad_norm": 8.089310646057129, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8811507821083069, + "num_tokens": 527650236.0, + "step": 13835 + }, + { + "epoch": 1.760081414578298, + "ewc_loss": 0.06802591681480408, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033357946085743606, + "grad_norm": 7.913680076599121, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8687944412231445, + "num_tokens": 527685486.0, + "step": 13836 + }, + { + "epoch": 1.7602086248568884, + "ewc_loss": 0.06852301955223083, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033855054061859846, + "grad_norm": 8.046562194824219, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8508539199829102, + "num_tokens": 527725386.0, + "step": 13837 + }, + { + "epoch": 1.760335835135479, + "ewc_loss": 0.0681377723813057, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033469803747721016, + "grad_norm": 7.991766929626465, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8515666723251343, + "num_tokens": 527756213.0, + "step": 13838 + }, + { + "epoch": 1.7604630454140695, + "ewc_loss": 0.06841882318258286, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000337508536176756, + "grad_norm": 7.993917942047119, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8677071332931519, + "num_tokens": 527793884.0, + "step": 13839 + }, + { + "epoch": 1.76059025569266, + "ewc_loss": 0.06831961870193481, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003365164448041469, + "grad_norm": 8.044390678405762, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8617648482322693, + "num_tokens": 527826428.0, + "step": 13840 + }, + { + "epoch": 1.7607174659712506, + "ewc_loss": 0.06816366314888, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003349569160491228, + "grad_norm": 7.962409496307373, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8572795391082764, + "num_tokens": 527866691.0, + "step": 13841 + }, + { + "epoch": 1.760844676249841, + "ewc_loss": 0.06829705834388733, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033629091922193766, + "grad_norm": 7.929707050323486, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8729633688926697, + "num_tokens": 527909482.0, + "step": 13842 + }, + { + "epoch": 1.7609718865284316, + "ewc_loss": 0.06823089718818665, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003356293309479952, + "grad_norm": 7.957733154296875, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8515415787696838, + "num_tokens": 527947818.0, + "step": 13843 + }, + { + "epoch": 1.7610990968070221, + "ewc_loss": 0.06824866682291031, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003358069807291031, + "grad_norm": 8.056656837463379, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8666486740112305, + "num_tokens": 527986760.0, + "step": 13844 + }, + { + "epoch": 1.7612263070856127, + "ewc_loss": 0.06802396476268768, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003335599321871996, + "grad_norm": 7.937726974487305, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8578522801399231, + "num_tokens": 528024742.0, + "step": 13845 + }, + { + "epoch": 1.761353517364203, + "ewc_loss": 0.06839875876903534, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003373079525772482, + "grad_norm": 8.141586303710938, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8483530282974243, + "num_tokens": 528060781.0, + "step": 13846 + }, + { + "epoch": 1.7614807276427935, + "ewc_loss": 0.06776328384876251, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033095316030085087, + "grad_norm": 7.92755126953125, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8660740852355957, + "num_tokens": 528099051.0, + "step": 13847 + }, + { + "epoch": 1.761607937921384, + "ewc_loss": 0.06846136599779129, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033793397597037256, + "grad_norm": 8.096580505371094, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8576027750968933, + "num_tokens": 528138325.0, + "step": 13848 + }, + { + "epoch": 1.7617351481999746, + "ewc_loss": 0.06777787208557129, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033109908690676093, + "grad_norm": 7.891131401062012, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8550848364830017, + "num_tokens": 528177997.0, + "step": 13849 + }, + { + "epoch": 1.761862358478565, + "ewc_loss": 0.06847579777240753, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003380783018656075, + "grad_norm": 8.030771255493164, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8608810901641846, + "num_tokens": 528216360.0, + "step": 13850 + }, + { + "epoch": 1.7619895687571556, + "ewc_loss": 0.06789925694465637, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000332312862155959, + "grad_norm": 7.902277946472168, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.879555881023407, + "num_tokens": 528251100.0, + "step": 13851 + }, + { + "epoch": 1.762116779035746, + "ewc_loss": 0.06846898794174194, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003380102280061692, + "grad_norm": 8.086584091186523, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8684203028678894, + "num_tokens": 528286298.0, + "step": 13852 + }, + { + "epoch": 1.7622439893143365, + "ewc_loss": 0.06790440529584885, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033236434683203697, + "grad_norm": 7.8813066482543945, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8796228170394897, + "num_tokens": 528322970.0, + "step": 13853 + }, + { + "epoch": 1.762371199592927, + "ewc_loss": 0.06849874556064606, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003383078146725893, + "grad_norm": 8.081500053405762, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8811020851135254, + "num_tokens": 528355191.0, + "step": 13854 + }, + { + "epoch": 1.7624984098715175, + "ewc_loss": 0.06790770590305328, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003323974087834358, + "grad_norm": 8.05046272277832, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8634113073348999, + "num_tokens": 528388975.0, + "step": 13855 + }, + { + "epoch": 1.762625620150108, + "ewc_loss": 0.06817954778671265, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003351158229634166, + "grad_norm": 7.977806091308594, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.868732213973999, + "num_tokens": 528432139.0, + "step": 13856 + }, + { + "epoch": 1.7627528304286986, + "ewc_loss": 0.06801346689462662, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003334549837745726, + "grad_norm": 7.92917537689209, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8727753162384033, + "num_tokens": 528468315.0, + "step": 13857 + }, + { + "epoch": 1.762880040707289, + "ewc_loss": 0.0681275874376297, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003345962322782725, + "grad_norm": 7.963634014129639, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8537207841873169, + "num_tokens": 528510368.0, + "step": 13858 + }, + { + "epoch": 1.7630072509858796, + "ewc_loss": 0.06796788424253464, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033299915958195925, + "grad_norm": 7.909980297088623, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8659184575080872, + "num_tokens": 528545637.0, + "step": 13859 + }, + { + "epoch": 1.7631344612644702, + "ewc_loss": 0.06807488203048706, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033406910370104015, + "grad_norm": 7.980764865875244, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8691366314888, + "num_tokens": 528586267.0, + "step": 13860 + }, + { + "epoch": 1.7632616715430607, + "ewc_loss": 0.06833798438310623, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003342587442602962, + "grad_norm": 7.964704990386963, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8681157827377319, + "num_tokens": 528629640.0, + "step": 13861 + }, + { + "epoch": 1.7633888818216512, + "ewc_loss": 0.06818395107984543, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033515982795506716, + "grad_norm": 8.007438659667969, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8490469455718994, + "num_tokens": 528665373.0, + "step": 13862 + }, + { + "epoch": 1.7635160921002417, + "ewc_loss": 0.06801699846982956, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003334903158247471, + "grad_norm": 7.93436336517334, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8794561624526978, + "num_tokens": 528703426.0, + "step": 13863 + }, + { + "epoch": 1.7636433023788323, + "ewc_loss": 0.068270243704319, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003360227565281093, + "grad_norm": 7.995236396789551, + "learning_rate": 1e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8447089195251465, + "num_tokens": 528744886.0, + "step": 13864 + }, + { + "epoch": 1.7637705126574228, + "ewc_loss": 0.06817839294672012, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003351042396388948, + "grad_norm": 7.993680477142334, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8760299682617188, + "num_tokens": 528779773.0, + "step": 13865 + }, + { + "epoch": 1.7638977229360133, + "ewc_loss": 0.06829962134361267, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033631655969657004, + "grad_norm": 7.956757068634033, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8664949536323547, + "num_tokens": 528821878.0, + "step": 13866 + }, + { + "epoch": 1.7640249332146039, + "ewc_loss": 0.06825204193592072, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000335840682964772, + "grad_norm": 8.04858112335205, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8654922246932983, + "num_tokens": 528858056.0, + "step": 13867 + }, + { + "epoch": 1.7641521434931944, + "ewc_loss": 0.06811866164207458, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033450688351877034, + "grad_norm": 7.9457688331604, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8601800203323364, + "num_tokens": 528903116.0, + "step": 13868 + }, + { + "epoch": 1.764279353771785, + "ewc_loss": 0.06835173070430756, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033683766378089786, + "grad_norm": 7.965011119842529, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8579789400100708, + "num_tokens": 528941561.0, + "step": 13869 + }, + { + "epoch": 1.7644065640503754, + "ewc_loss": 0.06821240484714508, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033544437610544264, + "grad_norm": 8.007523536682129, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8799085021018982, + "num_tokens": 528976215.0, + "step": 13870 + }, + { + "epoch": 1.7645337743289657, + "ewc_loss": 0.06813788414001465, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033469920163042843, + "grad_norm": 7.933196067810059, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8717378377914429, + "num_tokens": 529019280.0, + "step": 13871 + }, + { + "epoch": 1.7646609846075563, + "ewc_loss": 0.06833448261022568, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033666513627395034, + "grad_norm": 8.016580581665039, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8680604100227356, + "num_tokens": 529060733.0, + "step": 13872 + }, + { + "epoch": 1.7647881948861468, + "ewc_loss": 0.06812410056591034, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033456127857789397, + "grad_norm": 7.994389057159424, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8661932349205017, + "num_tokens": 529094948.0, + "step": 13873 + }, + { + "epoch": 1.7649154051647373, + "ewc_loss": 0.06832632422447205, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033658358734101057, + "grad_norm": 8.009613037109375, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8753607869148254, + "num_tokens": 529131374.0, + "step": 13874 + }, + { + "epoch": 1.7650426154433279, + "ewc_loss": 0.06816494464874268, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033496975083835423, + "grad_norm": 8.134622573852539, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8609797954559326, + "num_tokens": 529169133.0, + "step": 13875 + }, + { + "epoch": 1.7651698257219184, + "ewc_loss": 0.0680093765258789, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033341406378895044, + "grad_norm": 7.939377307891846, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.862878680229187, + "num_tokens": 529206467.0, + "step": 13876 + }, + { + "epoch": 1.7652970360005087, + "ewc_loss": 0.06834226846694946, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033674295991659164, + "grad_norm": 7.97312593460083, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8547691106796265, + "num_tokens": 529244891.0, + "step": 13877 + }, + { + "epoch": 1.7654242462790992, + "ewc_loss": 0.06818641722202301, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033518444979563355, + "grad_norm": 8.065139770507812, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8660319447517395, + "num_tokens": 529277384.0, + "step": 13878 + }, + { + "epoch": 1.7655514565576897, + "ewc_loss": 0.06814126670360565, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033473296207375824, + "grad_norm": 7.925548553466797, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.867711067199707, + "num_tokens": 529322530.0, + "step": 13879 + }, + { + "epoch": 1.7656786668362803, + "ewc_loss": 0.06839754432439804, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033729575807228684, + "grad_norm": 8.00298023223877, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8589773774147034, + "num_tokens": 529356345.0, + "step": 13880 + }, + { + "epoch": 1.7658058771148708, + "ewc_loss": 0.06811963021755219, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003345166041981429, + "grad_norm": 7.946141719818115, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8662371039390564, + "num_tokens": 529391877.0, + "step": 13881 + }, + { + "epoch": 1.7659330873934613, + "ewc_loss": 0.06841573119163513, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033747756970115006, + "grad_norm": 8.034812927246094, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8710116147994995, + "num_tokens": 529425460.0, + "step": 13882 + }, + { + "epoch": 1.7660602976720519, + "ewc_loss": 0.06812751293182373, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003345954173710197, + "grad_norm": 7.93876314163208, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8535744547843933, + "num_tokens": 529459660.0, + "step": 13883 + }, + { + "epoch": 1.7661875079506424, + "ewc_loss": 0.06853950768709183, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003387153847143054, + "grad_norm": 8.028498649597168, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8572176694869995, + "num_tokens": 529501839.0, + "step": 13884 + }, + { + "epoch": 1.766314718229233, + "ewc_loss": 0.06815622001886368, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033488249755464494, + "grad_norm": 7.90081787109375, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8615700006484985, + "num_tokens": 529540828.0, + "step": 13885 + }, + { + "epoch": 1.7664419285078234, + "ewc_loss": 0.06855078041553497, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033882816205732524, + "grad_norm": 8.051867485046387, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8645157814025879, + "num_tokens": 529581578.0, + "step": 13886 + }, + { + "epoch": 1.766569138786414, + "ewc_loss": 0.06814339756965637, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003347543242853135, + "grad_norm": 7.936075687408447, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8639475107192993, + "num_tokens": 529624363.0, + "step": 13887 + }, + { + "epoch": 1.7666963490650045, + "ewc_loss": 0.06852443516254425, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033856465597637, + "grad_norm": 8.024435043334961, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8718115091323853, + "num_tokens": 529665286.0, + "step": 13888 + }, + { + "epoch": 1.766823559343595, + "ewc_loss": 0.0682438537478447, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033575884299352765, + "grad_norm": 7.998111248016357, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8601694107055664, + "num_tokens": 529704150.0, + "step": 13889 + }, + { + "epoch": 1.7669507696221856, + "ewc_loss": 0.06824131309986115, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033573340624570847, + "grad_norm": 7.935680389404297, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8662073612213135, + "num_tokens": 529742110.0, + "step": 13890 + }, + { + "epoch": 1.767077979900776, + "ewc_loss": 0.06848756223917007, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003381959395483136, + "grad_norm": 8.058794021606445, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8635474443435669, + "num_tokens": 529776158.0, + "step": 13891 + }, + { + "epoch": 1.7672051901793666, + "ewc_loss": 0.06811575591564178, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003344778378959745, + "grad_norm": 8.001411437988281, + "learning_rate": 1e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.840283989906311, + "num_tokens": 529814002.0, + "step": 13892 + }, + { + "epoch": 1.7673324004579571, + "ewc_loss": 0.06833305209875107, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003366508171893656, + "grad_norm": 8.06066608428955, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8662710189819336, + "num_tokens": 529846696.0, + "step": 13893 + }, + { + "epoch": 1.7674596107365477, + "ewc_loss": 0.06816092133522034, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033492950024083257, + "grad_norm": 8.02673053741455, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8609276413917542, + "num_tokens": 529881557.0, + "step": 13894 + }, + { + "epoch": 1.767586821015138, + "ewc_loss": 0.06808371096849442, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033415743382647634, + "grad_norm": 7.9869489669799805, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8506683707237244, + "num_tokens": 529924337.0, + "step": 13895 + }, + { + "epoch": 1.7677140312937285, + "ewc_loss": 0.06815657019615173, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003348859609104693, + "grad_norm": 7.942085266113281, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8616169691085815, + "num_tokens": 529967195.0, + "step": 13896 + }, + { + "epoch": 1.767841241572319, + "ewc_loss": 0.0680953785777092, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003342740819789469, + "grad_norm": 7.964248180389404, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8446738123893738, + "num_tokens": 530003454.0, + "step": 13897 + }, + { + "epoch": 1.7679684518509096, + "ewc_loss": 0.06814827024936676, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033480298588983715, + "grad_norm": 7.984548568725586, + "learning_rate": 1e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8381912708282471, + "num_tokens": 530042577.0, + "step": 13898 + }, + { + "epoch": 1.7680956621295, + "ewc_loss": 0.0683201253414154, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033652159618213773, + "grad_norm": 8.022340774536133, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.847529947757721, + "num_tokens": 530084903.0, + "step": 13899 + }, + { + "epoch": 1.7682228724080906, + "ewc_loss": 0.06823687255382538, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033568908111192286, + "grad_norm": 8.057439804077148, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8604090213775635, + "num_tokens": 530117598.0, + "step": 13900 + }, + { + "epoch": 1.768350082686681, + "ewc_loss": 0.06821426749229431, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003354629734531045, + "grad_norm": 8.001816749572754, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8668885231018066, + "num_tokens": 530151891.0, + "step": 13901 + }, + { + "epoch": 1.7684772929652715, + "ewc_loss": 0.06831081956624985, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033642849302850664, + "grad_norm": 7.958487033843994, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.855351448059082, + "num_tokens": 530191263.0, + "step": 13902 + }, + { + "epoch": 1.768604503243862, + "ewc_loss": 0.06820948421955109, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033541518496349454, + "grad_norm": 7.968044281005859, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8594104647636414, + "num_tokens": 530231156.0, + "step": 13903 + }, + { + "epoch": 1.7687317135224525, + "ewc_loss": 0.0682394728064537, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033571504172869027, + "grad_norm": 7.94271993637085, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.876945972442627, + "num_tokens": 530270089.0, + "step": 13904 + }, + { + "epoch": 1.768858923801043, + "ewc_loss": 0.06890976428985596, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003375351370777935, + "grad_norm": 8.715903282165527, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8671029210090637, + "num_tokens": 530309808.0, + "step": 13905 + }, + { + "epoch": 1.7689861340796336, + "ewc_loss": 0.06758411973714828, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003291614993941039, + "grad_norm": 7.753673076629639, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8513584136962891, + "num_tokens": 530357723.0, + "step": 13906 + }, + { + "epoch": 1.769113344358224, + "ewc_loss": 0.0693209171295166, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003465294430498034, + "grad_norm": 8.276664733886719, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.850420355796814, + "num_tokens": 530389586.0, + "step": 13907 + }, + { + "epoch": 1.7692405546368146, + "ewc_loss": 0.0676136389374733, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00032945669954642653, + "grad_norm": 7.83085298538208, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8529109954833984, + "num_tokens": 530430356.0, + "step": 13908 + }, + { + "epoch": 1.7693677649154052, + "ewc_loss": 0.06943432986736298, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003476635611150414, + "grad_norm": 8.249049186706543, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8691035509109497, + "num_tokens": 530472859.0, + "step": 13909 + }, + { + "epoch": 1.7694949751939957, + "ewc_loss": 0.06798471510410309, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000333167496137321, + "grad_norm": 7.902902126312256, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8513578772544861, + "num_tokens": 530515422.0, + "step": 13910 + }, + { + "epoch": 1.7696221854725862, + "ewc_loss": 0.06904928386211395, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003438131825532764, + "grad_norm": 8.16351318359375, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8623966574668884, + "num_tokens": 530553842.0, + "step": 13911 + }, + { + "epoch": 1.7697493957511767, + "ewc_loss": 0.06811994314193726, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003345197474118322, + "grad_norm": 7.9467692375183105, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8626040816307068, + "num_tokens": 530592593.0, + "step": 13912 + }, + { + "epoch": 1.7698766060297673, + "ewc_loss": 0.06872569024562836, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003405771858524531, + "grad_norm": 8.12498950958252, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8573085069656372, + "num_tokens": 530632309.0, + "step": 13913 + }, + { + "epoch": 1.7700038163083578, + "ewc_loss": 0.0681646466255188, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033496672403998673, + "grad_norm": 7.995458602905273, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8585783243179321, + "num_tokens": 530667415.0, + "step": 13914 + }, + { + "epoch": 1.7701310265869483, + "ewc_loss": 0.06843946129083633, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003377149405423552, + "grad_norm": 8.072370529174805, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8606468439102173, + "num_tokens": 530705736.0, + "step": 13915 + }, + { + "epoch": 1.7702582368655388, + "ewc_loss": 0.06818708777427673, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003351911436766386, + "grad_norm": 7.941403388977051, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8717995882034302, + "num_tokens": 530744931.0, + "step": 13916 + }, + { + "epoch": 1.7703854471441294, + "ewc_loss": 0.06843312084674835, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003376515523996204, + "grad_norm": 7.983017444610596, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8758219480514526, + "num_tokens": 530784700.0, + "step": 13917 + }, + { + "epoch": 1.77051265742272, + "ewc_loss": 0.068405881524086, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033737916965037584, + "grad_norm": 8.023581504821777, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8794937133789062, + "num_tokens": 530821097.0, + "step": 13918 + }, + { + "epoch": 1.7706398677013102, + "ewc_loss": 0.0683017149567604, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000336337456246838, + "grad_norm": 7.992128849029541, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8561511039733887, + "num_tokens": 530860444.0, + "step": 13919 + }, + { + "epoch": 1.7707670779799007, + "ewc_loss": 0.06843540072441101, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003376743115950376, + "grad_norm": 7.970179080963135, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8610051274299622, + "num_tokens": 530907304.0, + "step": 13920 + }, + { + "epoch": 1.7708942882584913, + "ewc_loss": 0.06844650208950043, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003377853718120605, + "grad_norm": 8.056158065795898, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.854291558265686, + "num_tokens": 530945692.0, + "step": 13921 + }, + { + "epoch": 1.7710214985370818, + "ewc_loss": 0.06828422844409943, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003361626295372844, + "grad_norm": 7.996971130371094, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8727552890777588, + "num_tokens": 530986695.0, + "step": 13922 + }, + { + "epoch": 1.7711487088156723, + "ewc_loss": 0.06861290335655212, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033944935421459377, + "grad_norm": 8.029678344726562, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8741706013679504, + "num_tokens": 531022656.0, + "step": 13923 + }, + { + "epoch": 1.7712759190942629, + "ewc_loss": 0.06837677955627441, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003370881313458085, + "grad_norm": 7.962958335876465, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8704483509063721, + "num_tokens": 531056896.0, + "step": 13924 + }, + { + "epoch": 1.7714031293728534, + "ewc_loss": 0.06862308830022812, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003395511885173619, + "grad_norm": 8.07059097290039, + "learning_rate": 1e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8379033803939819, + "num_tokens": 531097286.0, + "step": 13925 + }, + { + "epoch": 1.7715303396514437, + "ewc_loss": 0.06842280924320221, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033754835021682084, + "grad_norm": 7.96660852432251, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8717846274375916, + "num_tokens": 531136430.0, + "step": 13926 + }, + { + "epoch": 1.7716575499300342, + "ewc_loss": 0.06856981664896011, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003390184720046818, + "grad_norm": 8.10032844543457, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.853928804397583, + "num_tokens": 531176698.0, + "step": 13927 + }, + { + "epoch": 1.7717847602086247, + "ewc_loss": 0.06828062236309052, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000336126540787518, + "grad_norm": 7.914154052734375, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8611758947372437, + "num_tokens": 531214343.0, + "step": 13928 + }, + { + "epoch": 1.7719119704872153, + "ewc_loss": 0.06874129176139832, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034073321148753166, + "grad_norm": 8.118861198425293, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8755080699920654, + "num_tokens": 531252325.0, + "step": 13929 + }, + { + "epoch": 1.7720391807658058, + "ewc_loss": 0.06822924315929413, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003356127126608044, + "grad_norm": 7.991191864013672, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8510082960128784, + "num_tokens": 531290378.0, + "step": 13930 + }, + { + "epoch": 1.7721663910443963, + "ewc_loss": 0.0686306357383728, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033962668385356665, + "grad_norm": 8.015547752380371, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.853569507598877, + "num_tokens": 531330406.0, + "step": 13931 + }, + { + "epoch": 1.7722936013229869, + "ewc_loss": 0.06833525002002716, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033667279058136046, + "grad_norm": 7.950204372406006, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8625810146331787, + "num_tokens": 531367425.0, + "step": 13932 + }, + { + "epoch": 1.7724208116015774, + "ewc_loss": 0.06852370500564575, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033855740912258625, + "grad_norm": 7.977650165557861, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8829023241996765, + "num_tokens": 531406465.0, + "step": 13933 + }, + { + "epoch": 1.772548021880168, + "ewc_loss": 0.06854631006717682, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033878342946991324, + "grad_norm": 7.996212959289551, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8528474569320679, + "num_tokens": 531445356.0, + "step": 13934 + }, + { + "epoch": 1.7726752321587584, + "ewc_loss": 0.06844750046730042, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003377952962182462, + "grad_norm": 7.986277103424072, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8595468401908875, + "num_tokens": 531484737.0, + "step": 13935 + }, + { + "epoch": 1.772802442437349, + "ewc_loss": 0.06863035261631012, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003396238898858428, + "grad_norm": 8.042590141296387, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8598998188972473, + "num_tokens": 531523453.0, + "step": 13936 + }, + { + "epoch": 1.7729296527159395, + "ewc_loss": 0.06845373660326004, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000337857665726915, + "grad_norm": 7.971096992492676, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8700408935546875, + "num_tokens": 531559627.0, + "step": 13937 + }, + { + "epoch": 1.77305686299453, + "ewc_loss": 0.06872149556875229, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003405352763365954, + "grad_norm": 8.011083602905273, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8677096366882324, + "num_tokens": 531597677.0, + "step": 13938 + }, + { + "epoch": 1.7731840732731206, + "ewc_loss": 0.06862565875053406, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003395769454073161, + "grad_norm": 7.970874309539795, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8711362481117249, + "num_tokens": 531642753.0, + "step": 13939 + }, + { + "epoch": 1.773311283551711, + "ewc_loss": 0.06872282177209854, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034054851857945323, + "grad_norm": 8.05684757232666, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8559902310371399, + "num_tokens": 531688249.0, + "step": 13940 + }, + { + "epoch": 1.7734384938303016, + "ewc_loss": 0.06848787516355515, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033819908276200294, + "grad_norm": 7.9623026847839355, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8769913911819458, + "num_tokens": 531727994.0, + "step": 13941 + }, + { + "epoch": 1.7735657041088921, + "ewc_loss": 0.06885598599910736, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003418801643420011, + "grad_norm": 8.09057331085205, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8652322292327881, + "num_tokens": 531764428.0, + "step": 13942 + }, + { + "epoch": 1.7736929143874827, + "ewc_loss": 0.0683995857834816, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000337316159857437, + "grad_norm": 7.951294422149658, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8579046130180359, + "num_tokens": 531806604.0, + "step": 13943 + }, + { + "epoch": 1.773820124666073, + "ewc_loss": 0.06885533034801483, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034187364508397877, + "grad_norm": 8.078938484191895, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.881023108959198, + "num_tokens": 531845504.0, + "step": 13944 + }, + { + "epoch": 1.7739473349446635, + "ewc_loss": 0.06840522587299347, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033737256308086216, + "grad_norm": 7.961886882781982, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8707308769226074, + "num_tokens": 531880962.0, + "step": 13945 + }, + { + "epoch": 1.774074545223254, + "ewc_loss": 0.06867840886116028, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003401043941266835, + "grad_norm": 8.051054000854492, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8603379726409912, + "num_tokens": 531918182.0, + "step": 13946 + }, + { + "epoch": 1.7742017555018446, + "ewc_loss": 0.0684671401977539, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003379917470738292, + "grad_norm": 7.946322917938232, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8647838234901428, + "num_tokens": 531958340.0, + "step": 13947 + }, + { + "epoch": 1.774328965780435, + "ewc_loss": 0.06864114850759506, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033973180688917637, + "grad_norm": 8.035331726074219, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8508957028388977, + "num_tokens": 531996951.0, + "step": 13948 + }, + { + "epoch": 1.7744561760590256, + "ewc_loss": 0.06852118670940399, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003385322052054107, + "grad_norm": 7.935936450958252, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8669900894165039, + "num_tokens": 532036963.0, + "step": 13949 + }, + { + "epoch": 1.774583386337616, + "ewc_loss": 0.06869581341743469, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003402784641366452, + "grad_norm": 8.07754898071289, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8572369813919067, + "num_tokens": 532070937.0, + "step": 13950 + }, + { + "epoch": 1.7747105966162064, + "ewc_loss": 0.06842826306819916, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033760289079509676, + "grad_norm": 7.96435546875, + "learning_rate": 1e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8493120670318604, + "num_tokens": 532107866.0, + "step": 13951 + }, + { + "epoch": 1.774837806894797, + "ewc_loss": 0.06888142228126526, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003421345609240234, + "grad_norm": 8.077234268188477, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8557365536689758, + "num_tokens": 532151485.0, + "step": 13952 + }, + { + "epoch": 1.7749650171733875, + "ewc_loss": 0.0683479905128479, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033680026535876095, + "grad_norm": 7.942079544067383, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8700381517410278, + "num_tokens": 532191209.0, + "step": 13953 + }, + { + "epoch": 1.775092227451978, + "ewc_loss": 0.06884607672691345, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034178109490312636, + "grad_norm": 8.081767082214355, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8575451374053955, + "num_tokens": 532232219.0, + "step": 13954 + }, + { + "epoch": 1.7752194377305686, + "ewc_loss": 0.0684811919927597, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033813223126344383, + "grad_norm": 8.011641502380371, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8513624668121338, + "num_tokens": 532265330.0, + "step": 13955 + }, + { + "epoch": 1.775346648009159, + "ewc_loss": 0.06870679557323456, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034038827288895845, + "grad_norm": 8.068087577819824, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8600175380706787, + "num_tokens": 532295964.0, + "step": 13956 + }, + { + "epoch": 1.7754738582877496, + "ewc_loss": 0.06842324137687683, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033755277399905026, + "grad_norm": 7.971419811248779, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8651398420333862, + "num_tokens": 532333804.0, + "step": 13957 + }, + { + "epoch": 1.7756010685663401, + "ewc_loss": 0.06872323900461197, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000340552709531039, + "grad_norm": 8.082368850708008, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8673751354217529, + "num_tokens": 532373088.0, + "step": 13958 + }, + { + "epoch": 1.7757282788449307, + "ewc_loss": 0.06839775294065475, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033729785354807973, + "grad_norm": 7.9678497314453125, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8593348264694214, + "num_tokens": 532409473.0, + "step": 13959 + }, + { + "epoch": 1.7758554891235212, + "ewc_loss": 0.06861834228038788, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033950377837754786, + "grad_norm": 7.971989154815674, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8755604028701782, + "num_tokens": 532447883.0, + "step": 13960 + }, + { + "epoch": 1.7759826994021117, + "ewc_loss": 0.06848704069852829, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003381907008588314, + "grad_norm": 8.007562637329102, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8670843839645386, + "num_tokens": 532485783.0, + "step": 13961 + }, + { + "epoch": 1.7761099096807023, + "ewc_loss": 0.06846866756677628, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003380069974809885, + "grad_norm": 8.041661262512207, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8687162399291992, + "num_tokens": 532511082.0, + "step": 13962 + }, + { + "epoch": 1.7762371199592928, + "ewc_loss": 0.06851150095462799, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033843537676148117, + "grad_norm": 7.961310386657715, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8813540935516357, + "num_tokens": 532555516.0, + "step": 13963 + }, + { + "epoch": 1.7763643302378833, + "ewc_loss": 0.06855792552232742, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003388995537534356, + "grad_norm": 8.066051483154297, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8477596044540405, + "num_tokens": 532599200.0, + "step": 13964 + }, + { + "epoch": 1.7764915405164738, + "ewc_loss": 0.06831388175487518, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033645908115431666, + "grad_norm": 7.944628715515137, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8721694946289062, + "num_tokens": 532636335.0, + "step": 13965 + }, + { + "epoch": 1.7766187507950644, + "ewc_loss": 0.06857988238334656, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033911908394657075, + "grad_norm": 8.018835067749023, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8757504224777222, + "num_tokens": 532678337.0, + "step": 13966 + }, + { + "epoch": 1.776745961073655, + "ewc_loss": 0.068341463804245, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003367349272593856, + "grad_norm": 8.003057479858398, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8597006797790527, + "num_tokens": 532715051.0, + "step": 13967 + }, + { + "epoch": 1.7768731713522452, + "ewc_loss": 0.06838659942150116, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003371863276697695, + "grad_norm": 7.937074184417725, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8736387491226196, + "num_tokens": 532750836.0, + "step": 13968 + }, + { + "epoch": 1.7770003816308357, + "ewc_loss": 0.06859821081161499, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003393024089746177, + "grad_norm": 8.064187049865723, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8691838383674622, + "num_tokens": 532790331.0, + "step": 13969 + }, + { + "epoch": 1.7771275919094263, + "ewc_loss": 0.06832601130008698, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033658044412732124, + "grad_norm": 7.984485149383545, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8596127033233643, + "num_tokens": 532830169.0, + "step": 13970 + }, + { + "epoch": 1.7772548021880168, + "ewc_loss": 0.06860071420669556, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003393274964764714, + "grad_norm": 8.055628776550293, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8562589287757874, + "num_tokens": 532865957.0, + "step": 13971 + }, + { + "epoch": 1.7773820124666073, + "ewc_loss": 0.068293496966362, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033625532523728907, + "grad_norm": 7.965211391448975, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8598901033401489, + "num_tokens": 532906112.0, + "step": 13972 + }, + { + "epoch": 1.7775092227451978, + "ewc_loss": 0.06857015192508698, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033902181894518435, + "grad_norm": 8.076101303100586, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8701687455177307, + "num_tokens": 532939676.0, + "step": 13973 + }, + { + "epoch": 1.7776364330237884, + "ewc_loss": 0.0683397427201271, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003367177559994161, + "grad_norm": 8.020337104797363, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.845262885093689, + "num_tokens": 532977668.0, + "step": 13974 + }, + { + "epoch": 1.7777636433023787, + "ewc_loss": 0.0684175193309784, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033749546855688095, + "grad_norm": 8.015212059020996, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8603286147117615, + "num_tokens": 533013862.0, + "step": 13975 + }, + { + "epoch": 1.7778908535809692, + "ewc_loss": 0.06838274747133255, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003371477941982448, + "grad_norm": 8.009531021118164, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8518853783607483, + "num_tokens": 533051222.0, + "step": 13976 + }, + { + "epoch": 1.7780180638595597, + "ewc_loss": 0.06846721470355988, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033799250377342105, + "grad_norm": 7.993605136871338, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8608617782592773, + "num_tokens": 533090182.0, + "step": 13977 + }, + { + "epoch": 1.7781452741381503, + "ewc_loss": 0.06857539713382721, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033907423494383693, + "grad_norm": 8.072824478149414, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8757878541946411, + "num_tokens": 533129600.0, + "step": 13978 + }, + { + "epoch": 1.7782724844167408, + "ewc_loss": 0.06828330457210541, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003361534036230296, + "grad_norm": 8.004276275634766, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8630735874176025, + "num_tokens": 533172351.0, + "step": 13979 + }, + { + "epoch": 1.7783996946953313, + "ewc_loss": 0.06861695647239685, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033948986674658954, + "grad_norm": 8.049227714538574, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8608453273773193, + "num_tokens": 533211583.0, + "step": 13980 + }, + { + "epoch": 1.7785269049739219, + "ewc_loss": 0.06844945251941681, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033781479578465223, + "grad_norm": 8.011478424072266, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8700845241546631, + "num_tokens": 533249960.0, + "step": 13981 + }, + { + "epoch": 1.7786541152525124, + "ewc_loss": 0.06852349638938904, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003385552845429629, + "grad_norm": 8.00949478149414, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8701710104942322, + "num_tokens": 533287328.0, + "step": 13982 + }, + { + "epoch": 1.778781325531103, + "ewc_loss": 0.06855455785989761, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033886588062159717, + "grad_norm": 8.067136764526367, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8646073341369629, + "num_tokens": 533327772.0, + "step": 13983 + }, + { + "epoch": 1.7789085358096934, + "ewc_loss": 0.0683906078338623, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033722640364430845, + "grad_norm": 7.939676284790039, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8564510941505432, + "num_tokens": 533372004.0, + "step": 13984 + }, + { + "epoch": 1.779035746088284, + "ewc_loss": 0.06872954219579697, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003406157484278083, + "grad_norm": 8.101682662963867, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.860020637512207, + "num_tokens": 533409502.0, + "step": 13985 + }, + { + "epoch": 1.7791629563668745, + "ewc_loss": 0.0682191476225853, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033551178057678044, + "grad_norm": 7.938944339752197, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8511369228363037, + "num_tokens": 533451160.0, + "step": 13986 + }, + { + "epoch": 1.779290166645465, + "ewc_loss": 0.06876934319734573, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003410137433093041, + "grad_norm": 8.113731384277344, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8706889152526855, + "num_tokens": 533480413.0, + "step": 13987 + }, + { + "epoch": 1.7794173769240555, + "ewc_loss": 0.0682482123374939, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033580244053155184, + "grad_norm": 7.990662574768066, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8515936732292175, + "num_tokens": 533518826.0, + "step": 13988 + }, + { + "epoch": 1.779544587202646, + "ewc_loss": 0.06867383420467377, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034005861380137503, + "grad_norm": 8.049674987792969, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8532843589782715, + "num_tokens": 533563760.0, + "step": 13989 + }, + { + "epoch": 1.7796717974812366, + "ewc_loss": 0.06845784932374954, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033789878943935037, + "grad_norm": 7.997303009033203, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.862164318561554, + "num_tokens": 533601499.0, + "step": 13990 + }, + { + "epoch": 1.7797990077598271, + "ewc_loss": 0.06850805878639221, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003384008596185595, + "grad_norm": 8.015674591064453, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8675265312194824, + "num_tokens": 533640757.0, + "step": 13991 + }, + { + "epoch": 1.7799262180384177, + "ewc_loss": 0.06856364011764526, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033895671367645264, + "grad_norm": 8.006089210510254, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8653845191001892, + "num_tokens": 533678047.0, + "step": 13992 + }, + { + "epoch": 1.780053428317008, + "ewc_loss": 0.06843490153551102, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003376693348400295, + "grad_norm": 7.975377082824707, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8629775643348694, + "num_tokens": 533720903.0, + "step": 13993 + }, + { + "epoch": 1.7801806385955985, + "ewc_loss": 0.06865455210208893, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033986583002842963, + "grad_norm": 8.05257797241211, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8652148246765137, + "num_tokens": 533759720.0, + "step": 13994 + }, + { + "epoch": 1.780307848874189, + "ewc_loss": 0.06843452155590057, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033766558044590056, + "grad_norm": 8.004844665527344, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8511190414428711, + "num_tokens": 533796122.0, + "step": 13995 + }, + { + "epoch": 1.7804350591527796, + "ewc_loss": 0.06862993538379669, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003396196407265961, + "grad_norm": 8.023200988769531, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8687912821769714, + "num_tokens": 533836326.0, + "step": 13996 + }, + { + "epoch": 1.78056226943137, + "ewc_loss": 0.06845971941947937, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033791750320233405, + "grad_norm": 7.976310729980469, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8741145133972168, + "num_tokens": 533867874.0, + "step": 13997 + }, + { + "epoch": 1.7806894797099606, + "ewc_loss": 0.06867808103561401, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034010110539384186, + "grad_norm": 8.05559253692627, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8740667104721069, + "num_tokens": 533902877.0, + "step": 13998 + }, + { + "epoch": 1.780816689988551, + "ewc_loss": 0.06856058537960052, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003389261255506426, + "grad_norm": 8.035033226013184, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8672152757644653, + "num_tokens": 533937440.0, + "step": 13999 + }, + { + "epoch": 1.7809439002671414, + "ewc_loss": 0.06871147453784943, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034043501364067197, + "grad_norm": 8.079174995422363, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.872478187084198, + "num_tokens": 533972308.0, + "step": 14000 + }, + { + "epoch": 1.781071110545732, + "ewc_loss": 0.06853899359703064, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003387102042324841, + "grad_norm": 8.03902816772461, + "learning_rate": 1e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8495492935180664, + "num_tokens": 534011913.0, + "step": 14001 + }, + { + "epoch": 1.7811983208243225, + "ewc_loss": 0.06860599666833878, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033938029082491994, + "grad_norm": 8.032918930053711, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8740147352218628, + "num_tokens": 534057235.0, + "step": 14002 + }, + { + "epoch": 1.781325531102913, + "ewc_loss": 0.06852607429027557, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003385810414329171, + "grad_norm": 8.052279472351074, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8635156750679016, + "num_tokens": 534092850.0, + "step": 14003 + }, + { + "epoch": 1.7814527413815036, + "ewc_loss": 0.06840988993644714, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033741924562491477, + "grad_norm": 7.956708908081055, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8685669898986816, + "num_tokens": 534135310.0, + "step": 14004 + }, + { + "epoch": 1.781579951660094, + "ewc_loss": 0.06863363087177277, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003396565734874457, + "grad_norm": 8.037860870361328, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8612326383590698, + "num_tokens": 534176008.0, + "step": 14005 + }, + { + "epoch": 1.7817071619386846, + "ewc_loss": 0.06847009062767029, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003380212001502514, + "grad_norm": 8.041786193847656, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8625736236572266, + "num_tokens": 534208217.0, + "step": 14006 + }, + { + "epoch": 1.7818343722172751, + "ewc_loss": 0.068470299243927, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003380232956260443, + "grad_norm": 7.9950432777404785, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8667821884155273, + "num_tokens": 534247564.0, + "step": 14007 + }, + { + "epoch": 1.7819615824958657, + "ewc_loss": 0.068527951836586, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003385998716112226, + "grad_norm": 8.032658576965332, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8447117805480957, + "num_tokens": 534281144.0, + "step": 14008 + }, + { + "epoch": 1.7820887927744562, + "ewc_loss": 0.06850925832986832, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003384128795005381, + "grad_norm": 8.028594970703125, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8525562286376953, + "num_tokens": 534317893.0, + "step": 14009 + }, + { + "epoch": 1.7822160030530467, + "ewc_loss": 0.06847941875457764, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033811453613452613, + "grad_norm": 7.955667495727539, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8602813482284546, + "num_tokens": 534356116.0, + "step": 14010 + }, + { + "epoch": 1.7823432133316373, + "ewc_loss": 0.0686115026473999, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003394353261683136, + "grad_norm": 8.030004501342773, + "learning_rate": 1e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8368657827377319, + "num_tokens": 534390499.0, + "step": 14011 + }, + { + "epoch": 1.7824704236102278, + "ewc_loss": 0.0684526190161705, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003378464898560196, + "grad_norm": 8.02563190460205, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8584722280502319, + "num_tokens": 534424242.0, + "step": 14012 + }, + { + "epoch": 1.7825976338888183, + "ewc_loss": 0.06866706907749176, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033999100560322404, + "grad_norm": 8.076233863830566, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.854779839515686, + "num_tokens": 534467465.0, + "step": 14013 + }, + { + "epoch": 1.7827248441674088, + "ewc_loss": 0.06832171976566315, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.000336537545081228, + "grad_norm": 7.939667701721191, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8571097254753113, + "num_tokens": 534502959.0, + "step": 14014 + }, + { + "epoch": 1.7828520544459994, + "ewc_loss": 0.06886863708496094, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034200664958916605, + "grad_norm": 8.081910133361816, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8644195795059204, + "num_tokens": 534549888.0, + "step": 14015 + }, + { + "epoch": 1.78297926472459, + "ewc_loss": 0.0683077871799469, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00033639822504483163, + "grad_norm": 7.921889305114746, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8658328056335449, + "num_tokens": 534591686.0, + "step": 14016 + }, + { + "epoch": 1.7831064750031802, + "ewc_loss": 0.0689227283000946, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034254760248586535, + "grad_norm": 8.073575019836426, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8795945048332214, + "num_tokens": 534628584.0, + "step": 14017 + }, + { + "epoch": 1.7832336852817707, + "ewc_loss": 0.06830784678459167, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003363987780176103, + "grad_norm": 7.971214294433594, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8526440262794495, + "num_tokens": 534665684.0, + "step": 14018 + }, + { + "epoch": 1.7833608955603613, + "ewc_loss": 0.0688423365354538, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034174363827332854, + "grad_norm": 8.114930152893066, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.865404486656189, + "num_tokens": 534701604.0, + "step": 14019 + }, + { + "epoch": 1.7834881058389518, + "ewc_loss": 0.06833690404891968, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003366893797647208, + "grad_norm": 7.9850687980651855, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8482379913330078, + "num_tokens": 534741765.0, + "step": 14020 + }, + { + "epoch": 1.7836153161175423, + "ewc_loss": 0.06893417239189148, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034022057661786675, + "grad_norm": 8.065874099731445, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8726215362548828, + "num_tokens": 534780331.0, + "step": 14021 + }, + { + "epoch": 1.7837425263961328, + "ewc_loss": 0.06875511258840561, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003384300216566771, + "grad_norm": 8.016508102416992, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8502510190010071, + "num_tokens": 534816254.0, + "step": 14022 + }, + { + "epoch": 1.7838697366747234, + "ewc_loss": 0.06878767162561417, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033875563531182706, + "grad_norm": 8.114089965820312, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8613969087600708, + "num_tokens": 534851367.0, + "step": 14023 + }, + { + "epoch": 1.7839969469533137, + "ewc_loss": 0.06865230947732925, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003374019870534539, + "grad_norm": 7.987876892089844, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8607739210128784, + "num_tokens": 534889077.0, + "step": 14024 + }, + { + "epoch": 1.7841241572319042, + "ewc_loss": 0.06888563930988312, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003397352702450007, + "grad_norm": 8.065611839294434, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8525775671005249, + "num_tokens": 534924249.0, + "step": 14025 + }, + { + "epoch": 1.7842513675104947, + "ewc_loss": 0.06866075843572617, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003374864754732698, + "grad_norm": 8.012428283691406, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8466973304748535, + "num_tokens": 534966370.0, + "step": 14026 + }, + { + "epoch": 1.7843785777890853, + "ewc_loss": 0.06874905526638031, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003383694274816662, + "grad_norm": 8.072600364685059, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.868845522403717, + "num_tokens": 535005869.0, + "step": 14027 + }, + { + "epoch": 1.7845057880676758, + "ewc_loss": 0.0688600093126297, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003370375488884747, + "grad_norm": 8.009024620056152, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8599538207054138, + "num_tokens": 535042057.0, + "step": 14028 + }, + { + "epoch": 1.7846329983462663, + "ewc_loss": 0.0687880665063858, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033875962253659964, + "grad_norm": 8.077142715454102, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8565750122070312, + "num_tokens": 535074410.0, + "step": 14029 + }, + { + "epoch": 1.7847602086248568, + "ewc_loss": 0.06842655688524246, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033514449023641646, + "grad_norm": 7.958837985992432, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8537349700927734, + "num_tokens": 535120356.0, + "step": 14030 + }, + { + "epoch": 1.7848874189034474, + "ewc_loss": 0.06906706094741821, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000339108140906319, + "grad_norm": 8.116571426391602, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8529089689254761, + "num_tokens": 535156939.0, + "step": 14031 + }, + { + "epoch": 1.785014629182038, + "ewc_loss": 0.06864182651042938, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003348558093421161, + "grad_norm": 8.08971881866455, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8650764226913452, + "num_tokens": 535195553.0, + "step": 14032 + }, + { + "epoch": 1.7851418394606284, + "ewc_loss": 0.06881852447986603, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003366227028891444, + "grad_norm": 8.012025833129883, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8680118322372437, + "num_tokens": 535235692.0, + "step": 14033 + }, + { + "epoch": 1.785269049739219, + "ewc_loss": 0.06897348910570145, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003381723945494741, + "grad_norm": 8.050447463989258, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8600184917449951, + "num_tokens": 535269265.0, + "step": 14034 + }, + { + "epoch": 1.7853962600178095, + "ewc_loss": 0.06906165182590485, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00033661266206763685, + "grad_norm": 7.98753023147583, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8780407905578613, + "num_tokens": 535303825.0, + "step": 14035 + }, + { + "epoch": 1.7855234702964, + "ewc_loss": 0.06896491348743439, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003380866255611181, + "grad_norm": 8.076430320739746, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.855707049369812, + "num_tokens": 535338802.0, + "step": 14036 + }, + { + "epoch": 1.7856506805749905, + "ewc_loss": 0.06877056509256363, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003361431590747088, + "grad_norm": 7.986028671264648, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8692148923873901, + "num_tokens": 535376592.0, + "step": 14037 + }, + { + "epoch": 1.785777890853581, + "ewc_loss": 0.06908680498600006, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003393055812921375, + "grad_norm": 8.08456802368164, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.858416736125946, + "num_tokens": 535416528.0, + "step": 14038 + }, + { + "epoch": 1.7859051011321716, + "ewc_loss": 0.06900578737258911, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00033605395583435893, + "grad_norm": 8.022833824157715, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8585706949234009, + "num_tokens": 535450892.0, + "step": 14039 + }, + { + "epoch": 1.7860323114107621, + "ewc_loss": 0.06895385682582855, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033797608921304345, + "grad_norm": 8.131587028503418, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8647992610931396, + "num_tokens": 535488198.0, + "step": 14040 + }, + { + "epoch": 1.7861595216893527, + "ewc_loss": 0.0686739981174469, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033517746487632394, + "grad_norm": 7.996143341064453, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8608102798461914, + "num_tokens": 535526020.0, + "step": 14041 + }, + { + "epoch": 1.786286731967943, + "ewc_loss": 0.06893499195575714, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000337787379976362, + "grad_norm": 8.1348876953125, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8667672276496887, + "num_tokens": 535560400.0, + "step": 14042 + }, + { + "epoch": 1.7864139422465335, + "ewc_loss": 0.06858205795288086, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003342580748721957, + "grad_norm": 7.963465690612793, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8550515174865723, + "num_tokens": 535594699.0, + "step": 14043 + }, + { + "epoch": 1.786541152525124, + "ewc_loss": 0.06902536749839783, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003386912285350263, + "grad_norm": 8.092263221740723, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8672872185707092, + "num_tokens": 535627019.0, + "step": 14044 + }, + { + "epoch": 1.7866683628037145, + "ewc_loss": 0.06864627450704575, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033490025089122355, + "grad_norm": 7.965567588806152, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8717243671417236, + "num_tokens": 535667479.0, + "step": 14045 + }, + { + "epoch": 1.786795573082305, + "ewc_loss": 0.06899219751358032, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033835944486781955, + "grad_norm": 8.08792781829834, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8570976257324219, + "num_tokens": 535705211.0, + "step": 14046 + }, + { + "epoch": 1.7869227833608956, + "ewc_loss": 0.06865683197975159, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033500578138045967, + "grad_norm": 8.065133094787598, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8427290916442871, + "num_tokens": 535744907.0, + "step": 14047 + }, + { + "epoch": 1.787049993639486, + "ewc_loss": 0.06879832595586777, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033642075140960515, + "grad_norm": 8.098074913024902, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8601088523864746, + "num_tokens": 535782526.0, + "step": 14048 + }, + { + "epoch": 1.7871772039180764, + "ewc_loss": 0.06877298653125763, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003361673152539879, + "grad_norm": 8.021099090576172, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8781172037124634, + "num_tokens": 535824153.0, + "step": 14049 + }, + { + "epoch": 1.787304414196667, + "ewc_loss": 0.06890963762998581, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033753388561308384, + "grad_norm": 8.05679988861084, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8740874528884888, + "num_tokens": 535859019.0, + "step": 14050 + }, + { + "epoch": 1.7874316244752575, + "ewc_loss": 0.06870118528604507, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033544935286045074, + "grad_norm": 8.048609733581543, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8751275539398193, + "num_tokens": 535896866.0, + "step": 14051 + }, + { + "epoch": 1.787558834753848, + "ewc_loss": 0.06916282325983047, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003376243112143129, + "grad_norm": 8.022018432617188, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8636904954910278, + "num_tokens": 535937128.0, + "step": 14052 + }, + { + "epoch": 1.7876860450324386, + "ewc_loss": 0.06872668862342834, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003357043315190822, + "grad_norm": 8.026249885559082, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8785493969917297, + "num_tokens": 535972727.0, + "step": 14053 + }, + { + "epoch": 1.787813255311029, + "ewc_loss": 0.06887932121753693, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003372306819073856, + "grad_norm": 8.050756454467773, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.867512583732605, + "num_tokens": 536011491.0, + "step": 14054 + }, + { + "epoch": 1.7879404655896196, + "ewc_loss": 0.06872065365314484, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033564402838237584, + "grad_norm": 8.010335922241211, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8671116828918457, + "num_tokens": 536048074.0, + "step": 14055 + }, + { + "epoch": 1.7880676758682101, + "ewc_loss": 0.0689045786857605, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033748330315575004, + "grad_norm": 8.075014114379883, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8704738020896912, + "num_tokens": 536089554.0, + "step": 14056 + }, + { + "epoch": 1.7881948861468007, + "ewc_loss": 0.06882119923830032, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003366494784131646, + "grad_norm": 7.9546990394592285, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8734613656997681, + "num_tokens": 536127672.0, + "step": 14057 + }, + { + "epoch": 1.7883220964253912, + "ewc_loss": 0.06874857097864151, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033836462534964085, + "grad_norm": 8.051236152648926, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8635877966880798, + "num_tokens": 536157517.0, + "step": 14058 + }, + { + "epoch": 1.7884493067039817, + "ewc_loss": 0.06880682706832886, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033650576369836926, + "grad_norm": 8.016780853271484, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8759177327156067, + "num_tokens": 536192855.0, + "step": 14059 + }, + { + "epoch": 1.7885765169825723, + "ewc_loss": 0.06878828257322311, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003387617180123925, + "grad_norm": 8.052465438842773, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8641105890274048, + "num_tokens": 536231727.0, + "step": 14060 + }, + { + "epoch": 1.7887037272611628, + "ewc_loss": 0.06882572919130325, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033669479307718575, + "grad_norm": 7.984021186828613, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8766100406646729, + "num_tokens": 536270738.0, + "step": 14061 + }, + { + "epoch": 1.7888309375397533, + "ewc_loss": 0.06914526224136353, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000339890131726861, + "grad_norm": 8.032957077026367, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8814836740493774, + "num_tokens": 536313006.0, + "step": 14062 + }, + { + "epoch": 1.7889581478183438, + "ewc_loss": 0.0688600093126297, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003370376070961356, + "grad_norm": 8.029747009277344, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8573195934295654, + "num_tokens": 536350697.0, + "step": 14063 + }, + { + "epoch": 1.7890853580969344, + "ewc_loss": 0.06876767426729202, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003385556337889284, + "grad_norm": 8.005544662475586, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8713451623916626, + "num_tokens": 536394252.0, + "step": 14064 + }, + { + "epoch": 1.789212568375525, + "ewc_loss": 0.0687943547964096, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003388223994988948, + "grad_norm": 8.116243362426758, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8605254888534546, + "num_tokens": 536426591.0, + "step": 14065 + }, + { + "epoch": 1.7893397786541152, + "ewc_loss": 0.06859353184700012, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033681420609354973, + "grad_norm": 8.01675796508789, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.850407600402832, + "num_tokens": 536468567.0, + "step": 14066 + }, + { + "epoch": 1.7894669889327057, + "ewc_loss": 0.06892413645982742, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034012028481811285, + "grad_norm": 8.131589889526367, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8539137244224548, + "num_tokens": 536506352.0, + "step": 14067 + }, + { + "epoch": 1.7895941992112963, + "ewc_loss": 0.06847184896469116, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033559740404598415, + "grad_norm": 8.14003849029541, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8561403155326843, + "num_tokens": 536538122.0, + "step": 14068 + }, + { + "epoch": 1.7897214094898868, + "ewc_loss": 0.06878113746643066, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033869032631628215, + "grad_norm": 8.1101655960083, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8544167280197144, + "num_tokens": 536574617.0, + "step": 14069 + }, + { + "epoch": 1.7898486197684773, + "ewc_loss": 0.06847825646400452, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003356614906806499, + "grad_norm": 7.969765663146973, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8758845925331116, + "num_tokens": 536611895.0, + "step": 14070 + }, + { + "epoch": 1.7899758300470678, + "ewc_loss": 0.06862865388393402, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003371654893271625, + "grad_norm": 8.075947761535645, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8566651344299316, + "num_tokens": 536651336.0, + "step": 14071 + }, + { + "epoch": 1.7901030403256584, + "ewc_loss": 0.06845908612012863, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033546978374943137, + "grad_norm": 7.9863481521606445, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8575835227966309, + "num_tokens": 536692566.0, + "step": 14072 + }, + { + "epoch": 1.7902302506042487, + "ewc_loss": 0.06870894879102707, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003379684058018029, + "grad_norm": 8.086681365966797, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8694783449172974, + "num_tokens": 536730855.0, + "step": 14073 + }, + { + "epoch": 1.7903574608828392, + "ewc_loss": 0.06847549974918365, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033563387114554644, + "grad_norm": 8.04145336151123, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8606958389282227, + "num_tokens": 536766478.0, + "step": 14074 + }, + { + "epoch": 1.7904846711614297, + "ewc_loss": 0.06864272058010101, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000337306089932099, + "grad_norm": 8.020668029785156, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.85687255859375, + "num_tokens": 536799900.0, + "step": 14075 + }, + { + "epoch": 1.7906118814400203, + "ewc_loss": 0.06866657733917236, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033754471223801374, + "grad_norm": 8.143900871276855, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8705729246139526, + "num_tokens": 536843597.0, + "step": 14076 + }, + { + "epoch": 1.7907390917186108, + "ewc_loss": 0.06850915402173996, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033597045694477856, + "grad_norm": 8.170622825622559, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8741090893745422, + "num_tokens": 536879189.0, + "step": 14077 + }, + { + "epoch": 1.7908663019972013, + "ewc_loss": 0.06856566667556763, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033653556602075696, + "grad_norm": 8.129862785339355, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8683167099952698, + "num_tokens": 536918099.0, + "step": 14078 + }, + { + "epoch": 1.7909935122757918, + "ewc_loss": 0.06843963265419006, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000335275282850489, + "grad_norm": 8.087374687194824, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8539329767227173, + "num_tokens": 536954410.0, + "step": 14079 + }, + { + "epoch": 1.7911207225543824, + "ewc_loss": 0.06850433349609375, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033592229010537267, + "grad_norm": 8.08651351928711, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8846546411514282, + "num_tokens": 536986688.0, + "step": 14080 + }, + { + "epoch": 1.791247932832973, + "ewc_loss": 0.06850026547908783, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003358815738465637, + "grad_norm": 8.0552339553833, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8641745448112488, + "num_tokens": 537024181.0, + "step": 14081 + }, + { + "epoch": 1.7913751431115634, + "ewc_loss": 0.0684255063533783, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033513392554596066, + "grad_norm": 8.053339958190918, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8674753904342651, + "num_tokens": 537064174.0, + "step": 14082 + }, + { + "epoch": 1.791502353390154, + "ewc_loss": 0.06870199739933014, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003354574437253177, + "grad_norm": 8.122840881347656, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8546077013015747, + "num_tokens": 537100279.0, + "step": 14083 + }, + { + "epoch": 1.7916295636687445, + "ewc_loss": 0.06841520965099335, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003350309852976352, + "grad_norm": 8.073911666870117, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8401749730110168, + "num_tokens": 537135339.0, + "step": 14084 + }, + { + "epoch": 1.791756773947335, + "ewc_loss": 0.06844636797904968, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033534260001033545, + "grad_norm": 8.036848068237305, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8572157621383667, + "num_tokens": 537169608.0, + "step": 14085 + }, + { + "epoch": 1.7918839842259255, + "ewc_loss": 0.06834714859724045, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003343503922224045, + "grad_norm": 7.977057456970215, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8714544773101807, + "num_tokens": 537205560.0, + "step": 14086 + }, + { + "epoch": 1.792011194504516, + "ewc_loss": 0.06863739341497421, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033725282992236316, + "grad_norm": 8.071106910705566, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8513785004615784, + "num_tokens": 537238697.0, + "step": 14087 + }, + { + "epoch": 1.7921384047831066, + "ewc_loss": 0.0683714747428894, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033459364203736186, + "grad_norm": 8.026914596557617, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8582924604415894, + "num_tokens": 537282365.0, + "step": 14088 + }, + { + "epoch": 1.7922656150616971, + "ewc_loss": 0.0685974583029747, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003368534962646663, + "grad_norm": 8.015108108520508, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8705946207046509, + "num_tokens": 537320805.0, + "step": 14089 + }, + { + "epoch": 1.7923928253402877, + "ewc_loss": 0.06846354156732559, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003355143126100302, + "grad_norm": 7.955993175506592, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.866199791431427, + "num_tokens": 537354607.0, + "step": 14090 + }, + { + "epoch": 1.792520035618878, + "ewc_loss": 0.06871967017650604, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003380756243132055, + "grad_norm": 8.113065719604492, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8749296069145203, + "num_tokens": 537386672.0, + "step": 14091 + }, + { + "epoch": 1.7926472458974685, + "ewc_loss": 0.06862697750329971, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003347072924952954, + "grad_norm": 7.940979957580566, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8648359179496765, + "num_tokens": 537430882.0, + "step": 14092 + }, + { + "epoch": 1.792774456176059, + "ewc_loss": 0.06880775094032288, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033895636443048716, + "grad_norm": 8.10520076751709, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8665117025375366, + "num_tokens": 537465114.0, + "step": 14093 + }, + { + "epoch": 1.7929016664546495, + "ewc_loss": 0.06874221563339233, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003358596586622298, + "grad_norm": 8.02759075164795, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8573070764541626, + "num_tokens": 537502820.0, + "step": 14094 + }, + { + "epoch": 1.79302887673324, + "ewc_loss": 0.06889393925666809, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003373768995516002, + "grad_norm": 8.035294532775879, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.871006965637207, + "num_tokens": 537537422.0, + "step": 14095 + }, + { + "epoch": 1.7931560870118306, + "ewc_loss": 0.06879320740699768, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003363695286680013, + "grad_norm": 8.03812026977539, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8678907155990601, + "num_tokens": 537566828.0, + "step": 14096 + }, + { + "epoch": 1.793283297290421, + "ewc_loss": 0.06864918768405914, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003373707295395434, + "grad_norm": 8.011842727661133, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8616984486579895, + "num_tokens": 537606102.0, + "step": 14097 + }, + { + "epoch": 1.7934105075690114, + "ewc_loss": 0.06888623535633087, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003372998908162117, + "grad_norm": 7.978676795959473, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8697413802146912, + "num_tokens": 537638527.0, + "step": 14098 + }, + { + "epoch": 1.793537717847602, + "ewc_loss": 0.0688944160938263, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003373816143721342, + "grad_norm": 8.017767906188965, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8620549440383911, + "num_tokens": 537683375.0, + "step": 14099 + }, + { + "epoch": 1.7936649281261925, + "ewc_loss": 0.06884300708770752, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003368675825186074, + "grad_norm": 7.955241680145264, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8585542440414429, + "num_tokens": 537731354.0, + "step": 14100 + }, + { + "epoch": 1.793792138404783, + "ewc_loss": 0.06893611699342728, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003377986722625792, + "grad_norm": 8.066854476928711, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8594529628753662, + "num_tokens": 537770055.0, + "step": 14101 + }, + { + "epoch": 1.7939193486833735, + "ewc_loss": 0.06882356107234955, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033667313982732594, + "grad_norm": 8.031158447265625, + "learning_rate": 1e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.8421204090118408, + "num_tokens": 537807977.0, + "step": 14102 + }, + { + "epoch": 1.794046558961964, + "ewc_loss": 0.0687803328037262, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033868226455524564, + "grad_norm": 8.010480880737305, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8574615716934204, + "num_tokens": 537847809.0, + "step": 14103 + }, + { + "epoch": 1.7941737692405546, + "ewc_loss": 0.06869757920503616, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003378546971362084, + "grad_norm": 8.013937950134277, + "learning_rate": 1e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8446154594421387, + "num_tokens": 537885506.0, + "step": 14104 + }, + { + "epoch": 1.7943009795191451, + "ewc_loss": 0.06880639493465424, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003389428311493248, + "grad_norm": 8.069729804992676, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8582636117935181, + "num_tokens": 537918936.0, + "step": 14105 + }, + { + "epoch": 1.7944281897977357, + "ewc_loss": 0.06859812140464783, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003368601610418409, + "grad_norm": 7.9639081954956055, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8679119348526001, + "num_tokens": 537954205.0, + "step": 14106 + }, + { + "epoch": 1.7945554000763262, + "ewc_loss": 0.06899683177471161, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003408472693990916, + "grad_norm": 8.107318878173828, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8665592074394226, + "num_tokens": 537989190.0, + "step": 14107 + }, + { + "epoch": 1.7946826103549167, + "ewc_loss": 0.06855866312980652, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003364655131008476, + "grad_norm": 7.958735942840576, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8544545769691467, + "num_tokens": 538028852.0, + "step": 14108 + }, + { + "epoch": 1.7948098206335072, + "ewc_loss": 0.06918559968471527, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003402934526093304, + "grad_norm": 8.0958890914917, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8684913516044617, + "num_tokens": 538061245.0, + "step": 14109 + }, + { + "epoch": 1.7949370309120978, + "ewc_loss": 0.06878667324781418, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033630424877628684, + "grad_norm": 7.9369893074035645, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8758503198623657, + "num_tokens": 538103238.0, + "step": 14110 + }, + { + "epoch": 1.7950642411906883, + "ewc_loss": 0.06922125071287155, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034065000363625586, + "grad_norm": 8.115721702575684, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8501662015914917, + "num_tokens": 538142692.0, + "step": 14111 + }, + { + "epoch": 1.7951914514692788, + "ewc_loss": 0.0685000866651535, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003358798276167363, + "grad_norm": 7.93648624420166, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8616245985031128, + "num_tokens": 538183155.0, + "step": 14112 + }, + { + "epoch": 1.7953186617478694, + "ewc_loss": 0.06932412832975388, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003416787658352405, + "grad_norm": 8.105757713317871, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8618617653846741, + "num_tokens": 538222063.0, + "step": 14113 + }, + { + "epoch": 1.7954458720264599, + "ewc_loss": 0.06886523216962814, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000337089819367975, + "grad_norm": 8.013934135437012, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8648437261581421, + "num_tokens": 538259847.0, + "step": 14114 + }, + { + "epoch": 1.7955730823050502, + "ewc_loss": 0.06910543888807297, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003394919040147215, + "grad_norm": 8.054753303527832, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8699585199356079, + "num_tokens": 538298557.0, + "step": 14115 + }, + { + "epoch": 1.7957002925836407, + "ewc_loss": 0.06889605522155762, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003373980289325118, + "grad_norm": 7.974343776702881, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8599005937576294, + "num_tokens": 538337926.0, + "step": 14116 + }, + { + "epoch": 1.7958275028622313, + "ewc_loss": 0.06916762888431549, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034011382376775146, + "grad_norm": 8.060073852539062, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8679153919219971, + "num_tokens": 538382755.0, + "step": 14117 + }, + { + "epoch": 1.7959547131408218, + "ewc_loss": 0.06892237067222595, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033766121487133205, + "grad_norm": 8.037653923034668, + "learning_rate": 1e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8421915173530579, + "num_tokens": 538417639.0, + "step": 14118 + }, + { + "epoch": 1.7960819234194123, + "ewc_loss": 0.06903769820928574, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003388144832570106, + "grad_norm": 8.031011581420898, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8587203025817871, + "num_tokens": 538458444.0, + "step": 14119 + }, + { + "epoch": 1.7962091336980028, + "ewc_loss": 0.06897831708192825, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033822067780420184, + "grad_norm": 7.991260051727295, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8653438091278076, + "num_tokens": 538496580.0, + "step": 14120 + }, + { + "epoch": 1.7963363439765934, + "ewc_loss": 0.06904392689466476, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000338876765454188, + "grad_norm": 7.966503620147705, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8726712465286255, + "num_tokens": 538538474.0, + "step": 14121 + }, + { + "epoch": 1.7964635542551837, + "ewc_loss": 0.06887208670377731, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033959976281039417, + "grad_norm": 8.010801315307617, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8652008771896362, + "num_tokens": 538576828.0, + "step": 14122 + }, + { + "epoch": 1.7965907645337742, + "ewc_loss": 0.06866331398487091, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033751208684407175, + "grad_norm": 7.956211090087891, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.870166540145874, + "num_tokens": 538611630.0, + "step": 14123 + }, + { + "epoch": 1.7967179748123647, + "ewc_loss": 0.06880282610654831, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003389071498531848, + "grad_norm": 8.209813117980957, + "learning_rate": 1e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.8391886949539185, + "num_tokens": 538659537.0, + "step": 14124 + }, + { + "epoch": 1.7968451850909553, + "ewc_loss": 0.06847228854894638, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003356017987243831, + "grad_norm": 7.880100727081299, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8718022704124451, + "num_tokens": 538700216.0, + "step": 14125 + }, + { + "epoch": 1.7969723953695458, + "ewc_loss": 0.06920033693313599, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034288226743228734, + "grad_norm": 8.10128402709961, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.861282467842102, + "num_tokens": 538737719.0, + "step": 14126 + }, + { + "epoch": 1.7970996056481363, + "ewc_loss": 0.06839065998792648, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003347855235915631, + "grad_norm": 7.884498596191406, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8702620267868042, + "num_tokens": 538774723.0, + "step": 14127 + }, + { + "epoch": 1.7972268159267268, + "ewc_loss": 0.06926774233579636, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034355634124949574, + "grad_norm": 8.069087982177734, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8718909621238708, + "num_tokens": 538815288.0, + "step": 14128 + }, + { + "epoch": 1.7973540262053174, + "ewc_loss": 0.06866610795259476, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003375399683136493, + "grad_norm": 7.932357311248779, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8631923198699951, + "num_tokens": 538858239.0, + "step": 14129 + }, + { + "epoch": 1.797481236483908, + "ewc_loss": 0.06913022696971893, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034218121436424553, + "grad_norm": 8.12307357788086, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8655745387077332, + "num_tokens": 538896947.0, + "step": 14130 + }, + { + "epoch": 1.7976084467624984, + "ewc_loss": 0.06869742274284363, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003378531546331942, + "grad_norm": 7.902906894683838, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8605109453201294, + "num_tokens": 538936511.0, + "step": 14131 + }, + { + "epoch": 1.797735657041089, + "ewc_loss": 0.06934486329555511, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003443275927565992, + "grad_norm": 8.147464752197266, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8566164970397949, + "num_tokens": 538969198.0, + "step": 14132 + }, + { + "epoch": 1.7978628673196795, + "ewc_loss": 0.06867894530296326, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003376683162059635, + "grad_norm": 7.930766582489014, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8739787936210632, + "num_tokens": 539002258.0, + "step": 14133 + }, + { + "epoch": 1.79799007759827, + "ewc_loss": 0.06940416991710663, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034492064150981605, + "grad_norm": 8.077472686767578, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8495644330978394, + "num_tokens": 539039993.0, + "step": 14134 + }, + { + "epoch": 1.7981172878768605, + "ewc_loss": 0.0687774121761322, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033865298610180616, + "grad_norm": 7.947200298309326, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8608795404434204, + "num_tokens": 539080713.0, + "step": 14135 + }, + { + "epoch": 1.798244498155451, + "ewc_loss": 0.06934646517038345, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034434357075951993, + "grad_norm": 8.044118881225586, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8743935823440552, + "num_tokens": 539121115.0, + "step": 14136 + }, + { + "epoch": 1.7983717084340416, + "ewc_loss": 0.0689784437417984, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003406633040867746, + "grad_norm": 7.9540839195251465, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8817060589790344, + "num_tokens": 539159889.0, + "step": 14137 + }, + { + "epoch": 1.7984989187126321, + "ewc_loss": 0.06924218684434891, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034330078051425517, + "grad_norm": 8.029261589050293, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8625950217247009, + "num_tokens": 539201508.0, + "step": 14138 + }, + { + "epoch": 1.7986261289912227, + "ewc_loss": 0.06905153393745422, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034139424678869545, + "grad_norm": 8.031657218933105, + "learning_rate": 1e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.8456672430038452, + "num_tokens": 539238640.0, + "step": 14139 + }, + { + "epoch": 1.798753339269813, + "ewc_loss": 0.06889036297798157, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034222399699501693, + "grad_norm": 8.045387268066406, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8585835695266724, + "num_tokens": 539274797.0, + "step": 14140 + }, + { + "epoch": 1.7988805495484035, + "ewc_loss": 0.06914395093917847, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003423184098210186, + "grad_norm": 8.032471656799316, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8514230251312256, + "num_tokens": 539308410.0, + "step": 14141 + }, + { + "epoch": 1.799007759826994, + "ewc_loss": 0.06912972778081894, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003421761794015765, + "grad_norm": 8.039556503295898, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8566140532493591, + "num_tokens": 539345578.0, + "step": 14142 + }, + { + "epoch": 1.7991349701055845, + "ewc_loss": 0.06912489235401154, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034212777973152697, + "grad_norm": 8.087568283081055, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8477993011474609, + "num_tokens": 539378946.0, + "step": 14143 + }, + { + "epoch": 1.799262180384175, + "ewc_loss": 0.06904034316539764, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034128237166441977, + "grad_norm": 7.9756364822387695, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8759114742279053, + "num_tokens": 539417663.0, + "step": 14144 + }, + { + "epoch": 1.7993893906627656, + "ewc_loss": 0.06925562024116516, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000343435094691813, + "grad_norm": 8.063700675964355, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8778790235519409, + "num_tokens": 539459597.0, + "step": 14145 + }, + { + "epoch": 1.799516600941356, + "ewc_loss": 0.06881546229124069, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003390335477888584, + "grad_norm": 7.966938018798828, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8654654026031494, + "num_tokens": 539499579.0, + "step": 14146 + }, + { + "epoch": 1.7996438112199464, + "ewc_loss": 0.0692952573299408, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003438314888626337, + "grad_norm": 8.1248197555542, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8728734850883484, + "num_tokens": 539538505.0, + "step": 14147 + }, + { + "epoch": 1.799771021498537, + "ewc_loss": 0.06882746517658234, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000339153571985662, + "grad_norm": 8.016217231750488, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8575741052627563, + "num_tokens": 539574824.0, + "step": 14148 + }, + { + "epoch": 1.7998982317771275, + "ewc_loss": 0.06907132267951965, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034159209462814033, + "grad_norm": 8.078476905822754, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8510944843292236, + "num_tokens": 539614181.0, + "step": 14149 + }, + { + "epoch": 1.800025442055718, + "ewc_loss": 0.06885501742362976, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003394290688447654, + "grad_norm": 7.9766316413879395, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8509932160377502, + "num_tokens": 539656713.0, + "step": 14150 + }, + { + "epoch": 1.8001526523343085, + "ewc_loss": 0.0690782442688942, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003416613326407969, + "grad_norm": 8.100152015686035, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8616424202919006, + "num_tokens": 539695436.0, + "step": 14151 + }, + { + "epoch": 1.800279862612899, + "ewc_loss": 0.06876009702682495, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033847984741441905, + "grad_norm": 7.943075656890869, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8654413223266602, + "num_tokens": 539736549.0, + "step": 14152 + }, + { + "epoch": 1.8004070728914896, + "ewc_loss": 0.06931589543819427, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034403783502057195, + "grad_norm": 8.094433784484863, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8682065010070801, + "num_tokens": 539776326.0, + "step": 14153 + }, + { + "epoch": 1.8005342831700801, + "ewc_loss": 0.06870195269584656, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033789841108955443, + "grad_norm": 7.939336776733398, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8710322380065918, + "num_tokens": 539819665.0, + "step": 14154 + }, + { + "epoch": 1.8006614934486707, + "ewc_loss": 0.06933815032243729, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034426042111590505, + "grad_norm": 8.108500480651855, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8623098731040955, + "num_tokens": 539858175.0, + "step": 14155 + }, + { + "epoch": 1.8007887037272612, + "ewc_loss": 0.06871730089187622, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003380519337952137, + "grad_norm": 7.948758602142334, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8629592061042786, + "num_tokens": 539895152.0, + "step": 14156 + }, + { + "epoch": 1.8009159140058517, + "ewc_loss": 0.06923943758010864, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000343273306498304, + "grad_norm": 8.08535099029541, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.868028998374939, + "num_tokens": 539941033.0, + "step": 14157 + }, + { + "epoch": 1.8010431242844422, + "ewc_loss": 0.06883150339126587, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000339193909894675, + "grad_norm": 8.036266326904297, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8657494187355042, + "num_tokens": 539977959.0, + "step": 14158 + }, + { + "epoch": 1.8011703345630328, + "ewc_loss": 0.06893377006053925, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003402166476007551, + "grad_norm": 8.075116157531738, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8615559339523315, + "num_tokens": 540014857.0, + "step": 14159 + }, + { + "epoch": 1.8012975448416233, + "ewc_loss": 0.06883767247200012, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003392556100152433, + "grad_norm": 7.962270736694336, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8607618808746338, + "num_tokens": 540055775.0, + "step": 14160 + }, + { + "epoch": 1.8014247551202138, + "ewc_loss": 0.06909605860710144, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000341839506290853, + "grad_norm": 8.115827560424805, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8716878890991211, + "num_tokens": 540090287.0, + "step": 14161 + }, + { + "epoch": 1.8015519653988044, + "ewc_loss": 0.06881493330001831, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033902822178788483, + "grad_norm": 7.991225719451904, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8712467551231384, + "num_tokens": 540129139.0, + "step": 14162 + }, + { + "epoch": 1.8016791756773949, + "ewc_loss": 0.06912504881620407, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003421293804422021, + "grad_norm": 8.113187789916992, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8623142838478088, + "num_tokens": 540165187.0, + "step": 14163 + }, + { + "epoch": 1.8018063859559852, + "ewc_loss": 0.06871560215950012, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003380349080543965, + "grad_norm": 8.015885353088379, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8753014206886292, + "num_tokens": 540201113.0, + "step": 14164 + }, + { + "epoch": 1.8019335962345757, + "ewc_loss": 0.0690762847661972, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003416417457628995, + "grad_norm": 8.050810813903809, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8642433881759644, + "num_tokens": 540237448.0, + "step": 14165 + }, + { + "epoch": 1.8020608065131662, + "ewc_loss": 0.068740613758564, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003382850263733417, + "grad_norm": 7.966592788696289, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.856212854385376, + "num_tokens": 540279703.0, + "step": 14166 + }, + { + "epoch": 1.8021880167917568, + "ewc_loss": 0.06898854672908783, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003407644107937813, + "grad_norm": 8.100815773010254, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8444609045982361, + "num_tokens": 540313071.0, + "step": 14167 + }, + { + "epoch": 1.8023152270703473, + "ewc_loss": 0.06877951323986053, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003386740863788873, + "grad_norm": 7.964430332183838, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8677866458892822, + "num_tokens": 540353940.0, + "step": 14168 + }, + { + "epoch": 1.8024424373489378, + "ewc_loss": 0.06909088045358658, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034178770147264004, + "grad_norm": 8.090320587158203, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.859069287776947, + "num_tokens": 540391241.0, + "step": 14169 + }, + { + "epoch": 1.8025696476275284, + "ewc_loss": 0.06873379647731781, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000338216865202412, + "grad_norm": 8.014678001403809, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8761849999427795, + "num_tokens": 540420205.0, + "step": 14170 + }, + { + "epoch": 1.8026968579061187, + "ewc_loss": 0.06904458999633789, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034132477594539523, + "grad_norm": 8.05899715423584, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8672389984130859, + "num_tokens": 540452121.0, + "step": 14171 + }, + { + "epoch": 1.8028240681847092, + "ewc_loss": 0.06883788108825684, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003392577636986971, + "grad_norm": 7.9580559730529785, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8630075454711914, + "num_tokens": 540487415.0, + "step": 14172 + }, + { + "epoch": 1.8029512784632997, + "ewc_loss": 0.06914756447076797, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034235455677844584, + "grad_norm": 8.072053909301758, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8530393838882446, + "num_tokens": 540522293.0, + "step": 14173 + }, + { + "epoch": 1.8030784887418903, + "ewc_loss": 0.06894853711128235, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034036432043649256, + "grad_norm": 8.020748138427734, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8750278949737549, + "num_tokens": 540560345.0, + "step": 14174 + }, + { + "epoch": 1.8032056990204808, + "ewc_loss": 0.06906628608703613, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003415417158976197, + "grad_norm": 7.9987616539001465, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8721011281013489, + "num_tokens": 540598299.0, + "step": 14175 + }, + { + "epoch": 1.8033329092990713, + "ewc_loss": 0.06907282769680023, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003416071704123169, + "grad_norm": 8.000431060791016, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8739131689071655, + "num_tokens": 540634480.0, + "step": 14176 + }, + { + "epoch": 1.8034601195776618, + "ewc_loss": 0.06906311213970184, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034151008003391325, + "grad_norm": 8.032516479492188, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8542953729629517, + "num_tokens": 540679473.0, + "step": 14177 + }, + { + "epoch": 1.8035873298562524, + "ewc_loss": 0.0690748393535614, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003416272811591625, + "grad_norm": 8.067789077758789, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8572765588760376, + "num_tokens": 540722259.0, + "step": 14178 + }, + { + "epoch": 1.803714540134843, + "ewc_loss": 0.06885312497615814, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033941020956262946, + "grad_norm": 7.995439529418945, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8677768707275391, + "num_tokens": 540758058.0, + "step": 14179 + }, + { + "epoch": 1.8038417504134334, + "ewc_loss": 0.0690544843673706, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034142369986511767, + "grad_norm": 8.067296981811523, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8584613800048828, + "num_tokens": 540798535.0, + "step": 14180 + }, + { + "epoch": 1.803968960692024, + "ewc_loss": 0.06895318627357483, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034041079925373197, + "grad_norm": 8.047377586364746, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8624252676963806, + "num_tokens": 540839423.0, + "step": 14181 + }, + { + "epoch": 1.8040961709706145, + "ewc_loss": 0.06894946098327637, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034037348814308643, + "grad_norm": 8.073138236999512, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8687861561775208, + "num_tokens": 540887512.0, + "step": 14182 + }, + { + "epoch": 1.804223381249205, + "ewc_loss": 0.06915777921676636, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034001527819782495, + "grad_norm": 8.077130317687988, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8660829067230225, + "num_tokens": 540922292.0, + "step": 14183 + }, + { + "epoch": 1.8043505915277955, + "ewc_loss": 0.06881147623062134, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003389936173334718, + "grad_norm": 7.993676662445068, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.870590090751648, + "num_tokens": 540964105.0, + "step": 14184 + }, + { + "epoch": 1.804477801806386, + "ewc_loss": 0.06893661618232727, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003402450820431113, + "grad_norm": 8.105644226074219, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8613938689231873, + "num_tokens": 541001793.0, + "step": 14185 + }, + { + "epoch": 1.8046050120849766, + "ewc_loss": 0.06875573843717575, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003384363080840558, + "grad_norm": 7.9812750816345215, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.868770956993103, + "num_tokens": 541041104.0, + "step": 14186 + }, + { + "epoch": 1.8047322223635671, + "ewc_loss": 0.06905008852481842, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034137978218495846, + "grad_norm": 8.046902656555176, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.857208788394928, + "num_tokens": 541079643.0, + "step": 14187 + }, + { + "epoch": 1.8048594326421576, + "ewc_loss": 0.06872040033340454, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003380828711669892, + "grad_norm": 8.00655460357666, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8685652017593384, + "num_tokens": 541118554.0, + "step": 14188 + }, + { + "epoch": 1.804986642920748, + "ewc_loss": 0.06895464658737183, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034042535116896033, + "grad_norm": 8.050786972045898, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.851227343082428, + "num_tokens": 541156893.0, + "step": 14189 + }, + { + "epoch": 1.8051138531993385, + "ewc_loss": 0.06880752742290497, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033895421074703336, + "grad_norm": 7.988478660583496, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8800132274627686, + "num_tokens": 541197283.0, + "step": 14190 + }, + { + "epoch": 1.805241063477929, + "ewc_loss": 0.0688326358795166, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034164663520641625, + "grad_norm": 8.088557243347168, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.876697301864624, + "num_tokens": 541232343.0, + "step": 14191 + }, + { + "epoch": 1.8053682737565195, + "ewc_loss": 0.06874142587184906, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033829317544586957, + "grad_norm": 8.007107734680176, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8549314141273499, + "num_tokens": 541270274.0, + "step": 14192 + }, + { + "epoch": 1.80549548403511, + "ewc_loss": 0.0690346285700798, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003412251826375723, + "grad_norm": 8.08751106262207, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8575860261917114, + "num_tokens": 541306018.0, + "step": 14193 + }, + { + "epoch": 1.8056226943137006, + "ewc_loss": 0.06878383457660675, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003387172764632851, + "grad_norm": 7.993653774261475, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8640385866165161, + "num_tokens": 541344974.0, + "step": 14194 + }, + { + "epoch": 1.805749904592291, + "ewc_loss": 0.06908658146858215, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000341744686011225, + "grad_norm": 8.067045211791992, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.872775673866272, + "num_tokens": 541378590.0, + "step": 14195 + }, + { + "epoch": 1.8058771148708814, + "ewc_loss": 0.06887742877006531, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033965319744311273, + "grad_norm": 8.025650024414062, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8661682605743408, + "num_tokens": 541415945.0, + "step": 14196 + }, + { + "epoch": 1.806004325149472, + "ewc_loss": 0.06907881796360016, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034166709519922733, + "grad_norm": 8.060858726501465, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8612610697746277, + "num_tokens": 541459577.0, + "step": 14197 + }, + { + "epoch": 1.8061315354280625, + "ewc_loss": 0.06886063516139984, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033948521013371646, + "grad_norm": 7.960416316986084, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8636494874954224, + "num_tokens": 541501041.0, + "step": 14198 + }, + { + "epoch": 1.806258745706653, + "ewc_loss": 0.06898398697376251, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034316020901314914, + "grad_norm": 8.094221115112305, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8568515181541443, + "num_tokens": 541538587.0, + "step": 14199 + }, + { + "epoch": 1.8063859559852435, + "ewc_loss": 0.06853767484426498, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003386970784049481, + "grad_norm": 8.152412414550781, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8510028123855591, + "num_tokens": 541582225.0, + "step": 14200 + }, + { + "epoch": 1.806513166263834, + "ewc_loss": 0.06866960972547531, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034001641324721277, + "grad_norm": 8.059955596923828, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.872906506061554, + "num_tokens": 541616956.0, + "step": 14201 + }, + { + "epoch": 1.8066403765424246, + "ewc_loss": 0.0687406063079834, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00034072637208737433, + "grad_norm": 8.103302001953125, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8529590368270874, + "num_tokens": 541650814.0, + "step": 14202 + }, + { + "epoch": 1.8067675868210151, + "ewc_loss": 0.06855667382478714, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003388870391063392, + "grad_norm": 7.9463067054748535, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8730272054672241, + "num_tokens": 541692167.0, + "step": 14203 + }, + { + "epoch": 1.8068947970996057, + "ewc_loss": 0.06922242045402527, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003431030781939626, + "grad_norm": 8.12200927734375, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8536299467086792, + "num_tokens": 541732427.0, + "step": 14204 + }, + { + "epoch": 1.8070220073781962, + "ewc_loss": 0.06871073693037033, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003379862755537033, + "grad_norm": 7.957470417022705, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.863699197769165, + "num_tokens": 541773256.0, + "step": 14205 + }, + { + "epoch": 1.8071492176567867, + "ewc_loss": 0.06928571313619614, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003437360282987356, + "grad_norm": 8.103659629821777, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8694025278091431, + "num_tokens": 541815008.0, + "step": 14206 + }, + { + "epoch": 1.8072764279353772, + "ewc_loss": 0.06886696815490723, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003395485691726208, + "grad_norm": 8.030633926391602, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8664817214012146, + "num_tokens": 541853462.0, + "step": 14207 + }, + { + "epoch": 1.8074036382139678, + "ewc_loss": 0.06912939250469208, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034217280335724354, + "grad_norm": 8.102540969848633, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8586210012435913, + "num_tokens": 541890487.0, + "step": 14208 + }, + { + "epoch": 1.8075308484925583, + "ewc_loss": 0.06888878345489502, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003397667605895549, + "grad_norm": 8.050562858581543, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8670990467071533, + "num_tokens": 541929857.0, + "step": 14209 + }, + { + "epoch": 1.8076580587711488, + "ewc_loss": 0.06915031373500824, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034238205989822745, + "grad_norm": 8.093852043151855, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8562390208244324, + "num_tokens": 541969923.0, + "step": 14210 + }, + { + "epoch": 1.8077852690497394, + "ewc_loss": 0.0689668133854866, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034054703428409994, + "grad_norm": 8.066641807556152, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8735759258270264, + "num_tokens": 542008802.0, + "step": 14211 + }, + { + "epoch": 1.8079124793283299, + "ewc_loss": 0.06892159581184387, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003400948189664632, + "grad_norm": 8.030778884887695, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8775152564048767, + "num_tokens": 542049276.0, + "step": 14212 + }, + { + "epoch": 1.8080396896069202, + "ewc_loss": 0.06908786296844482, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003417575790081173, + "grad_norm": 8.101787567138672, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8570577502250671, + "num_tokens": 542086921.0, + "step": 14213 + }, + { + "epoch": 1.8081668998855107, + "ewc_loss": 0.0688597559928894, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003394765080884099, + "grad_norm": 8.055610656738281, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8631672859191895, + "num_tokens": 542124291.0, + "step": 14214 + }, + { + "epoch": 1.8082941101641012, + "ewc_loss": 0.0697937160730362, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003414918028283864, + "grad_norm": 8.233988761901855, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8520480394363403, + "num_tokens": 542167600.0, + "step": 14215 + }, + { + "epoch": 1.8084213204426918, + "ewc_loss": 0.06854158639907837, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033629481913521886, + "grad_norm": 8.02840518951416, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8582228422164917, + "num_tokens": 542201134.0, + "step": 14216 + }, + { + "epoch": 1.8085485307212823, + "ewc_loss": 0.06904944777488708, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000341373379342258, + "grad_norm": 8.126556396484375, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8506849408149719, + "num_tokens": 542242590.0, + "step": 14217 + }, + { + "epoch": 1.8086757409998728, + "ewc_loss": 0.06861606240272522, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003370395570527762, + "grad_norm": 8.004032135009766, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8636547327041626, + "num_tokens": 542281945.0, + "step": 14218 + }, + { + "epoch": 1.8088029512784631, + "ewc_loss": 0.06906195729970932, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000341498467605561, + "grad_norm": 8.120482444763184, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8717247843742371, + "num_tokens": 542319389.0, + "step": 14219 + }, + { + "epoch": 1.8089301615570537, + "ewc_loss": 0.06865383684635162, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033741729566827416, + "grad_norm": 7.988681793212891, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8510492444038391, + "num_tokens": 542356912.0, + "step": 14220 + }, + { + "epoch": 1.8090573718356442, + "ewc_loss": 0.06910359859466553, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003419149434193969, + "grad_norm": 8.11546516418457, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8557256460189819, + "num_tokens": 542394054.0, + "step": 14221 + }, + { + "epoch": 1.8091845821142347, + "ewc_loss": 0.06875117868185043, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003383907023817301, + "grad_norm": 8.021973609924316, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8684576153755188, + "num_tokens": 542431063.0, + "step": 14222 + }, + { + "epoch": 1.8093117923928252, + "ewc_loss": 0.06899680197238922, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003408469201531261, + "grad_norm": 8.084400177001953, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8519741296768188, + "num_tokens": 542471398.0, + "step": 14223 + }, + { + "epoch": 1.8094390026714158, + "ewc_loss": 0.06880666315555573, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003389455087017268, + "grad_norm": 7.993762016296387, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8487671613693237, + "num_tokens": 542516288.0, + "step": 14224 + }, + { + "epoch": 1.8095662129500063, + "ewc_loss": 0.0690263956785202, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034114281879737973, + "grad_norm": 8.120543479919434, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8617949485778809, + "num_tokens": 542552292.0, + "step": 14225 + }, + { + "epoch": 1.8096934232285968, + "ewc_loss": 0.068659707903862, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033747596899047494, + "grad_norm": 7.972459316253662, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8661203384399414, + "num_tokens": 542592380.0, + "step": 14226 + }, + { + "epoch": 1.8098206335071874, + "ewc_loss": 0.06919993460178375, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034287828020751476, + "grad_norm": 8.152935028076172, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8554973602294922, + "num_tokens": 542632698.0, + "step": 14227 + }, + { + "epoch": 1.8099478437857779, + "ewc_loss": 0.06878267973661423, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003387056931387633, + "grad_norm": 8.04307746887207, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8522058129310608, + "num_tokens": 542668234.0, + "step": 14228 + }, + { + "epoch": 1.8100750540643684, + "ewc_loss": 0.06939154863357544, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034235301427543163, + "grad_norm": 8.189122200012207, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.861577033996582, + "num_tokens": 542706205.0, + "step": 14229 + }, + { + "epoch": 1.810202264342959, + "ewc_loss": 0.06869607418775558, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003378396504558623, + "grad_norm": 8.085843086242676, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8608048558235168, + "num_tokens": 542737068.0, + "step": 14230 + }, + { + "epoch": 1.8103294746215495, + "ewc_loss": 0.06909014284610748, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034178030909970403, + "grad_norm": 8.130175590515137, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8752164840698242, + "num_tokens": 542769776.0, + "step": 14231 + }, + { + "epoch": 1.81045668490014, + "ewc_loss": 0.06875711679458618, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003384500741958618, + "grad_norm": 8.006101608276367, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8759955763816833, + "num_tokens": 542802440.0, + "step": 14232 + }, + { + "epoch": 1.8105838951787305, + "ewc_loss": 0.06901240348815918, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003410029166843742, + "grad_norm": 8.067899703979492, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8750383853912354, + "num_tokens": 542841158.0, + "step": 14233 + }, + { + "epoch": 1.810711105457321, + "ewc_loss": 0.06898380815982819, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033827556762844324, + "grad_norm": 8.053200721740723, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8587619066238403, + "num_tokens": 542876637.0, + "step": 14234 + }, + { + "epoch": 1.8108383157359116, + "ewc_loss": 0.06894354522228241, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003403143200557679, + "grad_norm": 8.030769348144531, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8699052929878235, + "num_tokens": 542915087.0, + "step": 14235 + }, + { + "epoch": 1.8109655260145021, + "ewc_loss": 0.06894703209400177, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034034918644465506, + "grad_norm": 8.080415725708008, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8558154702186584, + "num_tokens": 542950635.0, + "step": 14236 + }, + { + "epoch": 1.8110927362930926, + "ewc_loss": 0.06883347034454346, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003392135549802333, + "grad_norm": 8.033928871154785, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8592801094055176, + "num_tokens": 542990078.0, + "step": 14237 + }, + { + "epoch": 1.811219946571683, + "ewc_loss": 0.0692344382405281, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003407818730920553, + "grad_norm": 8.065930366516113, + "learning_rate": 1e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8374483585357666, + "num_tokens": 543030333.0, + "step": 14238 + }, + { + "epoch": 1.8113471568502735, + "ewc_loss": 0.0691654160618782, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003400916757527739, + "grad_norm": 8.000126838684082, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8569081425666809, + "num_tokens": 543070537.0, + "step": 14239 + }, + { + "epoch": 1.811474367128864, + "ewc_loss": 0.06906520575284958, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034153094748035073, + "grad_norm": 8.066593170166016, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8607405424118042, + "num_tokens": 543109137.0, + "step": 14240 + }, + { + "epoch": 1.8116015774074545, + "ewc_loss": 0.06886360794305801, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003395149833522737, + "grad_norm": 7.993049621582031, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8559991717338562, + "num_tokens": 543146616.0, + "step": 14241 + }, + { + "epoch": 1.811728787686045, + "ewc_loss": 0.06904949992895126, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003413739032112062, + "grad_norm": 8.113195419311523, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8771681189537048, + "num_tokens": 543178433.0, + "step": 14242 + }, + { + "epoch": 1.8118559979646356, + "ewc_loss": 0.06870457530021667, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033792469184845686, + "grad_norm": 7.985448360443115, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.875939667224884, + "num_tokens": 543215086.0, + "step": 14243 + }, + { + "epoch": 1.811983208243226, + "ewc_loss": 0.06909222155809402, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003418011183384806, + "grad_norm": 8.10676097869873, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.846870481967926, + "num_tokens": 543252111.0, + "step": 14244 + }, + { + "epoch": 1.8121104185218164, + "ewc_loss": 0.06894519925117493, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033788944710977376, + "grad_norm": 7.951689720153809, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8706363439559937, + "num_tokens": 543291938.0, + "step": 14245 + }, + { + "epoch": 1.812237628800407, + "ewc_loss": 0.069424107670784, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034267862793058157, + "grad_norm": 8.013242721557617, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8668036460876465, + "num_tokens": 543335141.0, + "step": 14246 + }, + { + "epoch": 1.8123648390789975, + "ewc_loss": 0.06918856501579285, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034032310941256583, + "grad_norm": 8.039705276489258, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8642700910568237, + "num_tokens": 543371305.0, + "step": 14247 + }, + { + "epoch": 1.812492049357588, + "ewc_loss": 0.06894402951002121, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003403192094992846, + "grad_norm": 7.993871688842773, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8741779923439026, + "num_tokens": 543407431.0, + "step": 14248 + }, + { + "epoch": 1.8126192596361785, + "ewc_loss": 0.06904609501361847, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003413398517295718, + "grad_norm": 8.02328109741211, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8626870512962341, + "num_tokens": 543450287.0, + "step": 14249 + }, + { + "epoch": 1.812746469914769, + "ewc_loss": 0.06891104578971863, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000339989346684888, + "grad_norm": 7.962540626525879, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8761872053146362, + "num_tokens": 543486521.0, + "step": 14250 + }, + { + "epoch": 1.8128736801933596, + "ewc_loss": 0.06909999251365662, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003418787964619696, + "grad_norm": 8.12913703918457, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8632998466491699, + "num_tokens": 543522157.0, + "step": 14251 + }, + { + "epoch": 1.8130008904719501, + "ewc_loss": 0.0690278485417366, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000338715995894745, + "grad_norm": 7.955493450164795, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8661458492279053, + "num_tokens": 543560503.0, + "step": 14252 + }, + { + "epoch": 1.8131281007505406, + "ewc_loss": 0.06925690174102783, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034344795858487487, + "grad_norm": 8.130183219909668, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8516970872879028, + "num_tokens": 543598943.0, + "step": 14253 + }, + { + "epoch": 1.8132553110291312, + "ewc_loss": 0.06900399178266525, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033847743179649115, + "grad_norm": 7.968016147613525, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8654874563217163, + "num_tokens": 543640466.0, + "step": 14254 + }, + { + "epoch": 1.8133825213077217, + "ewc_loss": 0.06922345608472824, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034311346826143563, + "grad_norm": 8.079046249389648, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8648391962051392, + "num_tokens": 543677581.0, + "step": 14255 + }, + { + "epoch": 1.8135097315863122, + "ewc_loss": 0.06894570589065552, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034033594420179725, + "grad_norm": 8.035981178283691, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8554627895355225, + "num_tokens": 543711220.0, + "step": 14256 + }, + { + "epoch": 1.8136369418649028, + "ewc_loss": 0.06910496950149536, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000341928651323542, + "grad_norm": 8.050846099853516, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8586137294769287, + "num_tokens": 543750109.0, + "step": 14257 + }, + { + "epoch": 1.8137641521434933, + "ewc_loss": 0.06899915635585785, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034087052335962653, + "grad_norm": 8.006171226501465, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8679052591323853, + "num_tokens": 543788040.0, + "step": 14258 + }, + { + "epoch": 1.8138913624220838, + "ewc_loss": 0.06932838261127472, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034172131563536823, + "grad_norm": 8.115842819213867, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8630709052085876, + "num_tokens": 543821037.0, + "step": 14259 + }, + { + "epoch": 1.8140185727006743, + "ewc_loss": 0.0689147412776947, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034002630854956806, + "grad_norm": 8.029793739318848, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8755473494529724, + "num_tokens": 543856331.0, + "step": 14260 + }, + { + "epoch": 1.8141457829792649, + "ewc_loss": 0.06914621591567993, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003423410526011139, + "grad_norm": 8.032674789428711, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.867973804473877, + "num_tokens": 543892898.0, + "step": 14261 + }, + { + "epoch": 1.8142729932578552, + "ewc_loss": 0.06902536749839783, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003411326324567199, + "grad_norm": 8.035436630249023, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8600635528564453, + "num_tokens": 543932858.0, + "step": 14262 + }, + { + "epoch": 1.8144002035364457, + "ewc_loss": 0.06911154091358185, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034199433866888285, + "grad_norm": 8.073920249938965, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8706502914428711, + "num_tokens": 543968145.0, + "step": 14263 + }, + { + "epoch": 1.8145274138150362, + "ewc_loss": 0.06931925565004349, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003416300460230559, + "grad_norm": 8.07376480102539, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8623855113983154, + "num_tokens": 544000495.0, + "step": 14264 + }, + { + "epoch": 1.8146546240936268, + "ewc_loss": 0.06903076171875, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034118653275072575, + "grad_norm": 8.018468856811523, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8570524454116821, + "num_tokens": 544041345.0, + "step": 14265 + }, + { + "epoch": 1.8147818343722173, + "ewc_loss": 0.06897002458572388, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034057919401675463, + "grad_norm": 8.010205268859863, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8687424659729004, + "num_tokens": 544080084.0, + "step": 14266 + }, + { + "epoch": 1.8149090446508078, + "ewc_loss": 0.06898921728134155, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003407710464671254, + "grad_norm": 8.096896171569824, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8556885719299316, + "num_tokens": 544115828.0, + "step": 14267 + }, + { + "epoch": 1.8150362549293981, + "ewc_loss": 0.0689418613910675, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003402975562494248, + "grad_norm": 8.020419120788574, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8589611649513245, + "num_tokens": 544153213.0, + "step": 14268 + }, + { + "epoch": 1.8151634652079887, + "ewc_loss": 0.06916689872741699, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003425479226280004, + "grad_norm": 8.010645866394043, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8665976524353027, + "num_tokens": 544192383.0, + "step": 14269 + }, + { + "epoch": 1.8152906754865792, + "ewc_loss": 0.06933870911598206, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003418245760258287, + "grad_norm": 8.022130012512207, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8655902147293091, + "num_tokens": 544232034.0, + "step": 14270 + }, + { + "epoch": 1.8154178857651697, + "ewc_loss": 0.06926598399877548, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003410973586142063, + "grad_norm": 8.0167818069458, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8462613821029663, + "num_tokens": 544266242.0, + "step": 14271 + }, + { + "epoch": 1.8155450960437602, + "ewc_loss": 0.06937693059444427, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003422068548388779, + "grad_norm": 8.089099884033203, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8539071083068848, + "num_tokens": 544304995.0, + "step": 14272 + }, + { + "epoch": 1.8156723063223508, + "ewc_loss": 0.06924721598625183, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003409096098039299, + "grad_norm": 7.944732189178467, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8624378442764282, + "num_tokens": 544341950.0, + "step": 14273 + }, + { + "epoch": 1.8157995166009413, + "ewc_loss": 0.06951268017292023, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034356434480287135, + "grad_norm": 8.107138633728027, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8597339391708374, + "num_tokens": 544379535.0, + "step": 14274 + }, + { + "epoch": 1.8159267268795318, + "ewc_loss": 0.06888598203659058, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033973873360082507, + "grad_norm": 7.961828708648682, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8622662425041199, + "num_tokens": 544416939.0, + "step": 14275 + }, + { + "epoch": 1.8160539371581224, + "ewc_loss": 0.06956218183040619, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034405937185510993, + "grad_norm": 8.072702407836914, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8756389617919922, + "num_tokens": 544461476.0, + "step": 14276 + }, + { + "epoch": 1.8161811474367129, + "ewc_loss": 0.06905624270439148, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033899987465701997, + "grad_norm": 7.932920455932617, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8637735247612, + "num_tokens": 544505051.0, + "step": 14277 + }, + { + "epoch": 1.8163083577153034, + "ewc_loss": 0.06936326622962952, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034451158717274666, + "grad_norm": 8.05521297454834, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8624163866043091, + "num_tokens": 544550408.0, + "step": 14278 + }, + { + "epoch": 1.816435567993894, + "ewc_loss": 0.06929567456245422, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000341394217684865, + "grad_norm": 7.977922439575195, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8629158735275269, + "num_tokens": 544586645.0, + "step": 14279 + }, + { + "epoch": 1.8165627782724845, + "ewc_loss": 0.06936922669410706, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003445711627136916, + "grad_norm": 8.090263366699219, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.855187177658081, + "num_tokens": 544628087.0, + "step": 14280 + }, + { + "epoch": 1.816689988551075, + "ewc_loss": 0.06903143227100372, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003411932266317308, + "grad_norm": 7.999871253967285, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8775565028190613, + "num_tokens": 544667961.0, + "step": 14281 + }, + { + "epoch": 1.8168171988296655, + "ewc_loss": 0.06937899440526962, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034466886427253485, + "grad_norm": 8.126683235168457, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8607564568519592, + "num_tokens": 544704919.0, + "step": 14282 + }, + { + "epoch": 1.816944409108256, + "ewc_loss": 0.06900640577077866, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034094296279363334, + "grad_norm": 7.964818000793457, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8680324554443359, + "num_tokens": 544741421.0, + "step": 14283 + }, + { + "epoch": 1.8170716193868466, + "ewc_loss": 0.06947268545627594, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003456058038864285, + "grad_norm": 8.146551132202148, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8667587637901306, + "num_tokens": 544776414.0, + "step": 14284 + }, + { + "epoch": 1.817198829665437, + "ewc_loss": 0.06902291625738144, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003411080688238144, + "grad_norm": 8.017680168151855, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8567524552345276, + "num_tokens": 544814964.0, + "step": 14285 + }, + { + "epoch": 1.8173260399440276, + "ewc_loss": 0.0694209635257721, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003450885124038905, + "grad_norm": 8.094344139099121, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8507145047187805, + "num_tokens": 544853381.0, + "step": 14286 + }, + { + "epoch": 1.817453250222618, + "ewc_loss": 0.06915129721164703, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003423918387852609, + "grad_norm": 8.02964973449707, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8662660717964172, + "num_tokens": 544892474.0, + "step": 14287 + }, + { + "epoch": 1.8175804605012085, + "ewc_loss": 0.0692674070596695, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003435529361013323, + "grad_norm": 8.13118839263916, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8700525760650635, + "num_tokens": 544927667.0, + "step": 14288 + }, + { + "epoch": 1.817707670779799, + "ewc_loss": 0.06894718110561371, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003403507580514997, + "grad_norm": 8.067273139953613, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8691392540931702, + "num_tokens": 544959780.0, + "step": 14289 + }, + { + "epoch": 1.8178348810583895, + "ewc_loss": 0.06918157637119293, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003426946350373328, + "grad_norm": 8.09446907043457, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.854728639125824, + "num_tokens": 545001832.0, + "step": 14290 + }, + { + "epoch": 1.81796209133698, + "ewc_loss": 0.06903386116027832, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034121752833016217, + "grad_norm": 8.04620361328125, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.871301531791687, + "num_tokens": 545038489.0, + "step": 14291 + }, + { + "epoch": 1.8180893016155706, + "ewc_loss": 0.06912975013256073, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034217635402455926, + "grad_norm": 8.190912246704102, + "learning_rate": 1e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8429546356201172, + "num_tokens": 545069086.0, + "step": 14292 + }, + { + "epoch": 1.818216511894161, + "ewc_loss": 0.06895947456359863, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034047363442368805, + "grad_norm": 8.142870903015137, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8664660453796387, + "num_tokens": 545104564.0, + "step": 14293 + }, + { + "epoch": 1.8183437221727514, + "ewc_loss": 0.0689886212348938, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003407650801818818, + "grad_norm": 8.049347877502441, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8681899309158325, + "num_tokens": 545149036.0, + "step": 14294 + }, + { + "epoch": 1.818470932451342, + "ewc_loss": 0.0691029354929924, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034190824953839183, + "grad_norm": 8.093867301940918, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8751340508460999, + "num_tokens": 545183825.0, + "step": 14295 + }, + { + "epoch": 1.8185981427299325, + "ewc_loss": 0.06889894604682922, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033986836206167936, + "grad_norm": 8.053515434265137, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8669295907020569, + "num_tokens": 545221274.0, + "step": 14296 + }, + { + "epoch": 1.818725353008523, + "ewc_loss": 0.06912727653980255, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003421516448725015, + "grad_norm": 8.11701488494873, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8708263039588928, + "num_tokens": 545259242.0, + "step": 14297 + }, + { + "epoch": 1.8188525632871135, + "ewc_loss": 0.06897090375423431, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003405879542697221, + "grad_norm": 8.167738914489746, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8486596345901489, + "num_tokens": 545289380.0, + "step": 14298 + }, + { + "epoch": 1.818979773565704, + "ewc_loss": 0.06884543597698212, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033933325903490186, + "grad_norm": 8.017178535461426, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8666030168533325, + "num_tokens": 545321935.0, + "step": 14299 + }, + { + "epoch": 1.8191069838442946, + "ewc_loss": 0.06904961913824081, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003413750964682549, + "grad_norm": 8.029852867126465, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8700377941131592, + "num_tokens": 545365714.0, + "step": 14300 + }, + { + "epoch": 1.8192341941228851, + "ewc_loss": 0.06903167068958282, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003411955840419978, + "grad_norm": 8.080224990844727, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.853499710559845, + "num_tokens": 545406838.0, + "step": 14301 + }, + { + "epoch": 1.8193614044014756, + "ewc_loss": 0.06902576982975006, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000341136590577662, + "grad_norm": 8.01186466217041, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8598169684410095, + "num_tokens": 545443265.0, + "step": 14302 + }, + { + "epoch": 1.8194886146800662, + "ewc_loss": 0.06918555498123169, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003427344490773976, + "grad_norm": 8.082027435302734, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8733344078063965, + "num_tokens": 545479903.0, + "step": 14303 + }, + { + "epoch": 1.8196158249586567, + "ewc_loss": 0.06916358321905136, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003425147442612797, + "grad_norm": 8.09315299987793, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8704975843429565, + "num_tokens": 545513767.0, + "step": 14304 + }, + { + "epoch": 1.8197430352372472, + "ewc_loss": 0.06908214092254639, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000341700273565948, + "grad_norm": 8.084287643432617, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8485000133514404, + "num_tokens": 545550541.0, + "step": 14305 + }, + { + "epoch": 1.8198702455158378, + "ewc_loss": 0.06911203265190125, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003419991990085691, + "grad_norm": 8.097835540771484, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.864080548286438, + "num_tokens": 545590739.0, + "step": 14306 + }, + { + "epoch": 1.8199974557944283, + "ewc_loss": 0.06906058639287949, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034148478880524635, + "grad_norm": 8.117461204528809, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8489762544631958, + "num_tokens": 545627127.0, + "step": 14307 + }, + { + "epoch": 1.8201246660730188, + "ewc_loss": 0.06894996017217636, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034037852310575545, + "grad_norm": 8.00549602508545, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8676532506942749, + "num_tokens": 545665473.0, + "step": 14308 + }, + { + "epoch": 1.8202518763516093, + "ewc_loss": 0.06914877891540527, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034236666397191584, + "grad_norm": 8.091184616088867, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8791832327842712, + "num_tokens": 545708018.0, + "step": 14309 + }, + { + "epoch": 1.8203790866301999, + "ewc_loss": 0.06898430734872818, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034072197740897536, + "grad_norm": 8.05894660949707, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8683105111122131, + "num_tokens": 545745635.0, + "step": 14310 + }, + { + "epoch": 1.8205062969087902, + "ewc_loss": 0.06919467449188232, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034282563137821853, + "grad_norm": 8.067988395690918, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8528603911399841, + "num_tokens": 545785260.0, + "step": 14311 + }, + { + "epoch": 1.8206335071873807, + "ewc_loss": 0.06907637417316437, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034164267708547413, + "grad_norm": 8.066122055053711, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.860525369644165, + "num_tokens": 545821696.0, + "step": 14312 + }, + { + "epoch": 1.8207607174659712, + "ewc_loss": 0.06903963536024094, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034127524122595787, + "grad_norm": 8.01440143585205, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8771078586578369, + "num_tokens": 545857971.0, + "step": 14313 + }, + { + "epoch": 1.8208879277445618, + "ewc_loss": 0.06917740404605865, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003426529292482883, + "grad_norm": 8.090797424316406, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8531394004821777, + "num_tokens": 545891000.0, + "step": 14314 + }, + { + "epoch": 1.8210151380231523, + "ewc_loss": 0.06899282336235046, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003408071934245527, + "grad_norm": 8.050436019897461, + "learning_rate": 1e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8482239246368408, + "num_tokens": 545932825.0, + "step": 14315 + }, + { + "epoch": 1.8211423483017428, + "ewc_loss": 0.06947734951972961, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003432110243011266, + "grad_norm": 8.083771705627441, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8536325693130493, + "num_tokens": 545972849.0, + "step": 14316 + }, + { + "epoch": 1.8212695585803331, + "ewc_loss": 0.06936698406934738, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034210734884254634, + "grad_norm": 8.06700325012207, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8758097887039185, + "num_tokens": 546011492.0, + "step": 14317 + }, + { + "epoch": 1.8213967688589237, + "ewc_loss": 0.06939394772052765, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000342376995831728, + "grad_norm": 8.07952880859375, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8589327335357666, + "num_tokens": 546048461.0, + "step": 14318 + }, + { + "epoch": 1.8215239791375142, + "ewc_loss": 0.06937985867261887, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034223610418848693, + "grad_norm": 8.037969589233398, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8727964162826538, + "num_tokens": 546087113.0, + "step": 14319 + }, + { + "epoch": 1.8216511894161047, + "ewc_loss": 0.06939645856618881, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003424020833335817, + "grad_norm": 8.08523178100586, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8663202524185181, + "num_tokens": 546124413.0, + "step": 14320 + }, + { + "epoch": 1.8217783996946952, + "ewc_loss": 0.0691738873720169, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034017639700323343, + "grad_norm": 8.024230003356934, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8749008178710938, + "num_tokens": 546165763.0, + "step": 14321 + }, + { + "epoch": 1.8219056099732858, + "ewc_loss": 0.06935885548591614, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003420260618440807, + "grad_norm": 8.042764663696289, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.86455237865448, + "num_tokens": 546203218.0, + "step": 14322 + }, + { + "epoch": 1.8220328202518763, + "ewc_loss": 0.06903554499149323, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003412343212403357, + "grad_norm": 8.0142183303833, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.853313684463501, + "num_tokens": 546247610.0, + "step": 14323 + }, + { + "epoch": 1.8221600305304668, + "ewc_loss": 0.06941249966621399, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034256247454322875, + "grad_norm": 8.056234359741211, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.856981635093689, + "num_tokens": 546288952.0, + "step": 14324 + }, + { + "epoch": 1.8222872408090574, + "ewc_loss": 0.06898283213376999, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003407072217669338, + "grad_norm": 8.02852725982666, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8745976090431213, + "num_tokens": 546322449.0, + "step": 14325 + }, + { + "epoch": 1.8224144510876479, + "ewc_loss": 0.06906801462173462, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003415590908844024, + "grad_norm": 7.986547470092773, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8692618012428284, + "num_tokens": 546362692.0, + "step": 14326 + }, + { + "epoch": 1.8225416613662384, + "ewc_loss": 0.06914777308702469, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003423566522542387, + "grad_norm": 8.063597679138184, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8480856418609619, + "num_tokens": 546398856.0, + "step": 14327 + }, + { + "epoch": 1.822668871644829, + "ewc_loss": 0.06906178593635559, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003414967213757336, + "grad_norm": 8.03208065032959, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.856377363204956, + "num_tokens": 546439633.0, + "step": 14328 + }, + { + "epoch": 1.8227960819234195, + "ewc_loss": 0.06920622289180756, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003429410862736404, + "grad_norm": 8.062328338623047, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8679232001304626, + "num_tokens": 546477282.0, + "step": 14329 + }, + { + "epoch": 1.82292329220201, + "ewc_loss": 0.06909763813018799, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003418553387746215, + "grad_norm": 8.020597457885742, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8748325109481812, + "num_tokens": 546514584.0, + "step": 14330 + }, + { + "epoch": 1.8230505024806005, + "ewc_loss": 0.06935875117778778, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003444664180278778, + "grad_norm": 8.124557495117188, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8651215434074402, + "num_tokens": 546550761.0, + "step": 14331 + }, + { + "epoch": 1.823177712759191, + "ewc_loss": 0.0690617710351944, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003414966631680727, + "grad_norm": 8.021135330200195, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.849615216255188, + "num_tokens": 546594157.0, + "step": 14332 + }, + { + "epoch": 1.8233049230377816, + "ewc_loss": 0.06934197247028351, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003442986053414643, + "grad_norm": 8.134079933166504, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8729288578033447, + "num_tokens": 546634671.0, + "step": 14333 + }, + { + "epoch": 1.823432133316372, + "ewc_loss": 0.06892596185207367, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034013850381597877, + "grad_norm": 8.049867630004883, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.864018976688385, + "num_tokens": 546672084.0, + "step": 14334 + }, + { + "epoch": 1.8235593435949626, + "ewc_loss": 0.06919261068105698, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003428049967624247, + "grad_norm": 8.16172981262207, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8562772870063782, + "num_tokens": 546701920.0, + "step": 14335 + }, + { + "epoch": 1.823686553873553, + "ewc_loss": 0.06888199597597122, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033969886135309935, + "grad_norm": 8.000411033630371, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8674372434616089, + "num_tokens": 546735776.0, + "step": 14336 + }, + { + "epoch": 1.8238137641521435, + "ewc_loss": 0.06933103501796722, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003441892913542688, + "grad_norm": 8.091045379638672, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8691734075546265, + "num_tokens": 546772991.0, + "step": 14337 + }, + { + "epoch": 1.823940974430734, + "ewc_loss": 0.06894834339618683, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003403623413760215, + "grad_norm": 7.9991374015808105, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8576330542564392, + "num_tokens": 546805310.0, + "step": 14338 + }, + { + "epoch": 1.8240681847093245, + "ewc_loss": 0.06930505484342575, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034392945235595107, + "grad_norm": 8.094958305358887, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8699285387992859, + "num_tokens": 546838240.0, + "step": 14339 + }, + { + "epoch": 1.824195394987915, + "ewc_loss": 0.06901293992996216, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034100832999683917, + "grad_norm": 8.003957748413086, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8593404293060303, + "num_tokens": 546878484.0, + "step": 14340 + }, + { + "epoch": 1.8243226052665056, + "ewc_loss": 0.06948629021644592, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034330037306062877, + "grad_norm": 8.063456535339355, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8791683912277222, + "num_tokens": 546919747.0, + "step": 14341 + }, + { + "epoch": 1.8244498155450959, + "ewc_loss": 0.06902676820755005, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034114657319150865, + "grad_norm": 7.994370937347412, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8636435866355896, + "num_tokens": 546962614.0, + "step": 14342 + }, + { + "epoch": 1.8245770258236864, + "ewc_loss": 0.06920871138572693, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003429660282563418, + "grad_norm": 8.094828605651855, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8571615219116211, + "num_tokens": 547000059.0, + "step": 14343 + }, + { + "epoch": 1.824704236102277, + "ewc_loss": 0.06927768141031265, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034121429780498147, + "grad_norm": 7.998960018157959, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8658455014228821, + "num_tokens": 547037372.0, + "step": 14344 + }, + { + "epoch": 1.8248314463808675, + "ewc_loss": 0.06942719221115112, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034270939067937434, + "grad_norm": 8.093698501586914, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8510707020759583, + "num_tokens": 547071840.0, + "step": 14345 + }, + { + "epoch": 1.824958656659458, + "ewc_loss": 0.06915360689163208, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033997357240878046, + "grad_norm": 8.066373825073242, + "learning_rate": 1e-06, + "loss": 0.553, + "mean_token_accuracy": 0.8378794193267822, + "num_tokens": 547108916.0, + "step": 14346 + }, + { + "epoch": 1.8250858669380485, + "ewc_loss": 0.06947542726993561, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003431917866691947, + "grad_norm": 8.042757034301758, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8819822072982788, + "num_tokens": 547146991.0, + "step": 14347 + }, + { + "epoch": 1.825213077216639, + "ewc_loss": 0.06934157013893127, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003418531850911677, + "grad_norm": 8.074346542358398, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8747124671936035, + "num_tokens": 547184093.0, + "step": 14348 + }, + { + "epoch": 1.8253402874952296, + "ewc_loss": 0.06919096410274506, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034034717828035355, + "grad_norm": 8.004526138305664, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.869853138923645, + "num_tokens": 547217570.0, + "step": 14349 + }, + { + "epoch": 1.8254674977738201, + "ewc_loss": 0.0694727748632431, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003431652730796486, + "grad_norm": 8.071385383605957, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8615313172340393, + "num_tokens": 547259498.0, + "step": 14350 + }, + { + "epoch": 1.8255947080524106, + "ewc_loss": 0.06906327605247498, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003390702186152339, + "grad_norm": 7.984941482543945, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8630694150924683, + "num_tokens": 547298419.0, + "step": 14351 + }, + { + "epoch": 1.8257219183310012, + "ewc_loss": 0.06955711543560028, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034400870208628476, + "grad_norm": 8.052512168884277, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8718587160110474, + "num_tokens": 547342984.0, + "step": 14352 + }, + { + "epoch": 1.8258491286095917, + "ewc_loss": 0.06914056837558746, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003398431290406734, + "grad_norm": 8.082741737365723, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8642719984054565, + "num_tokens": 547383162.0, + "step": 14353 + }, + { + "epoch": 1.8259763388881822, + "ewc_loss": 0.06892429292201996, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034012182732112706, + "grad_norm": 8.044727325439453, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8849977254867554, + "num_tokens": 547416967.0, + "step": 14354 + }, + { + "epoch": 1.8261035491667728, + "ewc_loss": 0.06901279091835022, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034100684570148587, + "grad_norm": 8.044677734375, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8663010597229004, + "num_tokens": 547454695.0, + "step": 14355 + }, + { + "epoch": 1.8262307594453633, + "ewc_loss": 0.0689053013920784, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033993192482739687, + "grad_norm": 7.988145351409912, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8731530904769897, + "num_tokens": 547494258.0, + "step": 14356 + }, + { + "epoch": 1.8263579697239538, + "ewc_loss": 0.06908857822418213, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034176468034274876, + "grad_norm": 8.080890655517578, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8584609031677246, + "num_tokens": 547531404.0, + "step": 14357 + }, + { + "epoch": 1.8264851800025443, + "ewc_loss": 0.06888950616121292, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003397739492356777, + "grad_norm": 8.041111946105957, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.868617594242096, + "num_tokens": 547566553.0, + "step": 14358 + }, + { + "epoch": 1.8266123902811349, + "ewc_loss": 0.068989098072052, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034076994052156806, + "grad_norm": 8.017715454101562, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8555787801742554, + "num_tokens": 547605505.0, + "step": 14359 + }, + { + "epoch": 1.8267396005597252, + "ewc_loss": 0.06915721297264099, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003424510359764099, + "grad_norm": 8.029874801635742, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8711612224578857, + "num_tokens": 547643146.0, + "step": 14360 + }, + { + "epoch": 1.8268668108383157, + "ewc_loss": 0.06921225786209106, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034056007280014455, + "grad_norm": 8.036307334899902, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8616136908531189, + "num_tokens": 547683373.0, + "step": 14361 + }, + { + "epoch": 1.8269940211169062, + "ewc_loss": 0.06940841674804688, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034252164186909795, + "grad_norm": 8.082231521606445, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8583929538726807, + "num_tokens": 547716762.0, + "step": 14362 + }, + { + "epoch": 1.8271212313954968, + "ewc_loss": 0.06903517246246338, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034123065415769815, + "grad_norm": 8.038119316101074, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8706432580947876, + "num_tokens": 547756907.0, + "step": 14363 + }, + { + "epoch": 1.8272484416740873, + "ewc_loss": 0.06935639679431915, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034200146910734475, + "grad_norm": 8.062411308288574, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8591897487640381, + "num_tokens": 547793545.0, + "step": 14364 + }, + { + "epoch": 1.8273756519526778, + "ewc_loss": 0.06906134635210037, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003414923558011651, + "grad_norm": 8.058788299560547, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8545494079589844, + "num_tokens": 547836845.0, + "step": 14365 + }, + { + "epoch": 1.8275028622312681, + "ewc_loss": 0.06910395622253418, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003419184358790517, + "grad_norm": 8.04240608215332, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8707209825515747, + "num_tokens": 547875849.0, + "step": 14366 + }, + { + "epoch": 1.8276300725098586, + "ewc_loss": 0.06909210979938507, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003417999541852623, + "grad_norm": 8.053709030151367, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8573193550109863, + "num_tokens": 547918672.0, + "step": 14367 + }, + { + "epoch": 1.8277572827884492, + "ewc_loss": 0.06899547576904297, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003408336197026074, + "grad_norm": 8.087458610534668, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8513191938400269, + "num_tokens": 547958681.0, + "step": 14368 + }, + { + "epoch": 1.8278844930670397, + "ewc_loss": 0.06902695447206497, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034114846494048834, + "grad_norm": 8.048906326293945, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8577429056167603, + "num_tokens": 547996940.0, + "step": 14369 + }, + { + "epoch": 1.8280117033456302, + "ewc_loss": 0.06918678432703018, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003427467599976808, + "grad_norm": 8.098854064941406, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8727200031280518, + "num_tokens": 548030188.0, + "step": 14370 + }, + { + "epoch": 1.8281389136242208, + "ewc_loss": 0.0689966082572937, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003408449701964855, + "grad_norm": 8.039695739746094, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8789004683494568, + "num_tokens": 548066891.0, + "step": 14371 + }, + { + "epoch": 1.8282661239028113, + "ewc_loss": 0.0692179948091507, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034305884037166834, + "grad_norm": 8.030656814575195, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8689578175544739, + "num_tokens": 548104664.0, + "step": 14372 + }, + { + "epoch": 1.8283933341814018, + "ewc_loss": 0.06904610991477966, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034134002635255456, + "grad_norm": 8.101378440856934, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8587131500244141, + "num_tokens": 548143938.0, + "step": 14373 + }, + { + "epoch": 1.8285205444599923, + "ewc_loss": 0.06903314590454102, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003412103687878698, + "grad_norm": 8.022810935974121, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8625509738922119, + "num_tokens": 548183213.0, + "step": 14374 + }, + { + "epoch": 1.8286477547385829, + "ewc_loss": 0.06924863159656525, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034336524549871683, + "grad_norm": 8.09354305267334, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8477481603622437, + "num_tokens": 548221304.0, + "step": 14375 + }, + { + "epoch": 1.8287749650171734, + "ewc_loss": 0.06900511682033539, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034093004069291055, + "grad_norm": 8.015180587768555, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8614739775657654, + "num_tokens": 548261123.0, + "step": 14376 + }, + { + "epoch": 1.828902175295764, + "ewc_loss": 0.06932984292507172, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034417732967995107, + "grad_norm": 8.11375904083252, + "learning_rate": 1e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8427542448043823, + "num_tokens": 548305040.0, + "step": 14377 + }, + { + "epoch": 1.8290293855743545, + "ewc_loss": 0.06923139095306396, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034075145958922803, + "grad_norm": 8.050116539001465, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8710470199584961, + "num_tokens": 548338538.0, + "step": 14378 + }, + { + "epoch": 1.829156595852945, + "ewc_loss": 0.06925912946462631, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003434701939113438, + "grad_norm": 8.110179901123047, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8674064874649048, + "num_tokens": 548378510.0, + "step": 14379 + }, + { + "epoch": 1.8292838061315355, + "ewc_loss": 0.06913328915834427, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034221180249005556, + "grad_norm": 8.017770767211914, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8566110134124756, + "num_tokens": 548415790.0, + "step": 14380 + }, + { + "epoch": 1.829411016410126, + "ewc_loss": 0.06940658390522003, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034494479768909514, + "grad_norm": 8.152201652526855, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.854641854763031, + "num_tokens": 548447294.0, + "step": 14381 + }, + { + "epoch": 1.8295382266887166, + "ewc_loss": 0.06903193891048431, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003411983489058912, + "grad_norm": 8.039275169372559, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8706939816474915, + "num_tokens": 548484617.0, + "step": 14382 + }, + { + "epoch": 1.829665436967307, + "ewc_loss": 0.0693541094660759, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003444199974182993, + "grad_norm": 8.090836524963379, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8582731485366821, + "num_tokens": 548527552.0, + "step": 14383 + }, + { + "epoch": 1.8297926472458976, + "ewc_loss": 0.06902818381786346, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003411607176531106, + "grad_norm": 8.075124740600586, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8814753293991089, + "num_tokens": 548561322.0, + "step": 14384 + }, + { + "epoch": 1.829919857524488, + "ewc_loss": 0.06906014680862427, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034148036502301693, + "grad_norm": 8.053704261779785, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8742863535881042, + "num_tokens": 548594810.0, + "step": 14385 + }, + { + "epoch": 1.8300470678030785, + "ewc_loss": 0.06914019584655762, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003422808658797294, + "grad_norm": 8.053122520446777, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8557732105255127, + "num_tokens": 548631094.0, + "step": 14386 + }, + { + "epoch": 1.830174278081669, + "ewc_loss": 0.06914389133453369, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003423178568482399, + "grad_norm": 8.090004920959473, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8629090785980225, + "num_tokens": 548669162.0, + "step": 14387 + }, + { + "epoch": 1.8303014883602595, + "ewc_loss": 0.06922626495361328, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034314158256165683, + "grad_norm": 8.123621940612793, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8639445304870605, + "num_tokens": 548704555.0, + "step": 14388 + }, + { + "epoch": 1.83042869863885, + "ewc_loss": 0.06933113932609558, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003417489060666412, + "grad_norm": 8.033942222595215, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8777214288711548, + "num_tokens": 548741852.0, + "step": 14389 + }, + { + "epoch": 1.8305559089174406, + "ewc_loss": 0.06943519413471222, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003427894553169608, + "grad_norm": 8.037259101867676, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8518450260162354, + "num_tokens": 548788455.0, + "step": 14390 + }, + { + "epoch": 1.8306831191960309, + "ewc_loss": 0.06948989629745483, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003433364909142256, + "grad_norm": 8.09426498413086, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8713017702102661, + "num_tokens": 548826347.0, + "step": 14391 + }, + { + "epoch": 1.8308103294746214, + "ewc_loss": 0.06947135925292969, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034315112861804664, + "grad_norm": 8.060325622558594, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8569345474243164, + "num_tokens": 548865428.0, + "step": 14392 + }, + { + "epoch": 1.830937539753212, + "ewc_loss": 0.06960277259349823, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003444652247708291, + "grad_norm": 8.113224029541016, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8506312370300293, + "num_tokens": 548901671.0, + "step": 14393 + }, + { + "epoch": 1.8310647500318025, + "ewc_loss": 0.06907734274864197, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003416523686610162, + "grad_norm": 8.043827056884766, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8609643578529358, + "num_tokens": 548939234.0, + "step": 14394 + }, + { + "epoch": 1.831191960310393, + "ewc_loss": 0.06928569078445435, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003437357663642615, + "grad_norm": 8.091436386108398, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8678701519966125, + "num_tokens": 548975334.0, + "step": 14395 + }, + { + "epoch": 1.8313191705889835, + "ewc_loss": 0.06914074718952179, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003422863665036857, + "grad_norm": 8.038908958435059, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8582015037536621, + "num_tokens": 549016372.0, + "step": 14396 + }, + { + "epoch": 1.831446380867574, + "ewc_loss": 0.06928931176662445, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034377200063318014, + "grad_norm": 9.551874160766602, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8705404996871948, + "num_tokens": 549058049.0, + "step": 14397 + }, + { + "epoch": 1.8315735911461646, + "ewc_loss": 0.06880409270524979, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033891983912326396, + "grad_norm": 7.805264949798584, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8527625799179077, + "num_tokens": 549097855.0, + "step": 14398 + }, + { + "epoch": 1.831700801424755, + "ewc_loss": 0.07122780382633209, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036315692705102265, + "grad_norm": 8.507233619689941, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8830968141555786, + "num_tokens": 549130906.0, + "step": 14399 + }, + { + "epoch": 1.8318280117033456, + "ewc_loss": 0.06863507628440857, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033722963416948915, + "grad_norm": 7.864927291870117, + "learning_rate": 1e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8447120189666748, + "num_tokens": 549176574.0, + "step": 14400 + }, + { + "epoch": 1.8319552219819362, + "ewc_loss": 0.07103242725133896, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003612031869124621, + "grad_norm": 8.389999389648438, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8614210486412048, + "num_tokens": 549213945.0, + "step": 14401 + }, + { + "epoch": 1.8320824322605267, + "ewc_loss": 0.06912386417388916, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003421175351832062, + "grad_norm": 7.929131507873535, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8554607033729553, + "num_tokens": 549255843.0, + "step": 14402 + }, + { + "epoch": 1.8322096425391172, + "ewc_loss": 0.07077707350254059, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003562082420103252, + "grad_norm": 8.334452629089355, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8665111064910889, + "num_tokens": 549292573.0, + "step": 14403 + }, + { + "epoch": 1.8323368528177078, + "ewc_loss": 0.0695018619298935, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003434561367612332, + "grad_norm": 7.921742916107178, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8756847381591797, + "num_tokens": 549335291.0, + "step": 14404 + }, + { + "epoch": 1.8324640630962983, + "ewc_loss": 0.07048565149307251, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000353294046362862, + "grad_norm": 8.19953727722168, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8515856266021729, + "num_tokens": 549381337.0, + "step": 14405 + }, + { + "epoch": 1.8325912733748888, + "ewc_loss": 0.06962849199771881, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003447224444244057, + "grad_norm": 8.005241394042969, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8755903244018555, + "num_tokens": 549418180.0, + "step": 14406 + }, + { + "epoch": 1.8327184836534793, + "ewc_loss": 0.0701417326927185, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034985484671778977, + "grad_norm": 8.109319686889648, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8722488880157471, + "num_tokens": 549461623.0, + "step": 14407 + }, + { + "epoch": 1.8328456939320699, + "ewc_loss": 0.06971016526222229, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000345539185218513, + "grad_norm": 8.057364463806152, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8671271800994873, + "num_tokens": 549500791.0, + "step": 14408 + }, + { + "epoch": 1.8329729042106602, + "ewc_loss": 0.06984933465719223, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034693084307946265, + "grad_norm": 8.07572078704834, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8691998720169067, + "num_tokens": 549533343.0, + "step": 14409 + }, + { + "epoch": 1.8331001144892507, + "ewc_loss": 0.0697372704744339, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003458102291915566, + "grad_norm": 8.021724700927734, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8542287945747375, + "num_tokens": 549572707.0, + "step": 14410 + }, + { + "epoch": 1.8332273247678412, + "ewc_loss": 0.06996342539787292, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000348071800544858, + "grad_norm": 8.126913070678711, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8705434203147888, + "num_tokens": 549609772.0, + "step": 14411 + }, + { + "epoch": 1.8333545350464318, + "ewc_loss": 0.06955675780773163, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003440051223151386, + "grad_norm": 7.986910820007324, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.852367639541626, + "num_tokens": 549648626.0, + "step": 14412 + }, + { + "epoch": 1.8334817453250223, + "ewc_loss": 0.0698251873254776, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034668936859816313, + "grad_norm": 8.069660186767578, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8520021438598633, + "num_tokens": 549693211.0, + "step": 14413 + }, + { + "epoch": 1.8336089556036128, + "ewc_loss": 0.06962341070175171, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003446716582402587, + "grad_norm": 8.073485374450684, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8580750226974487, + "num_tokens": 549729463.0, + "step": 14414 + }, + { + "epoch": 1.8337361658822031, + "ewc_loss": 0.06964438408613205, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034488135133869946, + "grad_norm": 7.9870123863220215, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8695311546325684, + "num_tokens": 549769139.0, + "step": 14415 + }, + { + "epoch": 1.8338633761607936, + "ewc_loss": 0.06966236978769302, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003450611839070916, + "grad_norm": 8.008512496948242, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8671363592147827, + "num_tokens": 549807067.0, + "step": 14416 + }, + { + "epoch": 1.8339905864393842, + "ewc_loss": 0.06937462091445923, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034462515031918883, + "grad_norm": 8.048639297485352, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8629938960075378, + "num_tokens": 549842689.0, + "step": 14417 + }, + { + "epoch": 1.8341177967179747, + "ewc_loss": 0.06962913274765015, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003447288181632757, + "grad_norm": 8.0267915725708, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8820978403091431, + "num_tokens": 549876108.0, + "step": 14418 + }, + { + "epoch": 1.8342450069965652, + "ewc_loss": 0.06941412389278412, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003450201475061476, + "grad_norm": 8.026577949523926, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8778480291366577, + "num_tokens": 549909656.0, + "step": 14419 + }, + { + "epoch": 1.8343722172751558, + "ewc_loss": 0.06948588788509369, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034329635673202574, + "grad_norm": 8.007528305053711, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8554661273956299, + "num_tokens": 549949156.0, + "step": 14420 + }, + { + "epoch": 1.8344994275537463, + "ewc_loss": 0.06950065493583679, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003458854916971177, + "grad_norm": 7.994096279144287, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8705109357833862, + "num_tokens": 549990256.0, + "step": 14421 + }, + { + "epoch": 1.8346266378323368, + "ewc_loss": 0.06944547593593597, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034533371217548847, + "grad_norm": 8.017186164855957, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8715535402297974, + "num_tokens": 550023557.0, + "step": 14422 + }, + { + "epoch": 1.8347538481109273, + "ewc_loss": 0.06943510472774506, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003452299570199102, + "grad_norm": 7.993577003479004, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8600583076477051, + "num_tokens": 550069402.0, + "step": 14423 + }, + { + "epoch": 1.8348810583895179, + "ewc_loss": 0.06954729557037354, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034635188058018684, + "grad_norm": 7.994823455810547, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8713189363479614, + "num_tokens": 550110860.0, + "step": 14424 + }, + { + "epoch": 1.8350082686681084, + "ewc_loss": 0.06954213976860046, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034630033769644797, + "grad_norm": 8.02994441986084, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8481677770614624, + "num_tokens": 550151736.0, + "step": 14425 + }, + { + "epoch": 1.835135478946699, + "ewc_loss": 0.06951747834682465, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003460536536294967, + "grad_norm": 8.017725944519043, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8652534484863281, + "num_tokens": 550187731.0, + "step": 14426 + }, + { + "epoch": 1.8352626892252895, + "ewc_loss": 0.06946784257888794, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000345557346008718, + "grad_norm": 8.02968978881836, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8654307723045349, + "num_tokens": 550220545.0, + "step": 14427 + }, + { + "epoch": 1.83538989950388, + "ewc_loss": 0.06981255114078522, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003465630579739809, + "grad_norm": 8.075350761413574, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8668330311775208, + "num_tokens": 550254755.0, + "step": 14428 + }, + { + "epoch": 1.8355171097824705, + "ewc_loss": 0.06958723068237305, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003443098103161901, + "grad_norm": 7.9638495445251465, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8707422018051147, + "num_tokens": 550295191.0, + "step": 14429 + }, + { + "epoch": 1.835644320061061, + "ewc_loss": 0.06989783048629761, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003474158584140241, + "grad_norm": 8.07664966583252, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8571534156799316, + "num_tokens": 550332984.0, + "step": 14430 + }, + { + "epoch": 1.8357715303396516, + "ewc_loss": 0.06950322538614273, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034346975735388696, + "grad_norm": 7.971458435058594, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.866940975189209, + "num_tokens": 550377470.0, + "step": 14431 + }, + { + "epoch": 1.835898740618242, + "ewc_loss": 0.06997321546077728, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034816961851902306, + "grad_norm": 8.068292617797852, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8689931035041809, + "num_tokens": 550416123.0, + "step": 14432 + }, + { + "epoch": 1.8360259508968326, + "ewc_loss": 0.0695791244506836, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034422872704453766, + "grad_norm": 7.976253986358643, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.869378387928009, + "num_tokens": 550458062.0, + "step": 14433 + }, + { + "epoch": 1.836153161175423, + "ewc_loss": 0.06989915668964386, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034742910065688193, + "grad_norm": 8.081975936889648, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8571370840072632, + "num_tokens": 550496485.0, + "step": 14434 + }, + { + "epoch": 1.8362803714540135, + "ewc_loss": 0.06927452236413956, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003436241240706295, + "grad_norm": 7.927698612213135, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8655152320861816, + "num_tokens": 550536500.0, + "step": 14435 + }, + { + "epoch": 1.836407581732604, + "ewc_loss": 0.06989756971597672, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034985458478331566, + "grad_norm": 8.104053497314453, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8565344214439392, + "num_tokens": 550575611.0, + "step": 14436 + }, + { + "epoch": 1.8365347920111945, + "ewc_loss": 0.06920995563268661, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034297845559194684, + "grad_norm": 7.9575419425964355, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8555090427398682, + "num_tokens": 550616431.0, + "step": 14437 + }, + { + "epoch": 1.836662002289785, + "ewc_loss": 0.06984423100948334, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034932122798636556, + "grad_norm": 8.122749328613281, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8730247020721436, + "num_tokens": 550653626.0, + "step": 14438 + }, + { + "epoch": 1.8367892125683756, + "ewc_loss": 0.06927692145109177, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003436481347307563, + "grad_norm": 8.005573272705078, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8594106435775757, + "num_tokens": 550690899.0, + "step": 14439 + }, + { + "epoch": 1.8369164228469659, + "ewc_loss": 0.0701262354850769, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003472584066912532, + "grad_norm": 8.085648536682129, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8635395169258118, + "num_tokens": 550734408.0, + "step": 14440 + }, + { + "epoch": 1.8370436331255564, + "ewc_loss": 0.06928278505802155, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034370674984529614, + "grad_norm": 8.066987037658691, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8516454696655273, + "num_tokens": 550767888.0, + "step": 14441 + }, + { + "epoch": 1.837170843404147, + "ewc_loss": 0.06972894817590714, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034572696313261986, + "grad_norm": 8.004047393798828, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8652302026748657, + "num_tokens": 550807192.0, + "step": 14442 + }, + { + "epoch": 1.8372980536827375, + "ewc_loss": 0.06965991854667664, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034503667848184705, + "grad_norm": 8.022369384765625, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8719261884689331, + "num_tokens": 550848006.0, + "step": 14443 + }, + { + "epoch": 1.837425263961328, + "ewc_loss": 0.06948526948690414, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003457315906416625, + "grad_norm": 8.011104583740234, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8644034266471863, + "num_tokens": 550888350.0, + "step": 14444 + }, + { + "epoch": 1.8375524742399185, + "ewc_loss": 0.06972717493772507, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034570926800370216, + "grad_norm": 8.077285766601562, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8705453872680664, + "num_tokens": 550925389.0, + "step": 14445 + }, + { + "epoch": 1.837679684518509, + "ewc_loss": 0.06942711770534515, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000345150037901476, + "grad_norm": 7.971721649169922, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8691694140434265, + "num_tokens": 550962610.0, + "step": 14446 + }, + { + "epoch": 1.8378068947970996, + "ewc_loss": 0.06973129510879517, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003481919120531529, + "grad_norm": 8.093955993652344, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8614002466201782, + "num_tokens": 551001018.0, + "step": 14447 + }, + { + "epoch": 1.83793410507569, + "ewc_loss": 0.06967172026634216, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034515466541051865, + "grad_norm": 8.006362915039062, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8657206892967224, + "num_tokens": 551038505.0, + "step": 14448 + }, + { + "epoch": 1.8380613153542806, + "ewc_loss": 0.06969377398490906, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034781661815941334, + "grad_norm": 8.031048774719238, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8758167624473572, + "num_tokens": 551072235.0, + "step": 14449 + }, + { + "epoch": 1.8381885256328712, + "ewc_loss": 0.06978531926870346, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003462907043285668, + "grad_norm": 7.997146129608154, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.876162588596344, + "num_tokens": 551110129.0, + "step": 14450 + }, + { + "epoch": 1.8383157359114617, + "ewc_loss": 0.06989284604787827, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003473659744486213, + "grad_norm": 8.029294967651367, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8657547235488892, + "num_tokens": 551152380.0, + "step": 14451 + }, + { + "epoch": 1.8384429461900522, + "ewc_loss": 0.06961868703365326, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034706576843746006, + "grad_norm": 8.048196792602539, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8743669986724854, + "num_tokens": 551195942.0, + "step": 14452 + }, + { + "epoch": 1.8385701564686427, + "ewc_loss": 0.06985986232757568, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003470360825303942, + "grad_norm": 8.038418769836426, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8567484617233276, + "num_tokens": 551238502.0, + "step": 14453 + }, + { + "epoch": 1.8386973667472333, + "ewc_loss": 0.06961023062467575, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034698122180998325, + "grad_norm": 8.062640190124512, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8729562759399414, + "num_tokens": 551278091.0, + "step": 14454 + }, + { + "epoch": 1.8388245770258238, + "ewc_loss": 0.06946811079978943, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003455599944572896, + "grad_norm": 8.030762672424316, + "learning_rate": 1e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8435938358306885, + "num_tokens": 551314940.0, + "step": 14455 + }, + { + "epoch": 1.8389517873044143, + "ewc_loss": 0.06972630321979523, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034814196988008916, + "grad_norm": 8.082060813903809, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8500977158546448, + "num_tokens": 551356427.0, + "step": 14456 + }, + { + "epoch": 1.8390789975830049, + "ewc_loss": 0.06946957111358643, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034557460458017886, + "grad_norm": 8.001618385314941, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8688321113586426, + "num_tokens": 551400436.0, + "step": 14457 + }, + { + "epoch": 1.8392062078615952, + "ewc_loss": 0.06957810372114182, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034665994462557137, + "grad_norm": 8.030272483825684, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8623695373535156, + "num_tokens": 551438335.0, + "step": 14458 + }, + { + "epoch": 1.8393334181401857, + "ewc_loss": 0.06948558986186981, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034573476295918226, + "grad_norm": 8.045592308044434, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8660788536071777, + "num_tokens": 551471214.0, + "step": 14459 + }, + { + "epoch": 1.8394606284187762, + "ewc_loss": 0.06952977180480957, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034617664641700685, + "grad_norm": 8.041812896728516, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.850591778755188, + "num_tokens": 551504071.0, + "step": 14460 + }, + { + "epoch": 1.8395878386973668, + "ewc_loss": 0.06957657635211945, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034664463601075113, + "grad_norm": 8.061309814453125, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8586714267730713, + "num_tokens": 551538456.0, + "step": 14461 + }, + { + "epoch": 1.8397150489759573, + "ewc_loss": 0.06949736177921295, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034585254616104066, + "grad_norm": 8.024201393127441, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8683983087539673, + "num_tokens": 551575334.0, + "step": 14462 + }, + { + "epoch": 1.8398422592545478, + "ewc_loss": 0.06955709308385849, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003464498440735042, + "grad_norm": 8.042325019836426, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8725447654724121, + "num_tokens": 551617127.0, + "step": 14463 + }, + { + "epoch": 1.8399694695331381, + "ewc_loss": 0.06935758888721466, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034445480559952557, + "grad_norm": 8.017901420593262, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8688398003578186, + "num_tokens": 551655548.0, + "step": 14464 + }, + { + "epoch": 1.8400966798117286, + "ewc_loss": 0.0697268694639206, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034570618299767375, + "grad_norm": 8.015640258789062, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8607376217842102, + "num_tokens": 551691807.0, + "step": 14465 + }, + { + "epoch": 1.8402238900903192, + "ewc_loss": 0.06949456036090851, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003458245482761413, + "grad_norm": 8.070022583007812, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8724075555801392, + "num_tokens": 551729805.0, + "step": 14466 + }, + { + "epoch": 1.8403511003689097, + "ewc_loss": 0.06943345069885254, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003452134260442108, + "grad_norm": 7.9846882820129395, + "learning_rate": 1e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8418068289756775, + "num_tokens": 551775220.0, + "step": 14467 + }, + { + "epoch": 1.8404783106475002, + "ewc_loss": 0.06994473189115524, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003478848375380039, + "grad_norm": 8.0584077835083, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8555889129638672, + "num_tokens": 551816242.0, + "step": 14468 + }, + { + "epoch": 1.8406055209260908, + "ewc_loss": 0.0695558413863182, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034399592550471425, + "grad_norm": 7.989086151123047, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8530007600784302, + "num_tokens": 551854929.0, + "step": 14469 + }, + { + "epoch": 1.8407327312046813, + "ewc_loss": 0.06987440586090088, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003471815725788474, + "grad_norm": 8.041299819946289, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8621885180473328, + "num_tokens": 551898877.0, + "step": 14470 + }, + { + "epoch": 1.8408599414832718, + "ewc_loss": 0.06969098746776581, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003453474200796336, + "grad_norm": 8.03099250793457, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8805242776870728, + "num_tokens": 551934252.0, + "step": 14471 + }, + { + "epoch": 1.8409871517618623, + "ewc_loss": 0.06985396146774292, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003469770890660584, + "grad_norm": 8.013276100158691, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8755836486816406, + "num_tokens": 551974779.0, + "step": 14472 + }, + { + "epoch": 1.8411143620404529, + "ewc_loss": 0.06983690708875656, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003468065697234124, + "grad_norm": 8.085017204284668, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8636382818222046, + "num_tokens": 552011627.0, + "step": 14473 + }, + { + "epoch": 1.8412415723190434, + "ewc_loss": 0.06971806287765503, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034561814391054213, + "grad_norm": 8.044017791748047, + "learning_rate": 1e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8427045941352844, + "num_tokens": 552045333.0, + "step": 14474 + }, + { + "epoch": 1.841368782597634, + "ewc_loss": 0.06980830430984497, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003465205372776836, + "grad_norm": 8.09952163696289, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8571901321411133, + "num_tokens": 552079974.0, + "step": 14475 + }, + { + "epoch": 1.8414959928762245, + "ewc_loss": 0.06959889829158783, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034442642936483026, + "grad_norm": 8.029241561889648, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8707908987998962, + "num_tokens": 552117507.0, + "step": 14476 + }, + { + "epoch": 1.841623203154815, + "ewc_loss": 0.0697975754737854, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003464132023509592, + "grad_norm": 8.053112983703613, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8636511564254761, + "num_tokens": 552156009.0, + "step": 14477 + }, + { + "epoch": 1.8417504134334055, + "ewc_loss": 0.06965488195419312, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034498629975132644, + "grad_norm": 8.046915054321289, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.872020423412323, + "num_tokens": 552189092.0, + "step": 14478 + }, + { + "epoch": 1.841877623711996, + "ewc_loss": 0.06967654079198837, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003452029195614159, + "grad_norm": 8.0292329788208, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8606313467025757, + "num_tokens": 552228305.0, + "step": 14479 + }, + { + "epoch": 1.8420048339905866, + "ewc_loss": 0.06971343606710434, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003455718688201159, + "grad_norm": 8.044759750366211, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8652012348175049, + "num_tokens": 552269402.0, + "step": 14480 + }, + { + "epoch": 1.842132044269177, + "ewc_loss": 0.06963825225830078, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034482000046409667, + "grad_norm": 8.036368370056152, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8603197336196899, + "num_tokens": 552304870.0, + "step": 14481 + }, + { + "epoch": 1.8422592545477676, + "ewc_loss": 0.06978540122509003, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003462914901319891, + "grad_norm": 8.08082103729248, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8586466312408447, + "num_tokens": 552336802.0, + "step": 14482 + }, + { + "epoch": 1.842386464826358, + "ewc_loss": 0.06955248862504959, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034396236878819764, + "grad_norm": 8.033833503723145, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8767298460006714, + "num_tokens": 552376124.0, + "step": 14483 + }, + { + "epoch": 1.8425136751049485, + "ewc_loss": 0.06977443397045135, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003461818560026586, + "grad_norm": 8.05894947052002, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8553062677383423, + "num_tokens": 552412285.0, + "step": 14484 + }, + { + "epoch": 1.842640885383539, + "ewc_loss": 0.06955895572900772, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034402706660330296, + "grad_norm": 8.0242280960083, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8635700941085815, + "num_tokens": 552447527.0, + "step": 14485 + }, + { + "epoch": 1.8427680956621295, + "ewc_loss": 0.0697423443198204, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003458609280642122, + "grad_norm": 8.045662879943848, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8664031028747559, + "num_tokens": 552486569.0, + "step": 14486 + }, + { + "epoch": 1.84289530594072, + "ewc_loss": 0.06964185833930969, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003448560310062021, + "grad_norm": 8.027339935302734, + "learning_rate": 1e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.8286218643188477, + "num_tokens": 552529158.0, + "step": 14487 + }, + { + "epoch": 1.8430225162193106, + "ewc_loss": 0.06969218701124191, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003453593817539513, + "grad_norm": 8.073813438415527, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8619396686553955, + "num_tokens": 552572183.0, + "step": 14488 + }, + { + "epoch": 1.8431497264979009, + "ewc_loss": 0.0696120411157608, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034455794957466424, + "grad_norm": 7.995705604553223, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8623402118682861, + "num_tokens": 552609785.0, + "step": 14489 + }, + { + "epoch": 1.8432769367764914, + "ewc_loss": 0.06974048912525177, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034584238892421126, + "grad_norm": 8.08930778503418, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.870903491973877, + "num_tokens": 552641243.0, + "step": 14490 + }, + { + "epoch": 1.843404147055082, + "ewc_loss": 0.06950047612190247, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003434422833379358, + "grad_norm": 8.010672569274902, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8558441996574402, + "num_tokens": 552677595.0, + "step": 14491 + }, + { + "epoch": 1.8435313573336725, + "ewc_loss": 0.06975699961185455, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003460074949543923, + "grad_norm": 8.05871295928955, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8650785684585571, + "num_tokens": 552715492.0, + "step": 14492 + }, + { + "epoch": 1.843658567612263, + "ewc_loss": 0.06955370306968689, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034397447598166764, + "grad_norm": 7.979065418243408, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8644105195999146, + "num_tokens": 552757410.0, + "step": 14493 + }, + { + "epoch": 1.8437857778908535, + "ewc_loss": 0.06985984742641449, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003470359370112419, + "grad_norm": 8.097167015075684, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8799707889556885, + "num_tokens": 552793794.0, + "step": 14494 + }, + { + "epoch": 1.843912988169444, + "ewc_loss": 0.06950055062770844, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003434430109336972, + "grad_norm": 7.965181350708008, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8503301739692688, + "num_tokens": 552836010.0, + "step": 14495 + }, + { + "epoch": 1.8440401984480346, + "ewc_loss": 0.06989648938179016, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003474024124443531, + "grad_norm": 8.139986038208008, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8664125204086304, + "num_tokens": 552874136.0, + "step": 14496 + }, + { + "epoch": 1.844167408726625, + "ewc_loss": 0.06945135444402695, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003429510397836566, + "grad_norm": 8.010162353515625, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8596584796905518, + "num_tokens": 552912314.0, + "step": 14497 + }, + { + "epoch": 1.8442946190052156, + "ewc_loss": 0.069862961769104, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003470671072136611, + "grad_norm": 8.136268615722656, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8522580862045288, + "num_tokens": 552953998.0, + "step": 14498 + }, + { + "epoch": 1.8444218292838062, + "ewc_loss": 0.0693039745092392, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034147725091315806, + "grad_norm": 7.99678897857666, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8839786648750305, + "num_tokens": 552987310.0, + "step": 14499 + }, + { + "epoch": 1.8445490395623967, + "ewc_loss": 0.06992881745100021, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034772566868923604, + "grad_norm": 8.17697525024414, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8609642386436462, + "num_tokens": 553019830.0, + "step": 14500 + }, + { + "epoch": 1.8446762498409872, + "ewc_loss": 0.06937192380428314, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034215670893900096, + "grad_norm": 7.965145111083984, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8630682229995728, + "num_tokens": 553066695.0, + "step": 14501 + }, + { + "epoch": 1.8448034601195777, + "ewc_loss": 0.06982441991567612, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000346681714290753, + "grad_norm": 8.162619590759277, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8543332815170288, + "num_tokens": 553098943.0, + "step": 14502 + }, + { + "epoch": 1.8449306703981683, + "ewc_loss": 0.06937117129564285, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003421492292545736, + "grad_norm": 7.999124526977539, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8685064315795898, + "num_tokens": 553136106.0, + "step": 14503 + }, + { + "epoch": 1.8450578806767588, + "ewc_loss": 0.06976928561925888, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003461303422227502, + "grad_norm": 8.100921630859375, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8696166276931763, + "num_tokens": 553170014.0, + "step": 14504 + }, + { + "epoch": 1.8451850909553493, + "ewc_loss": 0.06948515772819519, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003432890516705811, + "grad_norm": 8.03184986114502, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8579307794570923, + "num_tokens": 553213297.0, + "step": 14505 + }, + { + "epoch": 1.8453123012339399, + "ewc_loss": 0.06961382925510406, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034457581932656467, + "grad_norm": 8.099891662597656, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8649068474769592, + "num_tokens": 553248559.0, + "step": 14506 + }, + { + "epoch": 1.8454395115125302, + "ewc_loss": 0.06959928572177887, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034443032927811146, + "grad_norm": 8.103718757629395, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8654991388320923, + "num_tokens": 553289140.0, + "step": 14507 + }, + { + "epoch": 1.8455667217911207, + "ewc_loss": 0.06947914510965347, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034322895226068795, + "grad_norm": 8.03091049194336, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8573650121688843, + "num_tokens": 553330619.0, + "step": 14508 + }, + { + "epoch": 1.8456939320697112, + "ewc_loss": 0.0697009265422821, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034544680966064334, + "grad_norm": 8.075499534606934, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8738369941711426, + "num_tokens": 553371295.0, + "step": 14509 + }, + { + "epoch": 1.8458211423483017, + "ewc_loss": 0.06944756209850311, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034291314659640193, + "grad_norm": 8.012229919433594, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8597694039344788, + "num_tokens": 553410237.0, + "step": 14510 + }, + { + "epoch": 1.8459483526268923, + "ewc_loss": 0.06974884867668152, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034592594602145255, + "grad_norm": 8.07804012298584, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8712677955627441, + "num_tokens": 553446727.0, + "step": 14511 + }, + { + "epoch": 1.8460755629054828, + "ewc_loss": 0.0694977417588234, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003434149257373065, + "grad_norm": 8.03720474243164, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8696308732032776, + "num_tokens": 553488626.0, + "step": 14512 + }, + { + "epoch": 1.846202773184073, + "ewc_loss": 0.06975750625133514, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003460125590208918, + "grad_norm": 8.08107852935791, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8556153774261475, + "num_tokens": 553528294.0, + "step": 14513 + }, + { + "epoch": 1.8463299834626636, + "ewc_loss": 0.06943825632333755, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003428200725466013, + "grad_norm": 8.054515838623047, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8674829006195068, + "num_tokens": 553563738.0, + "step": 14514 + }, + { + "epoch": 1.8464571937412542, + "ewc_loss": 0.06966093927621841, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034504689392633736, + "grad_norm": 8.083972930908203, + "learning_rate": 1e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.82939213514328, + "num_tokens": 553603351.0, + "step": 14515 + }, + { + "epoch": 1.8465844040198447, + "ewc_loss": 0.06952688097953796, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003437062550801784, + "grad_norm": 8.010270118713379, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8624908328056335, + "num_tokens": 553643655.0, + "step": 14516 + }, + { + "epoch": 1.8467116142984352, + "ewc_loss": 0.06971791386604309, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034561665961518884, + "grad_norm": 8.098991394042969, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8756352066993713, + "num_tokens": 553677944.0, + "step": 14517 + }, + { + "epoch": 1.8468388245770258, + "ewc_loss": 0.06946446746587753, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034308218164369464, + "grad_norm": 7.999563217163086, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8578505516052246, + "num_tokens": 553712120.0, + "step": 14518 + }, + { + "epoch": 1.8469660348556163, + "ewc_loss": 0.06978405267000198, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034627801505848765, + "grad_norm": 8.04432201385498, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8682755827903748, + "num_tokens": 553751442.0, + "step": 14519 + }, + { + "epoch": 1.8470932451342068, + "ewc_loss": 0.0694698691368103, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034313619835302234, + "grad_norm": 8.049947738647461, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8706421852111816, + "num_tokens": 553782924.0, + "step": 14520 + }, + { + "epoch": 1.8472204554127973, + "ewc_loss": 0.06965567171573639, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003449942159932107, + "grad_norm": 8.025655746459961, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8554455041885376, + "num_tokens": 553818861.0, + "step": 14521 + }, + { + "epoch": 1.8473476656913879, + "ewc_loss": 0.06960899382829666, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003445274487603456, + "grad_norm": 8.030830383300781, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.861892580986023, + "num_tokens": 553855593.0, + "step": 14522 + }, + { + "epoch": 1.8474748759699784, + "ewc_loss": 0.06950858235359192, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034352330840192735, + "grad_norm": 7.974398136138916, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8626687526702881, + "num_tokens": 553896028.0, + "step": 14523 + }, + { + "epoch": 1.847602086248569, + "ewc_loss": 0.06960859894752502, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000344523461535573, + "grad_norm": 8.019519805908203, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.867666482925415, + "num_tokens": 553935212.0, + "step": 14524 + }, + { + "epoch": 1.8477292965271594, + "ewc_loss": 0.06957860291004181, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034422348835505545, + "grad_norm": 8.003742218017578, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8625422716140747, + "num_tokens": 553971231.0, + "step": 14525 + }, + { + "epoch": 1.84785650680575, + "ewc_loss": 0.06991858780384064, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003451819939073175, + "grad_norm": 8.01364803314209, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8732492327690125, + "num_tokens": 554006782.0, + "step": 14526 + }, + { + "epoch": 1.8479837170843405, + "ewc_loss": 0.069687619805336, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003453136596363038, + "grad_norm": 8.051151275634766, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8580735921859741, + "num_tokens": 554043427.0, + "step": 14527 + }, + { + "epoch": 1.848110927362931, + "ewc_loss": 0.06962641328573227, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034470163518562913, + "grad_norm": 8.030097007751465, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8530333042144775, + "num_tokens": 554084187.0, + "step": 14528 + }, + { + "epoch": 1.8482381376415216, + "ewc_loss": 0.0697619616985321, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003460570878814906, + "grad_norm": 8.053154945373535, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8635913729667664, + "num_tokens": 554122186.0, + "step": 14529 + }, + { + "epoch": 1.848365347920112, + "ewc_loss": 0.06954796612262726, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034391714143566787, + "grad_norm": 8.024726867675781, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.86324143409729, + "num_tokens": 554161388.0, + "step": 14530 + }, + { + "epoch": 1.8484925581987026, + "ewc_loss": 0.06976119428873062, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034604943357408047, + "grad_norm": 8.060375213623047, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8561936616897583, + "num_tokens": 554199837.0, + "step": 14531 + }, + { + "epoch": 1.848619768477293, + "ewc_loss": 0.0696360319852829, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003447977942414582, + "grad_norm": 8.039947509765625, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8530324697494507, + "num_tokens": 554239567.0, + "step": 14532 + }, + { + "epoch": 1.8487469787558835, + "ewc_loss": 0.069760262966156, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034604009124450386, + "grad_norm": 8.083921432495117, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8671073317527771, + "num_tokens": 554277040.0, + "step": 14533 + }, + { + "epoch": 1.848874189034474, + "ewc_loss": 0.06958532333374023, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034429074730724096, + "grad_norm": 8.00967788696289, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8684614896774292, + "num_tokens": 554318994.0, + "step": 14534 + }, + { + "epoch": 1.8490013993130645, + "ewc_loss": 0.0698096975684166, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003465344780124724, + "grad_norm": 8.105474472045898, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8576717376708984, + "num_tokens": 554355334.0, + "step": 14535 + }, + { + "epoch": 1.849128609591655, + "ewc_loss": 0.06963321566581726, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003447696508374065, + "grad_norm": 8.030020713806152, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8524689078330994, + "num_tokens": 554396504.0, + "step": 14536 + }, + { + "epoch": 1.8492558198702456, + "ewc_loss": 0.06977474689483643, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000346184941008687, + "grad_norm": 8.088034629821777, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8698530197143555, + "num_tokens": 554429772.0, + "step": 14537 + }, + { + "epoch": 1.8493830301488359, + "ewc_loss": 0.0695776492357254, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034421400050632656, + "grad_norm": 8.068497657775879, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8656253814697266, + "num_tokens": 554470102.0, + "step": 14538 + }, + { + "epoch": 1.8495102404274264, + "ewc_loss": 0.06971755623817444, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034561307984404266, + "grad_norm": 8.058073043823242, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8545761108398438, + "num_tokens": 554511632.0, + "step": 14539 + }, + { + "epoch": 1.849637450706017, + "ewc_loss": 0.0695384293794632, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034382176818326116, + "grad_norm": 8.034316062927246, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8596358299255371, + "num_tokens": 554553028.0, + "step": 14540 + }, + { + "epoch": 1.8497646609846075, + "ewc_loss": 0.06961603462696075, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003445978800300509, + "grad_norm": 8.098933219909668, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8585977554321289, + "num_tokens": 554585109.0, + "step": 14541 + }, + { + "epoch": 1.849891871263198, + "ewc_loss": 0.06960444152355194, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003444819594733417, + "grad_norm": 8.099286079406738, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8640053272247314, + "num_tokens": 554621141.0, + "step": 14542 + }, + { + "epoch": 1.8500190815417885, + "ewc_loss": 0.06950289011001587, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000343466381309554, + "grad_norm": 8.012063980102539, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8622242212295532, + "num_tokens": 554660838.0, + "step": 14543 + }, + { + "epoch": 1.850146291820379, + "ewc_loss": 0.06974254548549652, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003458629362285137, + "grad_norm": 8.138060569763184, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.865068256855011, + "num_tokens": 554696338.0, + "step": 14544 + }, + { + "epoch": 1.8502735020989696, + "ewc_loss": 0.0693717822432518, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003421553410589695, + "grad_norm": 8.04751968383789, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8760794401168823, + "num_tokens": 554729782.0, + "step": 14545 + }, + { + "epoch": 1.85040071237756, + "ewc_loss": 0.06964312493801117, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034486877848394215, + "grad_norm": 8.061942100524902, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8799049258232117, + "num_tokens": 554767566.0, + "step": 14546 + }, + { + "epoch": 1.8505279226561506, + "ewc_loss": 0.0694449320435524, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003428868076298386, + "grad_norm": 8.205489158630371, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.868767261505127, + "num_tokens": 554806063.0, + "step": 14547 + }, + { + "epoch": 1.8506551329347412, + "ewc_loss": 0.06941454112529755, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034258293453603983, + "grad_norm": 8.058433532714844, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8604558110237122, + "num_tokens": 554839418.0, + "step": 14548 + }, + { + "epoch": 1.8507823432133317, + "ewc_loss": 0.06965526938438416, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003449902287684381, + "grad_norm": 8.131197929382324, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8595141172409058, + "num_tokens": 554881888.0, + "step": 14549 + }, + { + "epoch": 1.8509095534919222, + "ewc_loss": 0.06923559308052063, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003407933982089162, + "grad_norm": 8.00996208190918, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8817787170410156, + "num_tokens": 554921561.0, + "step": 14550 + }, + { + "epoch": 1.8510367637705127, + "ewc_loss": 0.06958821415901184, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003443196474108845, + "grad_norm": 8.079444885253906, + "learning_rate": 1e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8394472002983093, + "num_tokens": 554954380.0, + "step": 14551 + }, + { + "epoch": 1.8511639740491033, + "ewc_loss": 0.06935422122478485, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000341979757649824, + "grad_norm": 8.006089210510254, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8654974699020386, + "num_tokens": 554994781.0, + "step": 14552 + }, + { + "epoch": 1.8512911843276938, + "ewc_loss": 0.0696752667427063, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003451901429798454, + "grad_norm": 8.11003589630127, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8711285591125488, + "num_tokens": 555024938.0, + "step": 14553 + }, + { + "epoch": 1.8514183946062843, + "ewc_loss": 0.06936278939247131, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034206543932668865, + "grad_norm": 8.058974266052246, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8526003956794739, + "num_tokens": 555059785.0, + "step": 14554 + }, + { + "epoch": 1.8515456048848749, + "ewc_loss": 0.06953103095293045, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003437478153500706, + "grad_norm": 8.092900276184082, + "learning_rate": 1e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.8387444019317627, + "num_tokens": 555097502.0, + "step": 14555 + }, + { + "epoch": 1.8516728151634652, + "ewc_loss": 0.06940515339374542, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034248907468281686, + "grad_norm": 8.093788146972656, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8606957197189331, + "num_tokens": 555139976.0, + "step": 14556 + }, + { + "epoch": 1.8518000254420557, + "ewc_loss": 0.06939826160669327, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034242012770846486, + "grad_norm": 8.033891677856445, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8676242828369141, + "num_tokens": 555181250.0, + "step": 14557 + }, + { + "epoch": 1.8519272357206462, + "ewc_loss": 0.06951095908880234, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003435470862314105, + "grad_norm": 8.141847610473633, + "learning_rate": 1e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8448142409324646, + "num_tokens": 555214927.0, + "step": 14558 + }, + { + "epoch": 1.8520544459992367, + "ewc_loss": 0.06928792595863342, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034131674328818917, + "grad_norm": 8.001157760620117, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8600946664810181, + "num_tokens": 555255795.0, + "step": 14559 + }, + { + "epoch": 1.8521816562778273, + "ewc_loss": 0.06972077488899231, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034564523957669735, + "grad_norm": 8.166868209838867, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8662936687469482, + "num_tokens": 555294177.0, + "step": 14560 + }, + { + "epoch": 1.8523088665564178, + "ewc_loss": 0.06913119554519653, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003397494729142636, + "grad_norm": 7.957223415374756, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8765348196029663, + "num_tokens": 555331548.0, + "step": 14561 + }, + { + "epoch": 1.852436076835008, + "ewc_loss": 0.06981116533279419, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034654917544685304, + "grad_norm": 8.120611190795898, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8750371932983398, + "num_tokens": 555368426.0, + "step": 14562 + }, + { + "epoch": 1.8525632871135986, + "ewc_loss": 0.06921215355396271, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003405590250622481, + "grad_norm": 7.948972702026367, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8796166777610779, + "num_tokens": 555411594.0, + "step": 14563 + }, + { + "epoch": 1.8526904973921892, + "ewc_loss": 0.06992064416408539, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003476438869256526, + "grad_norm": 8.20827579498291, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8564670085906982, + "num_tokens": 555450412.0, + "step": 14564 + }, + { + "epoch": 1.8528177076707797, + "ewc_loss": 0.06922377645969391, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034067523665726185, + "grad_norm": 8.004594802856445, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8722189664840698, + "num_tokens": 555489067.0, + "step": 14565 + }, + { + "epoch": 1.8529449179493702, + "ewc_loss": 0.06983289867639542, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034676649374887347, + "grad_norm": 8.181422233581543, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8697543740272522, + "num_tokens": 555524552.0, + "step": 14566 + }, + { + "epoch": 1.8530721282279607, + "ewc_loss": 0.0691511332988739, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003399488050490618, + "grad_norm": 7.982777118682861, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8381005525588989, + "num_tokens": 555563766.0, + "step": 14567 + }, + { + "epoch": 1.8531993385065513, + "ewc_loss": 0.0698164775967598, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003466022608336061, + "grad_norm": 8.15762996673584, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8712698817253113, + "num_tokens": 555601410.0, + "step": 14568 + }, + { + "epoch": 1.8533265487851418, + "ewc_loss": 0.0693875253200531, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034231276367790997, + "grad_norm": 8.076441764831543, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8625989556312561, + "num_tokens": 555641346.0, + "step": 14569 + }, + { + "epoch": 1.8534537590637323, + "ewc_loss": 0.06969867646694183, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034542425419203937, + "grad_norm": 8.081250190734863, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8741957545280457, + "num_tokens": 555676537.0, + "step": 14570 + }, + { + "epoch": 1.8535809693423229, + "ewc_loss": 0.06958021223545074, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000344239582773298, + "grad_norm": 8.077189445495605, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8497506380081177, + "num_tokens": 555721634.0, + "step": 14571 + }, + { + "epoch": 1.8537081796209134, + "ewc_loss": 0.06956931948661804, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003441307053435594, + "grad_norm": 8.071176528930664, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8647428750991821, + "num_tokens": 555765447.0, + "step": 14572 + }, + { + "epoch": 1.853835389899504, + "ewc_loss": 0.06953296065330505, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003437671111896634, + "grad_norm": 8.178522109985352, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8622636795043945, + "num_tokens": 555798598.0, + "step": 14573 + }, + { + "epoch": 1.8539626001780944, + "ewc_loss": 0.06930582225322723, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034149570274166763, + "grad_norm": 8.095816612243652, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.871145486831665, + "num_tokens": 555829465.0, + "step": 14574 + }, + { + "epoch": 1.854089810456685, + "ewc_loss": 0.06950469315052032, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003434843965806067, + "grad_norm": 8.09144401550293, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8652502298355103, + "num_tokens": 555866577.0, + "step": 14575 + }, + { + "epoch": 1.8542170207352755, + "ewc_loss": 0.06933248043060303, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003417623520363122, + "grad_norm": 8.082426071166992, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8663628101348877, + "num_tokens": 555900472.0, + "step": 14576 + }, + { + "epoch": 1.854344231013866, + "ewc_loss": 0.06948093324899673, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034324685111641884, + "grad_norm": 8.113859176635742, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8576177358627319, + "num_tokens": 555936356.0, + "step": 14577 + }, + { + "epoch": 1.8544714412924566, + "ewc_loss": 0.06944975256919861, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003429350326769054, + "grad_norm": 8.091495513916016, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8569642305374146, + "num_tokens": 555975338.0, + "step": 14578 + }, + { + "epoch": 1.854598651571047, + "ewc_loss": 0.06952647864818573, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034370223875157535, + "grad_norm": 8.092536926269531, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8533814549446106, + "num_tokens": 556019651.0, + "step": 14579 + }, + { + "epoch": 1.8547258618496376, + "ewc_loss": 0.069483682513237, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034327432513237, + "grad_norm": 8.092103004455566, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8592755794525146, + "num_tokens": 556054530.0, + "step": 14580 + }, + { + "epoch": 1.854853072128228, + "ewc_loss": 0.06965269893407822, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034496450098231435, + "grad_norm": 8.10076904296875, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8593903183937073, + "num_tokens": 556091985.0, + "step": 14581 + }, + { + "epoch": 1.8549802824068184, + "ewc_loss": 0.06961330771446228, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034457052242942154, + "grad_norm": 8.125627517700195, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.867475688457489, + "num_tokens": 556128777.0, + "step": 14582 + }, + { + "epoch": 1.855107492685409, + "ewc_loss": 0.06961236149072647, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000344561121892184, + "grad_norm": 8.091609954833984, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8832153081893921, + "num_tokens": 556164286.0, + "step": 14583 + }, + { + "epoch": 1.8552347029639995, + "ewc_loss": 0.0696055144071579, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003444926405791193, + "grad_norm": 8.077567100524902, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8711890578269958, + "num_tokens": 556199141.0, + "step": 14584 + }, + { + "epoch": 1.85536191324259, + "ewc_loss": 0.06960222870111465, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034445978235453367, + "grad_norm": 8.11871337890625, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8693770170211792, + "num_tokens": 556237104.0, + "step": 14585 + }, + { + "epoch": 1.8554891235211806, + "ewc_loss": 0.06953886151313782, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034382607555016875, + "grad_norm": 8.061691284179688, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8531796932220459, + "num_tokens": 556275343.0, + "step": 14586 + }, + { + "epoch": 1.8556163337997709, + "ewc_loss": 0.06956363469362259, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003440738655626774, + "grad_norm": 8.045531272888184, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8806111812591553, + "num_tokens": 556316539.0, + "step": 14587 + }, + { + "epoch": 1.8557435440783614, + "ewc_loss": 0.06971874833106995, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000345624954206869, + "grad_norm": 8.097780227661133, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8748674392700195, + "num_tokens": 556351101.0, + "step": 14588 + }, + { + "epoch": 1.855870754356952, + "ewc_loss": 0.06956715881824493, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003441091103013605, + "grad_norm": 8.163305282592773, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8756734132766724, + "num_tokens": 556382458.0, + "step": 14589 + }, + { + "epoch": 1.8559979646355425, + "ewc_loss": 0.06962116807699203, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003446491900831461, + "grad_norm": 8.125960350036621, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8665980696678162, + "num_tokens": 556416980.0, + "step": 14590 + }, + { + "epoch": 1.856125174914133, + "ewc_loss": 0.06954514235258102, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003438889398239553, + "grad_norm": 8.457708358764648, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8695333003997803, + "num_tokens": 556451868.0, + "step": 14591 + }, + { + "epoch": 1.8562523851927235, + "ewc_loss": 0.06900908797979355, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033852836349979043, + "grad_norm": 7.987966060638428, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8523187637329102, + "num_tokens": 556485682.0, + "step": 14592 + }, + { + "epoch": 1.856379595471314, + "ewc_loss": 0.06996475160121918, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003480850427877158, + "grad_norm": 8.140129089355469, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8751230835914612, + "num_tokens": 556524857.0, + "step": 14593 + }, + { + "epoch": 1.8565068057499046, + "ewc_loss": 0.06908763945102692, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033931387588381767, + "grad_norm": 8.00950813293457, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8685681819915771, + "num_tokens": 556565763.0, + "step": 14594 + }, + { + "epoch": 1.856634016028495, + "ewc_loss": 0.06977304816246033, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003461680025793612, + "grad_norm": 8.123372077941895, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8529540300369263, + "num_tokens": 556606202.0, + "step": 14595 + }, + { + "epoch": 1.8567612263070856, + "ewc_loss": 0.06933613121509552, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003417987609282136, + "grad_norm": 8.026087760925293, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8727814555168152, + "num_tokens": 556646906.0, + "step": 14596 + }, + { + "epoch": 1.8568884365856761, + "ewc_loss": 0.06974892318248749, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034592676092870533, + "grad_norm": 8.135139465332031, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8655045628547668, + "num_tokens": 556684249.0, + "step": 14597 + }, + { + "epoch": 1.8570156468642667, + "ewc_loss": 0.06934988498687744, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003419363929424435, + "grad_norm": 8.06760311126709, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8528887033462524, + "num_tokens": 556726840.0, + "step": 14598 + }, + { + "epoch": 1.8571428571428572, + "ewc_loss": 0.06958124041557312, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003442499437369406, + "grad_norm": 8.126757621765137, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8658839464187622, + "num_tokens": 556764822.0, + "step": 14599 + }, + { + "epoch": 1.8572700674214477, + "ewc_loss": 0.06943640112876892, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034280147519893944, + "grad_norm": 8.08393669128418, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8628014326095581, + "num_tokens": 556800884.0, + "step": 14600 + }, + { + "epoch": 1.8573972777000383, + "ewc_loss": 0.06956127285957336, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034405020414851606, + "grad_norm": 8.114968299865723, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8711950778961182, + "num_tokens": 556841233.0, + "step": 14601 + }, + { + "epoch": 1.8575244879786288, + "ewc_loss": 0.0694076418876648, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003425139293540269, + "grad_norm": 8.101914405822754, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8579583168029785, + "num_tokens": 556880995.0, + "step": 14602 + }, + { + "epoch": 1.8576516982572193, + "ewc_loss": 0.06949621438980103, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034339961712248623, + "grad_norm": 8.097611427307129, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8676506876945496, + "num_tokens": 556914367.0, + "step": 14603 + }, + { + "epoch": 1.8577789085358098, + "ewc_loss": 0.06954728066921234, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034391036024317145, + "grad_norm": 8.159286499023438, + "learning_rate": 1e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8449944853782654, + "num_tokens": 556955944.0, + "step": 14604 + }, + { + "epoch": 1.8579061188144002, + "ewc_loss": 0.06935010105371475, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034193851752206683, + "grad_norm": 8.127249717712402, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8468153476715088, + "num_tokens": 556989851.0, + "step": 14605 + }, + { + "epoch": 1.8580333290929907, + "ewc_loss": 0.06945927441120148, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003430302022024989, + "grad_norm": 8.085776329040527, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8618226051330566, + "num_tokens": 557033662.0, + "step": 14606 + }, + { + "epoch": 1.8581605393715812, + "ewc_loss": 0.0693850889801979, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003422884037718177, + "grad_norm": 8.108499526977539, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8561971783638, + "num_tokens": 557074869.0, + "step": 14607 + }, + { + "epoch": 1.8582877496501717, + "ewc_loss": 0.06934426724910736, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003418801643420011, + "grad_norm": 8.059283256530762, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8527587056159973, + "num_tokens": 557110164.0, + "step": 14608 + }, + { + "epoch": 1.8584149599287623, + "ewc_loss": 0.0696682333946228, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034511988633312285, + "grad_norm": 8.125598907470703, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8642259240150452, + "num_tokens": 557149309.0, + "step": 14609 + }, + { + "epoch": 1.8585421702073528, + "ewc_loss": 0.06934764981269836, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003419140412006527, + "grad_norm": 8.089827537536621, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8593225479125977, + "num_tokens": 557180510.0, + "step": 14610 + }, + { + "epoch": 1.858669380485943, + "ewc_loss": 0.06955790519714355, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034401650191284716, + "grad_norm": 8.179244995117188, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8690916299819946, + "num_tokens": 557218062.0, + "step": 14611 + }, + { + "epoch": 1.8587965907645336, + "ewc_loss": 0.06934228539466858, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003418603155296296, + "grad_norm": 8.002543449401855, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8558781147003174, + "num_tokens": 557258963.0, + "step": 14612 + }, + { + "epoch": 1.8589238010431242, + "ewc_loss": 0.06968331336975098, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003452706732787192, + "grad_norm": 8.27823543548584, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8655428886413574, + "num_tokens": 557296046.0, + "step": 14613 + }, + { + "epoch": 1.8590510113217147, + "ewc_loss": 0.06915536522865295, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003399911511223763, + "grad_norm": 7.945570945739746, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8622303009033203, + "num_tokens": 557328395.0, + "step": 14614 + }, + { + "epoch": 1.8591782216003052, + "ewc_loss": 0.0700690969824791, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003491284733172506, + "grad_norm": 8.297040939331055, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8663298487663269, + "num_tokens": 557363888.0, + "step": 14615 + }, + { + "epoch": 1.8593054318788957, + "ewc_loss": 0.06899070739746094, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003383446019142866, + "grad_norm": 7.978025436401367, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8553036451339722, + "num_tokens": 557402944.0, + "step": 14616 + }, + { + "epoch": 1.8594326421574863, + "ewc_loss": 0.07009419798851013, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003493794647511095, + "grad_norm": 8.256675720214844, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8695108890533447, + "num_tokens": 557442247.0, + "step": 14617 + }, + { + "epoch": 1.8595598524360768, + "ewc_loss": 0.06907585263252258, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003391960635781288, + "grad_norm": 7.962474822998047, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8571441769599915, + "num_tokens": 557483580.0, + "step": 14618 + }, + { + "epoch": 1.8596870627146673, + "ewc_loss": 0.07002314925193787, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034866901114583015, + "grad_norm": 8.141207695007324, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8675282001495361, + "num_tokens": 557520134.0, + "step": 14619 + }, + { + "epoch": 1.8598142729932579, + "ewc_loss": 0.06931259483098984, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003415634564589709, + "grad_norm": 8.00137996673584, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.858339786529541, + "num_tokens": 557565726.0, + "step": 14620 + }, + { + "epoch": 1.8599414832718484, + "ewc_loss": 0.06983014196157455, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034673893242143095, + "grad_norm": 8.18079948425293, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8614828586578369, + "num_tokens": 557602871.0, + "step": 14621 + }, + { + "epoch": 1.860068693550439, + "ewc_loss": 0.06941592693328857, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003425968170631677, + "grad_norm": 8.060495376586914, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8557859659194946, + "num_tokens": 557640873.0, + "step": 14622 + }, + { + "epoch": 1.8601959038290294, + "ewc_loss": 0.06977936625480652, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003462311578914523, + "grad_norm": 8.20021915435791, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8575329184532166, + "num_tokens": 557684233.0, + "step": 14623 + }, + { + "epoch": 1.86032311410762, + "ewc_loss": 0.06945955753326416, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034303308348171413, + "grad_norm": 8.150749206542969, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8691126108169556, + "num_tokens": 557716895.0, + "step": 14624 + }, + { + "epoch": 1.8604503243862105, + "ewc_loss": 0.06958244740962982, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003442619345150888, + "grad_norm": 8.106147766113281, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8609561324119568, + "num_tokens": 557758197.0, + "step": 14625 + }, + { + "epoch": 1.860577534664801, + "ewc_loss": 0.06967885792255402, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034522608621045947, + "grad_norm": 8.212503433227539, + "learning_rate": 1e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8366196155548096, + "num_tokens": 557794196.0, + "step": 14626 + }, + { + "epoch": 1.8607047449433916, + "ewc_loss": 0.0693289190530777, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003417266416363418, + "grad_norm": 8.115495681762695, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8472548127174377, + "num_tokens": 557833856.0, + "step": 14627 + }, + { + "epoch": 1.860831955221982, + "ewc_loss": 0.06963707506656647, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003448082134127617, + "grad_norm": 8.27569580078125, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8507956862449646, + "num_tokens": 557866304.0, + "step": 14628 + }, + { + "epoch": 1.8609591655005726, + "ewc_loss": 0.06918148696422577, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034025232889689505, + "grad_norm": 8.05224609375, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8750037550926208, + "num_tokens": 557907065.0, + "step": 14629 + }, + { + "epoch": 1.861086375779163, + "ewc_loss": 0.06966876983642578, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003451252414379269, + "grad_norm": 8.113910675048828, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8583202958106995, + "num_tokens": 557947326.0, + "step": 14630 + }, + { + "epoch": 1.8612135860577534, + "ewc_loss": 0.06939596682786942, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003423971647862345, + "grad_norm": 8.092949867248535, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8624595999717712, + "num_tokens": 557990059.0, + "step": 14631 + }, + { + "epoch": 1.861340796336344, + "ewc_loss": 0.06956154108047485, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003440528817009181, + "grad_norm": 8.086812019348145, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.865600049495697, + "num_tokens": 558028406.0, + "step": 14632 + }, + { + "epoch": 1.8614680066149345, + "ewc_loss": 0.06946083158254623, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003430458309594542, + "grad_norm": 8.071325302124023, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8604444861412048, + "num_tokens": 558070239.0, + "step": 14633 + }, + { + "epoch": 1.861595216893525, + "ewc_loss": 0.06954817473888397, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003439192660152912, + "grad_norm": 8.1149320602417, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8670487999916077, + "num_tokens": 558107685.0, + "step": 14634 + }, + { + "epoch": 1.8617224271721156, + "ewc_loss": 0.06959980726242065, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003444356261752546, + "grad_norm": 8.101164817810059, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8557682037353516, + "num_tokens": 558143185.0, + "step": 14635 + }, + { + "epoch": 1.8618496374507059, + "ewc_loss": 0.06956684589385986, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003441059379838407, + "grad_norm": 8.101313591003418, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8722811937332153, + "num_tokens": 558176207.0, + "step": 14636 + }, + { + "epoch": 1.8619768477292964, + "ewc_loss": 0.0696035623550415, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034447311190888286, + "grad_norm": 8.10151481628418, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8787426948547363, + "num_tokens": 558212215.0, + "step": 14637 + }, + { + "epoch": 1.862104058007887, + "ewc_loss": 0.06954853236675262, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034392287489026785, + "grad_norm": 8.085562705993652, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8722069263458252, + "num_tokens": 558248471.0, + "step": 14638 + }, + { + "epoch": 1.8622312682864774, + "ewc_loss": 0.06949400901794434, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003433775855228305, + "grad_norm": 8.069099426269531, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.84907066822052, + "num_tokens": 558285211.0, + "step": 14639 + }, + { + "epoch": 1.862358478565068, + "ewc_loss": 0.06968586146831512, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034529613913036883, + "grad_norm": 8.206979751586914, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8564100861549377, + "num_tokens": 558314434.0, + "step": 14640 + }, + { + "epoch": 1.8624856888436585, + "ewc_loss": 0.06929504871368408, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034138798946514726, + "grad_norm": 8.054071426391602, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8715784549713135, + "num_tokens": 558353710.0, + "step": 14641 + }, + { + "epoch": 1.862612899122249, + "ewc_loss": 0.06996095925569534, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003480470913928002, + "grad_norm": 8.186466217041016, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8511730432510376, + "num_tokens": 558390476.0, + "step": 14642 + }, + { + "epoch": 1.8627401094008396, + "ewc_loss": 0.06919559091329575, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034039339516311884, + "grad_norm": 8.035384178161621, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8720518350601196, + "num_tokens": 558424917.0, + "step": 14643 + }, + { + "epoch": 1.86286731967943, + "ewc_loss": 0.06992287933826447, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034766632597893476, + "grad_norm": 8.238041877746582, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8555442094802856, + "num_tokens": 558459831.0, + "step": 14644 + }, + { + "epoch": 1.8629945299580206, + "ewc_loss": 0.06927789747714996, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003412164805922657, + "grad_norm": 8.02660846710205, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8786903619766235, + "num_tokens": 558498600.0, + "step": 14645 + }, + { + "epoch": 1.8631217402366111, + "ewc_loss": 0.0697859451174736, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034629693254828453, + "grad_norm": 8.1790132522583, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8537603616714478, + "num_tokens": 558537465.0, + "step": 14646 + }, + { + "epoch": 1.8632489505152017, + "ewc_loss": 0.0692903995513916, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003413415397517383, + "grad_norm": 7.9817423820495605, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8834714889526367, + "num_tokens": 558575357.0, + "step": 14647 + }, + { + "epoch": 1.8633761607937922, + "ewc_loss": 0.06993403285741806, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003477778227534145, + "grad_norm": 8.23323917388916, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.860614538192749, + "num_tokens": 558619422.0, + "step": 14648 + }, + { + "epoch": 1.8635033710723827, + "ewc_loss": 0.06919822096824646, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003404197050258517, + "grad_norm": 8.045012474060059, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8540769815444946, + "num_tokens": 558657677.0, + "step": 14649 + }, + { + "epoch": 1.8636305813509733, + "ewc_loss": 0.06985679268836975, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034700537798926234, + "grad_norm": 8.15972900390625, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8716176748275757, + "num_tokens": 558692912.0, + "step": 14650 + }, + { + "epoch": 1.8637577916295638, + "ewc_loss": 0.06923849880695343, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034082247293554246, + "grad_norm": 7.993109703063965, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8751562833786011, + "num_tokens": 558733470.0, + "step": 14651 + }, + { + "epoch": 1.8638850019081543, + "ewc_loss": 0.06992337107658386, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034767124452628195, + "grad_norm": 8.211108207702637, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.858910322189331, + "num_tokens": 558771899.0, + "step": 14652 + }, + { + "epoch": 1.8640122121867448, + "ewc_loss": 0.0692770779132843, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034120833151973784, + "grad_norm": 8.049224853515625, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8680020570755005, + "num_tokens": 558813002.0, + "step": 14653 + }, + { + "epoch": 1.8641394224653351, + "ewc_loss": 0.06985005736351013, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034693803172558546, + "grad_norm": 8.266213417053223, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8480925559997559, + "num_tokens": 558850827.0, + "step": 14654 + }, + { + "epoch": 1.8642666327439257, + "ewc_loss": 0.0692349225282669, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003407866752240807, + "grad_norm": 8.071723937988281, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8597097396850586, + "num_tokens": 558889003.0, + "step": 14655 + }, + { + "epoch": 1.8643938430225162, + "ewc_loss": 0.06977362930774689, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034617382334545255, + "grad_norm": 8.187806129455566, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.869373619556427, + "num_tokens": 558925849.0, + "step": 14656 + }, + { + "epoch": 1.8645210533011067, + "ewc_loss": 0.06932303309440613, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003416678518988192, + "grad_norm": 8.030325889587402, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.862960159778595, + "num_tokens": 558967471.0, + "step": 14657 + }, + { + "epoch": 1.8646482635796973, + "ewc_loss": 0.06977640837430954, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034620158839970827, + "grad_norm": 8.206740379333496, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8672873973846436, + "num_tokens": 559001938.0, + "step": 14658 + }, + { + "epoch": 1.8647754738582878, + "ewc_loss": 0.069206103682518, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034049851819872856, + "grad_norm": 8.061138153076172, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8699188232421875, + "num_tokens": 559040515.0, + "step": 14659 + }, + { + "epoch": 1.864902684136878, + "ewc_loss": 0.06981784105300903, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034661591053009033, + "grad_norm": 8.220222473144531, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.877059817314148, + "num_tokens": 559077702.0, + "step": 14660 + }, + { + "epoch": 1.8650298944154686, + "ewc_loss": 0.06918075680732727, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034024505293928087, + "grad_norm": 8.042121887207031, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.855798065662384, + "num_tokens": 559114741.0, + "step": 14661 + }, + { + "epoch": 1.8651571046940592, + "ewc_loss": 0.06979630887508392, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034640062949620187, + "grad_norm": 8.208122253417969, + "learning_rate": 1e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8415136337280273, + "num_tokens": 559155844.0, + "step": 14662 + }, + { + "epoch": 1.8652843149726497, + "ewc_loss": 0.06924127042293549, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034085020888596773, + "grad_norm": 8.088048934936523, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8655657768249512, + "num_tokens": 559194964.0, + "step": 14663 + }, + { + "epoch": 1.8654115252512402, + "ewc_loss": 0.0697474256157875, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003459117724560201, + "grad_norm": 8.165568351745605, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8691062927246094, + "num_tokens": 559230083.0, + "step": 14664 + }, + { + "epoch": 1.8655387355298307, + "ewc_loss": 0.06935848295688629, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003420222783461213, + "grad_norm": 8.072135925292969, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8741027116775513, + "num_tokens": 559268397.0, + "step": 14665 + }, + { + "epoch": 1.8656659458084213, + "ewc_loss": 0.06966707110404968, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034510818659327924, + "grad_norm": 8.216130256652832, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8548072576522827, + "num_tokens": 559302024.0, + "step": 14666 + }, + { + "epoch": 1.8657931560870118, + "ewc_loss": 0.06925732642412186, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034101077471859753, + "grad_norm": 8.064664840698242, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8611650466918945, + "num_tokens": 559343192.0, + "step": 14667 + }, + { + "epoch": 1.8659203663656023, + "ewc_loss": 0.06980179250240326, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003464554320089519, + "grad_norm": 8.14846420288086, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8860764503479004, + "num_tokens": 559380616.0, + "step": 14668 + }, + { + "epoch": 1.8660475766441929, + "ewc_loss": 0.069343701004982, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034187445999123156, + "grad_norm": 8.061272621154785, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8738417029380798, + "num_tokens": 559413621.0, + "step": 14669 + }, + { + "epoch": 1.8661747869227834, + "ewc_loss": 0.0697082132101059, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000345519685652107, + "grad_norm": 8.149632453918457, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8721822500228882, + "num_tokens": 559447274.0, + "step": 14670 + }, + { + "epoch": 1.866301997201374, + "ewc_loss": 0.06957105547189713, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003441480512265116, + "grad_norm": 8.078998565673828, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8737286329269409, + "num_tokens": 559484306.0, + "step": 14671 + }, + { + "epoch": 1.8664292074799644, + "ewc_loss": 0.06969107687473297, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003453482349868864, + "grad_norm": 8.066351890563965, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8708090782165527, + "num_tokens": 559527126.0, + "step": 14672 + }, + { + "epoch": 1.866556417758555, + "ewc_loss": 0.06970524787902832, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003454899415373802, + "grad_norm": 8.095295906066895, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8676511645317078, + "num_tokens": 559568342.0, + "step": 14673 + }, + { + "epoch": 1.8666836280371455, + "ewc_loss": 0.06953217089176178, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003437591658439487, + "grad_norm": 8.05241870880127, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8600148558616638, + "num_tokens": 559609597.0, + "step": 14674 + }, + { + "epoch": 1.866810838315736, + "ewc_loss": 0.06981326639652252, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003465701302047819, + "grad_norm": 8.119888305664062, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.86272132396698, + "num_tokens": 559646392.0, + "step": 14675 + }, + { + "epoch": 1.8669380485943265, + "ewc_loss": 0.06947888433933258, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003432263038121164, + "grad_norm": 8.031478881835938, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8662084341049194, + "num_tokens": 559684836.0, + "step": 14676 + }, + { + "epoch": 1.867065258872917, + "ewc_loss": 0.06995876878499985, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003480251762084663, + "grad_norm": 8.112342834472656, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8682235479354858, + "num_tokens": 559727793.0, + "step": 14677 + }, + { + "epoch": 1.8671924691515076, + "ewc_loss": 0.06953813135623932, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034381879959255457, + "grad_norm": 8.078985214233398, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8521319627761841, + "num_tokens": 559768154.0, + "step": 14678 + }, + { + "epoch": 1.867319679430098, + "ewc_loss": 0.0698125809431076, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003465632617007941, + "grad_norm": 8.172320365905762, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8636395931243896, + "num_tokens": 559798732.0, + "step": 14679 + }, + { + "epoch": 1.8674468897086884, + "ewc_loss": 0.06951092928647995, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034354679519310594, + "grad_norm": 8.062079429626465, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8642822504043579, + "num_tokens": 559842796.0, + "step": 14680 + }, + { + "epoch": 1.867574099987279, + "ewc_loss": 0.06990136206150055, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003474511322565377, + "grad_norm": 8.141668319702148, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.861471951007843, + "num_tokens": 559884198.0, + "step": 14681 + }, + { + "epoch": 1.8677013102658695, + "ewc_loss": 0.0695018619298935, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003434561367612332, + "grad_norm": 8.234845161437988, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8491218686103821, + "num_tokens": 559916749.0, + "step": 14682 + }, + { + "epoch": 1.86782852054446, + "ewc_loss": 0.06945788860321045, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034301637788303196, + "grad_norm": 8.07891845703125, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8592944145202637, + "num_tokens": 559958430.0, + "step": 14683 + }, + { + "epoch": 1.8679557308230506, + "ewc_loss": 0.06979131698608398, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003463507164269686, + "grad_norm": 8.114252090454102, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.881078839302063, + "num_tokens": 559995720.0, + "step": 14684 + }, + { + "epoch": 1.8680829411016409, + "ewc_loss": 0.06948831677436829, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034332071663811803, + "grad_norm": 8.050422668457031, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8676244020462036, + "num_tokens": 560033304.0, + "step": 14685 + }, + { + "epoch": 1.8682101513802314, + "ewc_loss": 0.06987382471561432, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034717575181275606, + "grad_norm": 8.195213317871094, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.862075686454773, + "num_tokens": 560069879.0, + "step": 14686 + }, + { + "epoch": 1.868337361658822, + "ewc_loss": 0.06951095163822174, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034354705712758005, + "grad_norm": 8.014294624328613, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8618108630180359, + "num_tokens": 560106853.0, + "step": 14687 + }, + { + "epoch": 1.8684645719374124, + "ewc_loss": 0.07003600895404816, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034879756276495755, + "grad_norm": 8.2384033203125, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8643127679824829, + "num_tokens": 560144225.0, + "step": 14688 + }, + { + "epoch": 1.868591782216003, + "ewc_loss": 0.06952317804098129, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034366926411166787, + "grad_norm": 8.08251667022705, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8572494983673096, + "num_tokens": 560186520.0, + "step": 14689 + }, + { + "epoch": 1.8687189924945935, + "ewc_loss": 0.07003359496593475, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003487734356895089, + "grad_norm": 8.151894569396973, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8674541115760803, + "num_tokens": 560225458.0, + "step": 14690 + }, + { + "epoch": 1.868846202773184, + "ewc_loss": 0.06948241591453552, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034326169406995177, + "grad_norm": 8.111187934875488, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8698992729187012, + "num_tokens": 560263401.0, + "step": 14691 + }, + { + "epoch": 1.8689734130517746, + "ewc_loss": 0.06977397203445435, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034617717028595507, + "grad_norm": 8.172907829284668, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8614512085914612, + "num_tokens": 560305026.0, + "step": 14692 + }, + { + "epoch": 1.869100623330365, + "ewc_loss": 0.06951958686113358, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034363334998488426, + "grad_norm": 8.144757270812988, + "learning_rate": 1e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8317797183990479, + "num_tokens": 560354823.0, + "step": 14693 + }, + { + "epoch": 1.8692278336089556, + "ewc_loss": 0.06961618363857269, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034459930611774325, + "grad_norm": 8.164551734924316, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8799846172332764, + "num_tokens": 560391888.0, + "step": 14694 + }, + { + "epoch": 1.8693550438875461, + "ewc_loss": 0.06949204951524734, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003433579986449331, + "grad_norm": 8.060052871704102, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.866579532623291, + "num_tokens": 560425350.0, + "step": 14695 + }, + { + "epoch": 1.8694822541661367, + "ewc_loss": 0.06950409710407257, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003459198633208871, + "grad_norm": 8.12449836730957, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8746923208236694, + "num_tokens": 560458521.0, + "step": 14696 + }, + { + "epoch": 1.8696094644447272, + "ewc_loss": 0.06924962252378464, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003433751408010721, + "grad_norm": 8.001712799072266, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8573057651519775, + "num_tokens": 560506378.0, + "step": 14697 + }, + { + "epoch": 1.8697366747233177, + "ewc_loss": 0.06970500946044922, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003479290462564677, + "grad_norm": 8.166550636291504, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8856458067893982, + "num_tokens": 560544055.0, + "step": 14698 + }, + { + "epoch": 1.8698638850019083, + "ewc_loss": 0.06928059458732605, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003436848346609622, + "grad_norm": 8.084345817565918, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8738827705383301, + "num_tokens": 560577240.0, + "step": 14699 + }, + { + "epoch": 1.8699910952804988, + "ewc_loss": 0.06963656842708588, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003472445532679558, + "grad_norm": 8.149052619934082, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8585466146469116, + "num_tokens": 560615662.0, + "step": 14700 + }, + { + "epoch": 1.8701183055590893, + "ewc_loss": 0.06944306194782257, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003453095268923789, + "grad_norm": 8.093454360961914, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8584405183792114, + "num_tokens": 560658386.0, + "step": 14701 + }, + { + "epoch": 1.8702455158376798, + "ewc_loss": 0.06960169970989227, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003468958893790841, + "grad_norm": 8.134538650512695, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8621086478233337, + "num_tokens": 560696366.0, + "step": 14702 + }, + { + "epoch": 1.8703727261162701, + "ewc_loss": 0.06948212534189224, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003457001585047692, + "grad_norm": 8.060856819152832, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8519841432571411, + "num_tokens": 560736419.0, + "step": 14703 + }, + { + "epoch": 1.8704999363948607, + "ewc_loss": 0.06958780437707901, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034675694769248366, + "grad_norm": 8.095871925354004, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8628835678100586, + "num_tokens": 560777839.0, + "step": 14704 + }, + { + "epoch": 1.8706271466734512, + "ewc_loss": 0.06966239213943481, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003475027915555984, + "grad_norm": 8.09375, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8743786811828613, + "num_tokens": 560816167.0, + "step": 14705 + }, + { + "epoch": 1.8707543569520417, + "ewc_loss": 0.069621741771698, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003470963565632701, + "grad_norm": 8.103680610656738, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.849885106086731, + "num_tokens": 560854524.0, + "step": 14706 + }, + { + "epoch": 1.8708815672306323, + "ewc_loss": 0.0697266012430191, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003481449675746262, + "grad_norm": 8.101670265197754, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.860569417476654, + "num_tokens": 560894393.0, + "step": 14707 + }, + { + "epoch": 1.8710087775092228, + "ewc_loss": 0.06982338428497314, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034911278635263443, + "grad_norm": 8.059755325317383, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8747959733009338, + "num_tokens": 560938640.0, + "step": 14708 + }, + { + "epoch": 1.871135987787813, + "ewc_loss": 0.06994359195232391, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000350314803654328, + "grad_norm": 8.124176979064941, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8636990189552307, + "num_tokens": 560976805.0, + "step": 14709 + }, + { + "epoch": 1.8712631980664036, + "ewc_loss": 0.06959415972232819, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034682051045820117, + "grad_norm": 8.06919002532959, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8588913679122925, + "num_tokens": 561018940.0, + "step": 14710 + }, + { + "epoch": 1.8713904083449941, + "ewc_loss": 0.06989480555057526, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034982693614438176, + "grad_norm": 8.134503364562988, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8682531118392944, + "num_tokens": 561065228.0, + "step": 14711 + }, + { + "epoch": 1.8715176186235847, + "ewc_loss": 0.06968618184328079, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003477407153695822, + "grad_norm": 8.1272554397583, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8557433485984802, + "num_tokens": 561107540.0, + "step": 14712 + }, + { + "epoch": 1.8716448289021752, + "ewc_loss": 0.06974706798791885, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034834956750273705, + "grad_norm": 8.194344520568848, + "learning_rate": 1e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8422695398330688, + "num_tokens": 561144774.0, + "step": 14713 + }, + { + "epoch": 1.8717720391807657, + "ewc_loss": 0.06962628662586212, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034714178764261305, + "grad_norm": 8.075160026550293, + "learning_rate": 1e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8443641066551208, + "num_tokens": 561186774.0, + "step": 14714 + }, + { + "epoch": 1.8718992494593563, + "ewc_loss": 0.06990524381399155, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034993133158423007, + "grad_norm": 8.193467140197754, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8759335875511169, + "num_tokens": 561223997.0, + "step": 14715 + }, + { + "epoch": 1.8720264597379468, + "ewc_loss": 0.06943758577108383, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003452547825872898, + "grad_norm": 7.990109920501709, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8719669580459595, + "num_tokens": 561261583.0, + "step": 14716 + }, + { + "epoch": 1.8721536700165373, + "ewc_loss": 0.0702427327632904, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003533062117639929, + "grad_norm": 8.20092487335205, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8684378862380981, + "num_tokens": 561302681.0, + "step": 14717 + }, + { + "epoch": 1.8722808802951278, + "ewc_loss": 0.0695611834526062, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034649070585146546, + "grad_norm": 8.125493049621582, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8616989850997925, + "num_tokens": 561338402.0, + "step": 14718 + }, + { + "epoch": 1.8724080905737184, + "ewc_loss": 0.06997901201248169, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003506690263748169, + "grad_norm": 8.184101104736328, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8684628009796143, + "num_tokens": 561372892.0, + "step": 14719 + }, + { + "epoch": 1.872535300852309, + "ewc_loss": 0.06994815170764923, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003479190054349601, + "grad_norm": 8.1177396774292, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8608222007751465, + "num_tokens": 561409474.0, + "step": 14720 + }, + { + "epoch": 1.8726625111308994, + "ewc_loss": 0.07015851140022278, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003500226594042033, + "grad_norm": 8.19413948059082, + "learning_rate": 1e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.837286114692688, + "num_tokens": 561450152.0, + "step": 14721 + }, + { + "epoch": 1.87278972140949, + "ewc_loss": 0.06981678307056427, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034660534583963454, + "grad_norm": 8.04334831237793, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.857932984828949, + "num_tokens": 561489448.0, + "step": 14722 + }, + { + "epoch": 1.8729169316880805, + "ewc_loss": 0.07022456079721451, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035068311262875795, + "grad_norm": 8.22500991821289, + "learning_rate": 1e-06, + "loss": 0.548, + "mean_token_accuracy": 0.839920163154602, + "num_tokens": 561529684.0, + "step": 14723 + }, + { + "epoch": 1.873044141966671, + "ewc_loss": 0.06950734555721283, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034595231409184635, + "grad_norm": 8.129364013671875, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8652603626251221, + "num_tokens": 561564259.0, + "step": 14724 + }, + { + "epoch": 1.8731713522452615, + "ewc_loss": 0.07017075270414352, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035014504101127386, + "grad_norm": 8.149240493774414, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8704723715782166, + "num_tokens": 561609650.0, + "step": 14725 + }, + { + "epoch": 1.873298562523852, + "ewc_loss": 0.06989753991365433, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034741288982331753, + "grad_norm": 8.138980865478516, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8698520660400391, + "num_tokens": 561645520.0, + "step": 14726 + }, + { + "epoch": 1.8734257728024426, + "ewc_loss": 0.07006537169218063, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000349091220414266, + "grad_norm": 8.168502807617188, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8688367009162903, + "num_tokens": 561682771.0, + "step": 14727 + }, + { + "epoch": 1.873552983081033, + "ewc_loss": 0.06982855498790741, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000346723070833832, + "grad_norm": 8.108311653137207, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8546532392501831, + "num_tokens": 561721317.0, + "step": 14728 + }, + { + "epoch": 1.8736801933596234, + "ewc_loss": 0.07006993144750595, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003491368261165917, + "grad_norm": 8.136863708496094, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8652948141098022, + "num_tokens": 561763178.0, + "step": 14729 + }, + { + "epoch": 1.873807403638214, + "ewc_loss": 0.06986290216445923, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034706652513705194, + "grad_norm": 8.306944847106934, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8648549914360046, + "num_tokens": 561802292.0, + "step": 14730 + }, + { + "epoch": 1.8739346139168045, + "ewc_loss": 0.06966759264469147, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003451134543865919, + "grad_norm": 8.060751914978027, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8738638162612915, + "num_tokens": 561838610.0, + "step": 14731 + }, + { + "epoch": 1.874061824195395, + "ewc_loss": 0.07011938095092773, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034963132929988205, + "grad_norm": 8.154241561889648, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8667454123497009, + "num_tokens": 561881992.0, + "step": 14732 + }, + { + "epoch": 1.8741890344739855, + "ewc_loss": 0.06970356404781342, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034547309041954577, + "grad_norm": 8.172114372253418, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8542991280555725, + "num_tokens": 561917661.0, + "step": 14733 + }, + { + "epoch": 1.8743162447525759, + "ewc_loss": 0.06993541121482849, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003477915597613901, + "grad_norm": 8.160332679748535, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8645632266998291, + "num_tokens": 561958218.0, + "step": 14734 + }, + { + "epoch": 1.8744434550311664, + "ewc_loss": 0.0698624700307846, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003470621886663139, + "grad_norm": 8.08511734008789, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8756747245788574, + "num_tokens": 561993882.0, + "step": 14735 + }, + { + "epoch": 1.874570665309757, + "ewc_loss": 0.07012961804866791, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034973365836776793, + "grad_norm": 8.296829223632812, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8648934364318848, + "num_tokens": 562035443.0, + "step": 14736 + }, + { + "epoch": 1.8746978755883474, + "ewc_loss": 0.06957611441612244, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034419860458001494, + "grad_norm": 8.089212417602539, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8622090816497803, + "num_tokens": 562072806.0, + "step": 14737 + }, + { + "epoch": 1.874825085866938, + "ewc_loss": 0.07018298655748367, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003502673644106835, + "grad_norm": 8.262972831726074, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8473531603813171, + "num_tokens": 562113652.0, + "step": 14738 + }, + { + "epoch": 1.8749522961455285, + "ewc_loss": 0.06924590468406677, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003433379461057484, + "grad_norm": 8.09899616241455, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8540745377540588, + "num_tokens": 562150600.0, + "step": 14739 + }, + { + "epoch": 1.875079506424119, + "ewc_loss": 0.06987086683511734, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003495875862427056, + "grad_norm": 8.194863319396973, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8648501038551331, + "num_tokens": 562187730.0, + "step": 14740 + }, + { + "epoch": 1.8752067167027096, + "ewc_loss": 0.06939928978681564, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034487180528230965, + "grad_norm": 8.198766708374023, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8639975786209106, + "num_tokens": 562228229.0, + "step": 14741 + }, + { + "epoch": 1.8753339269813, + "ewc_loss": 0.06966578960418701, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003475367557257414, + "grad_norm": 8.232156753540039, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8694912195205688, + "num_tokens": 562262293.0, + "step": 14742 + }, + { + "epoch": 1.8754611372598906, + "ewc_loss": 0.0695812851190567, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003466917260084301, + "grad_norm": 8.172589302062988, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8621518611907959, + "num_tokens": 562303389.0, + "step": 14743 + }, + { + "epoch": 1.8755883475384811, + "ewc_loss": 0.06954173743724823, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034629623405635357, + "grad_norm": 8.157259941101074, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8564756512641907, + "num_tokens": 562344461.0, + "step": 14744 + }, + { + "epoch": 1.8757155578170717, + "ewc_loss": 0.06951914727687836, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003460703883320093, + "grad_norm": 8.139204978942871, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8473155498504639, + "num_tokens": 562387068.0, + "step": 14745 + }, + { + "epoch": 1.8758427680956622, + "ewc_loss": 0.06948940455913544, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034577291808091104, + "grad_norm": 8.124554634094238, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8618060946464539, + "num_tokens": 562428225.0, + "step": 14746 + }, + { + "epoch": 1.8759699783742527, + "ewc_loss": 0.06955596804618835, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034643858089111745, + "grad_norm": 8.119020462036133, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8637745380401611, + "num_tokens": 562463879.0, + "step": 14747 + }, + { + "epoch": 1.8760971886528433, + "ewc_loss": 0.0696273148059845, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034715209039859474, + "grad_norm": 8.171147346496582, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8559346199035645, + "num_tokens": 562502327.0, + "step": 14748 + }, + { + "epoch": 1.8762243989314338, + "ewc_loss": 0.06962088495492935, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003470877418294549, + "grad_norm": 8.133766174316406, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8676639199256897, + "num_tokens": 562542162.0, + "step": 14749 + }, + { + "epoch": 1.8763516092100243, + "ewc_loss": 0.06972778588533401, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034815678372979164, + "grad_norm": 8.16869068145752, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8572081923484802, + "num_tokens": 562583364.0, + "step": 14750 + }, + { + "epoch": 1.8764788194886148, + "ewc_loss": 0.06959519535303116, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034683087142184377, + "grad_norm": 8.140670776367188, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8650625944137573, + "num_tokens": 562615870.0, + "step": 14751 + }, + { + "epoch": 1.8766060297672051, + "ewc_loss": 0.06978701800107956, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003487491048872471, + "grad_norm": 8.202695846557617, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8482661247253418, + "num_tokens": 562655086.0, + "step": 14752 + }, + { + "epoch": 1.8767332400457957, + "ewc_loss": 0.0697135254740715, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003480141458567232, + "grad_norm": 8.229227066040039, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8564093708992004, + "num_tokens": 562697532.0, + "step": 14753 + }, + { + "epoch": 1.8768604503243862, + "ewc_loss": 0.06972183287143707, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003480972081888467, + "grad_norm": 8.227981567382812, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8419129848480225, + "num_tokens": 562731057.0, + "step": 14754 + }, + { + "epoch": 1.8769876606029767, + "ewc_loss": 0.06956587731838226, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034653767943382263, + "grad_norm": 8.176294326782227, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8527178168296814, + "num_tokens": 562770402.0, + "step": 14755 + }, + { + "epoch": 1.8771148708815673, + "ewc_loss": 0.06991598010063171, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034759726258926094, + "grad_norm": 8.170723915100098, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8585715293884277, + "num_tokens": 562806191.0, + "step": 14756 + }, + { + "epoch": 1.8772420811601578, + "ewc_loss": 0.06962071359157562, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003470859955996275, + "grad_norm": 8.18043327331543, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8629863858222961, + "num_tokens": 562846864.0, + "step": 14757 + }, + { + "epoch": 1.877369291438748, + "ewc_loss": 0.06986604630947113, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034709792817011476, + "grad_norm": 8.302891731262207, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8576603531837463, + "num_tokens": 562882887.0, + "step": 14758 + }, + { + "epoch": 1.8774965017173386, + "ewc_loss": 0.06959564983844757, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034439400769770145, + "grad_norm": 8.042055130004883, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8622423410415649, + "num_tokens": 562919096.0, + "step": 14759 + }, + { + "epoch": 1.8776237119959291, + "ewc_loss": 0.07019497454166412, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035038721398450434, + "grad_norm": 8.370891571044922, + "learning_rate": 1e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8374987840652466, + "num_tokens": 562959336.0, + "step": 14760 + }, + { + "epoch": 1.8777509222745197, + "ewc_loss": 0.06927365064620972, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034117395989596844, + "grad_norm": 8.035158157348633, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8633403778076172, + "num_tokens": 562997883.0, + "step": 14761 + }, + { + "epoch": 1.8778781325531102, + "ewc_loss": 0.07032736390829086, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035171114723198116, + "grad_norm": 8.334397315979004, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8732742667198181, + "num_tokens": 563036763.0, + "step": 14762 + }, + { + "epoch": 1.8780053428317007, + "ewc_loss": 0.06935995817184448, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034203712129965425, + "grad_norm": 8.021717071533203, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8795746564865112, + "num_tokens": 563076815.0, + "step": 14763 + }, + { + "epoch": 1.8781325531102913, + "ewc_loss": 0.07037560641765594, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035219360142946243, + "grad_norm": 8.519036293029785, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8732879757881165, + "num_tokens": 563110511.0, + "step": 14764 + }, + { + "epoch": 1.8782597633888818, + "ewc_loss": 0.06915740668773651, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003400115529075265, + "grad_norm": 8.022198677062988, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8586507439613342, + "num_tokens": 563140757.0, + "step": 14765 + }, + { + "epoch": 1.8783869736674723, + "ewc_loss": 0.07057923078536987, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035422982182353735, + "grad_norm": 8.640789031982422, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8621608018875122, + "num_tokens": 563179961.0, + "step": 14766 + }, + { + "epoch": 1.8785141839460628, + "ewc_loss": 0.06906969845294952, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003391344507690519, + "grad_norm": 7.902248382568359, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8694210648536682, + "num_tokens": 563221355.0, + "step": 14767 + }, + { + "epoch": 1.8786413942246534, + "ewc_loss": 0.07109497487545013, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003593872534111142, + "grad_norm": 8.80855655670166, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8847631812095642, + "num_tokens": 563257427.0, + "step": 14768 + }, + { + "epoch": 1.878768604503244, + "ewc_loss": 0.06912359595298767, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033967342460528016, + "grad_norm": 7.964550971984863, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8602187633514404, + "num_tokens": 563294371.0, + "step": 14769 + }, + { + "epoch": 1.8788958147818344, + "ewc_loss": 0.07121337950229645, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003605712845455855, + "grad_norm": 8.543169975280762, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.873278021812439, + "num_tokens": 563338592.0, + "step": 14770 + }, + { + "epoch": 1.879023025060425, + "ewc_loss": 0.06943251192569733, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034276265068911016, + "grad_norm": 8.173418998718262, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8675738573074341, + "num_tokens": 563373853.0, + "step": 14771 + }, + { + "epoch": 1.8791502353390155, + "ewc_loss": 0.07054810225963593, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003539185272529721, + "grad_norm": 8.324265480041504, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8628658056259155, + "num_tokens": 563411137.0, + "step": 14772 + }, + { + "epoch": 1.879277445617606, + "ewc_loss": 0.06965214014053345, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003449588839430362, + "grad_norm": 8.132518768310547, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8679797649383545, + "num_tokens": 563450294.0, + "step": 14773 + }, + { + "epoch": 1.8794046558961965, + "ewc_loss": 0.07014444470405579, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000349881942383945, + "grad_norm": 8.266179084777832, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8649848699569702, + "num_tokens": 563496858.0, + "step": 14774 + }, + { + "epoch": 1.879531866174787, + "ewc_loss": 0.06973758339881897, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003458133724052459, + "grad_norm": 8.116755485534668, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8590312600135803, + "num_tokens": 563539566.0, + "step": 14775 + }, + { + "epoch": 1.8796590764533776, + "ewc_loss": 0.06979282200336456, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034880716702900827, + "grad_norm": 8.239754676818848, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8704050779342651, + "num_tokens": 563574363.0, + "step": 14776 + }, + { + "epoch": 1.879786286731968, + "ewc_loss": 0.06976225972175598, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034606008557602763, + "grad_norm": 8.203180313110352, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.864822506904602, + "num_tokens": 563605540.0, + "step": 14777 + }, + { + "epoch": 1.8799134970105584, + "ewc_loss": 0.06980651617050171, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034650260931812227, + "grad_norm": 8.191122055053711, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8490256071090698, + "num_tokens": 563646640.0, + "step": 14778 + }, + { + "epoch": 1.880040707289149, + "ewc_loss": 0.06973983347415924, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003458357823546976, + "grad_norm": 8.123388290405273, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8730795383453369, + "num_tokens": 563683576.0, + "step": 14779 + }, + { + "epoch": 1.8801679175677395, + "ewc_loss": 0.06991943717002869, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000347631867043674, + "grad_norm": 8.211653709411621, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8560651540756226, + "num_tokens": 563724088.0, + "step": 14780 + }, + { + "epoch": 1.88029512784633, + "ewc_loss": 0.06964913010597229, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003449288196861744, + "grad_norm": 8.096787452697754, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8625274300575256, + "num_tokens": 563760897.0, + "step": 14781 + }, + { + "epoch": 1.8804223381249205, + "ewc_loss": 0.06992264091968536, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003476639394648373, + "grad_norm": 8.22313117980957, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.868432343006134, + "num_tokens": 563800158.0, + "step": 14782 + }, + { + "epoch": 1.8805495484035109, + "ewc_loss": 0.06971479952335358, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003455854603089392, + "grad_norm": 8.093113899230957, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8702904582023621, + "num_tokens": 563833288.0, + "step": 14783 + }, + { + "epoch": 1.8806767586821014, + "ewc_loss": 0.07012055814266205, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003496430581435561, + "grad_norm": 8.152297019958496, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8747532367706299, + "num_tokens": 563874507.0, + "step": 14784 + }, + { + "epoch": 1.880803968960692, + "ewc_loss": 0.06974870711565018, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003459245781414211, + "grad_norm": 8.105098724365234, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8634740114212036, + "num_tokens": 563912452.0, + "step": 14785 + }, + { + "epoch": 1.8809311792392824, + "ewc_loss": 0.06979260593652725, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000348804984241724, + "grad_norm": 8.23283576965332, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8463133573532104, + "num_tokens": 563950565.0, + "step": 14786 + }, + { + "epoch": 1.881058389517873, + "ewc_loss": 0.06960509717464447, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034692982444539666, + "grad_norm": 8.167149543762207, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8543472290039062, + "num_tokens": 563984924.0, + "step": 14787 + }, + { + "epoch": 1.8811855997964635, + "ewc_loss": 0.06988595426082611, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034729705657809973, + "grad_norm": 8.190113067626953, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8706027865409851, + "num_tokens": 564022887.0, + "step": 14788 + }, + { + "epoch": 1.881312810075054, + "ewc_loss": 0.06978170573711395, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034625452826730907, + "grad_norm": 8.102205276489258, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8622795343399048, + "num_tokens": 564064829.0, + "step": 14789 + }, + { + "epoch": 1.8814400203536445, + "ewc_loss": 0.06999791413545609, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003484166518319398, + "grad_norm": 8.21267032623291, + "learning_rate": 1e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8419613838195801, + "num_tokens": 564100858.0, + "step": 14790 + }, + { + "epoch": 1.881567230632235, + "ewc_loss": 0.06977234035730362, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034616090124472976, + "grad_norm": 8.100140571594238, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8568581938743591, + "num_tokens": 564141477.0, + "step": 14791 + }, + { + "epoch": 1.8816944409108256, + "ewc_loss": 0.07003875821828842, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034882506588473916, + "grad_norm": 8.189532279968262, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8569353818893433, + "num_tokens": 564179271.0, + "step": 14792 + }, + { + "epoch": 1.8818216511894161, + "ewc_loss": 0.06981506943702698, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000346588232787326, + "grad_norm": 8.072587013244629, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8694822192192078, + "num_tokens": 564224336.0, + "step": 14793 + }, + { + "epoch": 1.8819488614680067, + "ewc_loss": 0.07017188519239426, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003501563332974911, + "grad_norm": 8.19913101196289, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8585925102233887, + "num_tokens": 564265157.0, + "step": 14794 + }, + { + "epoch": 1.8820760717465972, + "ewc_loss": 0.06984217464923859, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034685927676036954, + "grad_norm": 8.158458709716797, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8684830665588379, + "num_tokens": 564295023.0, + "step": 14795 + }, + { + "epoch": 1.8822032820251877, + "ewc_loss": 0.0701216608285904, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003496540884952992, + "grad_norm": 8.203157424926758, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8655262589454651, + "num_tokens": 564333700.0, + "step": 14796 + }, + { + "epoch": 1.8823304923037782, + "ewc_loss": 0.06976763904094696, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003461138694547117, + "grad_norm": 8.156055450439453, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8497040867805481, + "num_tokens": 564374342.0, + "step": 14797 + }, + { + "epoch": 1.8824577025823688, + "ewc_loss": 0.07004965841770172, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003489340888336301, + "grad_norm": 8.13658618927002, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.864804744720459, + "num_tokens": 564422877.0, + "step": 14798 + }, + { + "epoch": 1.8825849128609593, + "ewc_loss": 0.06989501416683197, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003473875985946506, + "grad_norm": 8.14762020111084, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.86379075050354, + "num_tokens": 564464713.0, + "step": 14799 + }, + { + "epoch": 1.8827121231395498, + "ewc_loss": 0.06994137167930603, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003478512226138264, + "grad_norm": 8.184480667114258, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8524264097213745, + "num_tokens": 564500667.0, + "step": 14800 + }, + { + "epoch": 1.8828393334181401, + "ewc_loss": 0.069614477455616, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034702368429861963, + "grad_norm": 8.14405345916748, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8718265295028687, + "num_tokens": 564541460.0, + "step": 14801 + }, + { + "epoch": 1.8829665436967307, + "ewc_loss": 0.06968462467193604, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034772520302794874, + "grad_norm": 8.140581130981445, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8545210957527161, + "num_tokens": 564580983.0, + "step": 14802 + }, + { + "epoch": 1.8830937539753212, + "ewc_loss": 0.06963112950325012, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003471901873126626, + "grad_norm": 8.135004997253418, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8727295994758606, + "num_tokens": 564615967.0, + "step": 14803 + }, + { + "epoch": 1.8832209642539117, + "ewc_loss": 0.06961188465356827, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003469977527856827, + "grad_norm": 8.1853666305542, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.854040265083313, + "num_tokens": 564653255.0, + "step": 14804 + }, + { + "epoch": 1.8833481745325023, + "ewc_loss": 0.0696074366569519, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003469532821327448, + "grad_norm": 8.115392684936523, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.859432578086853, + "num_tokens": 564688420.0, + "step": 14805 + }, + { + "epoch": 1.8834753848110928, + "ewc_loss": 0.06966422498226166, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034752118517644703, + "grad_norm": 8.130666732788086, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8561996817588806, + "num_tokens": 564733123.0, + "step": 14806 + }, + { + "epoch": 1.883602595089683, + "ewc_loss": 0.06956450641155243, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034652394242584705, + "grad_norm": 8.184370994567871, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8707078695297241, + "num_tokens": 564767712.0, + "step": 14807 + }, + { + "epoch": 1.8837298053682736, + "ewc_loss": 0.06952862441539764, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000346165121300146, + "grad_norm": 8.09130859375, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8603665828704834, + "num_tokens": 564810663.0, + "step": 14808 + }, + { + "epoch": 1.8838570156468641, + "ewc_loss": 0.06977908313274384, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034866973874159157, + "grad_norm": 8.144918441772461, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.860284686088562, + "num_tokens": 564848689.0, + "step": 14809 + }, + { + "epoch": 1.8839842259254547, + "ewc_loss": 0.06956641376018524, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034654306364245713, + "grad_norm": 8.08259391784668, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8504176735877991, + "num_tokens": 564887426.0, + "step": 14810 + }, + { + "epoch": 1.8841114362040452, + "ewc_loss": 0.0698339194059372, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003492180840112269, + "grad_norm": 8.15703296661377, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8671170473098755, + "num_tokens": 564924852.0, + "step": 14811 + }, + { + "epoch": 1.8842386464826357, + "ewc_loss": 0.0699409544467926, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034784708986990154, + "grad_norm": 8.141051292419434, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8599905967712402, + "num_tokens": 564964201.0, + "step": 14812 + }, + { + "epoch": 1.8843658567612263, + "ewc_loss": 0.06989000737667084, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003497789439279586, + "grad_norm": 8.142459869384766, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8745304942131042, + "num_tokens": 565004939.0, + "step": 14813 + }, + { + "epoch": 1.8844930670398168, + "ewc_loss": 0.06985129415988922, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034939186298288405, + "grad_norm": 8.136017799377441, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.86734938621521, + "num_tokens": 565044067.0, + "step": 14814 + }, + { + "epoch": 1.8846202773184073, + "ewc_loss": 0.06985294818878174, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003494083939585835, + "grad_norm": 8.167984008789062, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8657965660095215, + "num_tokens": 565082114.0, + "step": 14815 + }, + { + "epoch": 1.8847474875969978, + "ewc_loss": 0.06981591880321503, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034903810592368245, + "grad_norm": 8.124375343322754, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8635597229003906, + "num_tokens": 565118350.0, + "step": 14816 + }, + { + "epoch": 1.8848746978755884, + "ewc_loss": 0.06989798694849014, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034985877573490143, + "grad_norm": 8.146162986755371, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8562326431274414, + "num_tokens": 565153313.0, + "step": 14817 + }, + { + "epoch": 1.885001908154179, + "ewc_loss": 0.06987646222114563, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003496435238048434, + "grad_norm": 8.15323257446289, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8509876728057861, + "num_tokens": 565191332.0, + "step": 14818 + }, + { + "epoch": 1.8851291184327694, + "ewc_loss": 0.06986422836780548, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034952114219777286, + "grad_norm": 8.183004379272461, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.859890341758728, + "num_tokens": 565229937.0, + "step": 14819 + }, + { + "epoch": 1.88525632871136, + "ewc_loss": 0.0697670429944992, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003485493070911616, + "grad_norm": 8.151458740234375, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8577014803886414, + "num_tokens": 565266271.0, + "step": 14820 + }, + { + "epoch": 1.8853835389899505, + "ewc_loss": 0.0699203610420227, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003500824677757919, + "grad_norm": 8.185843467712402, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8610137104988098, + "num_tokens": 565300593.0, + "step": 14821 + }, + { + "epoch": 1.885510749268541, + "ewc_loss": 0.06962784379720688, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034715732908807695, + "grad_norm": 8.091920852661133, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8616743683815002, + "num_tokens": 565341801.0, + "step": 14822 + }, + { + "epoch": 1.8856379595471315, + "ewc_loss": 0.06994861364364624, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003503650368656963, + "grad_norm": 8.193387031555176, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8808884620666504, + "num_tokens": 565374380.0, + "step": 14823 + }, + { + "epoch": 1.885765169825722, + "ewc_loss": 0.06950299441814423, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000345908832969144, + "grad_norm": 8.097070693969727, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.856155514717102, + "num_tokens": 565413096.0, + "step": 14824 + }, + { + "epoch": 1.8858923801043126, + "ewc_loss": 0.07022931426763535, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003507306391838938, + "grad_norm": 8.144617080688477, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8724794387817383, + "num_tokens": 565453292.0, + "step": 14825 + }, + { + "epoch": 1.886019590382903, + "ewc_loss": 0.06984911113977432, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003469286020845175, + "grad_norm": 8.091445922851562, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8566290140151978, + "num_tokens": 565489311.0, + "step": 14826 + }, + { + "epoch": 1.8861468006614934, + "ewc_loss": 0.07020524144172668, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003504899505060166, + "grad_norm": 8.190771102905273, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8466156125068665, + "num_tokens": 565526293.0, + "step": 14827 + }, + { + "epoch": 1.886274010940084, + "ewc_loss": 0.06998756527900696, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034831318771466613, + "grad_norm": 8.123359680175781, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8554361462593079, + "num_tokens": 565569215.0, + "step": 14828 + }, + { + "epoch": 1.8864012212186745, + "ewc_loss": 0.07021111249923706, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035054859472438693, + "grad_norm": 8.149267196655273, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8735973834991455, + "num_tokens": 565610755.0, + "step": 14829 + }, + { + "epoch": 1.886528431497265, + "ewc_loss": 0.0701826736330986, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035026425030082464, + "grad_norm": 8.204566955566406, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8672460317611694, + "num_tokens": 565641964.0, + "step": 14830 + }, + { + "epoch": 1.8866556417758555, + "ewc_loss": 0.07002480328083038, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003486855421215296, + "grad_norm": 8.215127944946289, + "learning_rate": 1e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.8291229605674744, + "num_tokens": 565682852.0, + "step": 14831 + }, + { + "epoch": 1.8867828520544458, + "ewc_loss": 0.07000010460615158, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003484385379124433, + "grad_norm": 8.184629440307617, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8784510493278503, + "num_tokens": 565721352.0, + "step": 14832 + }, + { + "epoch": 1.8869100623330364, + "ewc_loss": 0.06995544582605362, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034799196873791516, + "grad_norm": 8.13683032989502, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8678163886070251, + "num_tokens": 565764045.0, + "step": 14833 + }, + { + "epoch": 1.887037272611627, + "ewc_loss": 0.07002250850200653, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003486625791992992, + "grad_norm": 8.170915603637695, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8714011907577515, + "num_tokens": 565797617.0, + "step": 14834 + }, + { + "epoch": 1.8871644828902174, + "ewc_loss": 0.07002197951078415, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034865731140598655, + "grad_norm": 8.149438858032227, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8612936735153198, + "num_tokens": 565839254.0, + "step": 14835 + }, + { + "epoch": 1.887291693168808, + "ewc_loss": 0.06991004943847656, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034753797808662057, + "grad_norm": 8.16303825378418, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8623442649841309, + "num_tokens": 565874742.0, + "step": 14836 + }, + { + "epoch": 1.8874189034473985, + "ewc_loss": 0.06999468803405762, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034838434658013284, + "grad_norm": 8.203144073486328, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8631513714790344, + "num_tokens": 565911043.0, + "step": 14837 + }, + { + "epoch": 1.887546113725989, + "ewc_loss": 0.0699104443192482, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003475419362075627, + "grad_norm": 8.106414794921875, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8584076166152954, + "num_tokens": 565950228.0, + "step": 14838 + }, + { + "epoch": 1.8876733240045795, + "ewc_loss": 0.07004197686910629, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003488572547212243, + "grad_norm": 8.15573787689209, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8744885921478271, + "num_tokens": 565982891.0, + "step": 14839 + }, + { + "epoch": 1.88780053428317, + "ewc_loss": 0.07000546902418137, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003484922053758055, + "grad_norm": 8.168672561645508, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8610910773277283, + "num_tokens": 566022601.0, + "step": 14840 + }, + { + "epoch": 1.8879277445617606, + "ewc_loss": 0.0699520856142044, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003479583247099072, + "grad_norm": 8.120386123657227, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8646105527877808, + "num_tokens": 566054928.0, + "step": 14841 + }, + { + "epoch": 1.8880549548403511, + "ewc_loss": 0.07019945979118347, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003504321211948991, + "grad_norm": 8.179795265197754, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8551344871520996, + "num_tokens": 566094900.0, + "step": 14842 + }, + { + "epoch": 1.8881821651189417, + "ewc_loss": 0.07003845274448395, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034882198087871075, + "grad_norm": 8.204648971557617, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8611640334129333, + "num_tokens": 566125230.0, + "step": 14843 + }, + { + "epoch": 1.8883093753975322, + "ewc_loss": 0.07008543610572815, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034929189132526517, + "grad_norm": 8.115281105041504, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.877853274345398, + "num_tokens": 566165320.0, + "step": 14844 + }, + { + "epoch": 1.8884365856761227, + "ewc_loss": 0.07010260969400406, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003494636039249599, + "grad_norm": 8.326973915100098, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8533528447151184, + "num_tokens": 566202769.0, + "step": 14845 + }, + { + "epoch": 1.8885637959547132, + "ewc_loss": 0.069804847240448, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034648593282327056, + "grad_norm": 8.083645820617676, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8655949234962463, + "num_tokens": 566245874.0, + "step": 14846 + }, + { + "epoch": 1.8886910062333038, + "ewc_loss": 0.07029788196086884, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035141632542945445, + "grad_norm": 8.23939323425293, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8438264727592468, + "num_tokens": 566284855.0, + "step": 14847 + }, + { + "epoch": 1.8888182165118943, + "ewc_loss": 0.06972908973693848, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034572844742797315, + "grad_norm": 8.16986083984375, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8546943664550781, + "num_tokens": 566323422.0, + "step": 14848 + }, + { + "epoch": 1.8889454267904848, + "ewc_loss": 0.07014758884906769, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003499133454170078, + "grad_norm": 8.215536117553711, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8715929388999939, + "num_tokens": 566364357.0, + "step": 14849 + }, + { + "epoch": 1.8890726370690751, + "ewc_loss": 0.06988324224948883, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034726993180811405, + "grad_norm": 8.15166187286377, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8592124581336975, + "num_tokens": 566404498.0, + "step": 14850 + }, + { + "epoch": 1.8891998473476657, + "ewc_loss": 0.06987098604440689, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003471473464742303, + "grad_norm": 8.175106048583984, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8589781522750854, + "num_tokens": 566442960.0, + "step": 14851 + }, + { + "epoch": 1.8893270576262562, + "ewc_loss": 0.0699593722820282, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034803125890903175, + "grad_norm": 8.17574405670166, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.872276246547699, + "num_tokens": 566475087.0, + "step": 14852 + }, + { + "epoch": 1.8894542679048467, + "ewc_loss": 0.06991918385028839, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034762939321808517, + "grad_norm": 8.182958602905273, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8532866835594177, + "num_tokens": 566514337.0, + "step": 14853 + }, + { + "epoch": 1.8895814781834372, + "ewc_loss": 0.0699034035205841, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003474715049378574, + "grad_norm": 8.203821182250977, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8516779541969299, + "num_tokens": 566553204.0, + "step": 14854 + }, + { + "epoch": 1.8897086884620278, + "ewc_loss": 0.0699424147605896, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003478616417851299, + "grad_norm": 8.221983909606934, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.86043781042099, + "num_tokens": 566589697.0, + "step": 14855 + }, + { + "epoch": 1.889835898740618, + "ewc_loss": 0.06991446763277054, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034758218680508435, + "grad_norm": 8.157554626464844, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8491665720939636, + "num_tokens": 566629680.0, + "step": 14856 + }, + { + "epoch": 1.8899631090192086, + "ewc_loss": 0.06991686671972275, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003476061683613807, + "grad_norm": 8.166274070739746, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8857302665710449, + "num_tokens": 566670931.0, + "step": 14857 + }, + { + "epoch": 1.8900903192977991, + "ewc_loss": 0.06992533802986145, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034769088961184025, + "grad_norm": 8.205853462219238, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8565056920051575, + "num_tokens": 566706673.0, + "step": 14858 + }, + { + "epoch": 1.8902175295763897, + "ewc_loss": 0.06984555721282959, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003468931245151907, + "grad_norm": 8.224264144897461, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8581335544586182, + "num_tokens": 566749503.0, + "step": 14859 + }, + { + "epoch": 1.8903447398549802, + "ewc_loss": 0.0699005201458931, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003474426921457052, + "grad_norm": 8.187447547912598, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8773685693740845, + "num_tokens": 566790700.0, + "step": 14860 + }, + { + "epoch": 1.8904719501335707, + "ewc_loss": 0.06966735422611237, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003451110969763249, + "grad_norm": 8.081862449645996, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8623871803283691, + "num_tokens": 566832995.0, + "step": 14861 + }, + { + "epoch": 1.8905991604121613, + "ewc_loss": 0.07001181691884995, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034855568083003163, + "grad_norm": 8.240113258361816, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8503648638725281, + "num_tokens": 566868190.0, + "step": 14862 + }, + { + "epoch": 1.8907263706907518, + "ewc_loss": 0.06975221633911133, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000345959619153291, + "grad_norm": 8.184013366699219, + "learning_rate": 1e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.8333122730255127, + "num_tokens": 566906637.0, + "step": 14863 + }, + { + "epoch": 1.8908535809693423, + "ewc_loss": 0.07002167403697968, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034865428460761905, + "grad_norm": 8.193307876586914, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.862722635269165, + "num_tokens": 566946006.0, + "step": 14864 + }, + { + "epoch": 1.8909807912479328, + "ewc_loss": 0.06984664499759674, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003469039511401206, + "grad_norm": 8.15725040435791, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8511720895767212, + "num_tokens": 566987726.0, + "step": 14865 + }, + { + "epoch": 1.8911080015265234, + "ewc_loss": 0.070023313164711, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034867064096033573, + "grad_norm": 8.19262981414795, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8781784772872925, + "num_tokens": 567027432.0, + "step": 14866 + }, + { + "epoch": 1.891235211805114, + "ewc_loss": 0.06983084976673126, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003467459755484015, + "grad_norm": 8.139787673950195, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8684043288230896, + "num_tokens": 567060607.0, + "step": 14867 + }, + { + "epoch": 1.8913624220837044, + "ewc_loss": 0.06997058540582657, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003481433668639511, + "grad_norm": 8.239130973815918, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8559906482696533, + "num_tokens": 567100691.0, + "step": 14868 + }, + { + "epoch": 1.891489632362295, + "ewc_loss": 0.06973956525325775, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003458331630099565, + "grad_norm": 8.100645065307617, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.880541205406189, + "num_tokens": 567139460.0, + "step": 14869 + }, + { + "epoch": 1.8916168426408855, + "ewc_loss": 0.07008752226829529, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003493126714602113, + "grad_norm": 8.192730903625488, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8514412641525269, + "num_tokens": 567185384.0, + "step": 14870 + }, + { + "epoch": 1.891744052919476, + "ewc_loss": 0.06979416310787201, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003463790926616639, + "grad_norm": 8.099379539489746, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8588758707046509, + "num_tokens": 567219898.0, + "step": 14871 + }, + { + "epoch": 1.8918712631980665, + "ewc_loss": 0.07016411423683167, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003500786842778325, + "grad_norm": 8.21867561340332, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8691661357879639, + "num_tokens": 567260322.0, + "step": 14872 + }, + { + "epoch": 1.891998473476657, + "ewc_loss": 0.06974858045578003, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003459232684690505, + "grad_norm": 8.145021438598633, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8583735227584839, + "num_tokens": 567293739.0, + "step": 14873 + }, + { + "epoch": 1.8921256837552476, + "ewc_loss": 0.0701373815536499, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003498113073874265, + "grad_norm": 8.227180480957031, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8651779890060425, + "num_tokens": 567327282.0, + "step": 14874 + }, + { + "epoch": 1.892252894033838, + "ewc_loss": 0.06994792819023132, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003479167935438454, + "grad_norm": 8.14022159576416, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8566462993621826, + "num_tokens": 567366402.0, + "step": 14875 + }, + { + "epoch": 1.8923801043124284, + "ewc_loss": 0.07013256847858429, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003497631405480206, + "grad_norm": 8.197436332702637, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8546273708343506, + "num_tokens": 567409924.0, + "step": 14876 + }, + { + "epoch": 1.892507314591019, + "ewc_loss": 0.06974935531616211, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003483724140096456, + "grad_norm": 8.1652250289917, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8670366406440735, + "num_tokens": 567450355.0, + "step": 14877 + }, + { + "epoch": 1.8926345248696095, + "ewc_loss": 0.06993918120861053, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035027071135118604, + "grad_norm": 8.216583251953125, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.876505434513092, + "num_tokens": 567487701.0, + "step": 14878 + }, + { + "epoch": 1.8927617351482, + "ewc_loss": 0.0699089989066124, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003475274716038257, + "grad_norm": 8.139060020446777, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8634876012802124, + "num_tokens": 567527619.0, + "step": 14879 + }, + { + "epoch": 1.8928889454267905, + "ewc_loss": 0.07008771598339081, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003517560544423759, + "grad_norm": 8.285069465637207, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8632215857505798, + "num_tokens": 567567016.0, + "step": 14880 + }, + { + "epoch": 1.8930161557053808, + "ewc_loss": 0.06988123059272766, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003472498501650989, + "grad_norm": 8.163469314575195, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8508312702178955, + "num_tokens": 567605465.0, + "step": 14881 + }, + { + "epoch": 1.8931433659839714, + "ewc_loss": 0.07030586153268814, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035149609902873635, + "grad_norm": 8.29920768737793, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8641509413719177, + "num_tokens": 567646680.0, + "step": 14882 + }, + { + "epoch": 1.893270576262562, + "ewc_loss": 0.06984519958496094, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003468895156402141, + "grad_norm": 9.283825874328613, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8640822172164917, + "num_tokens": 567685685.0, + "step": 14883 + }, + { + "epoch": 1.8933977865411524, + "ewc_loss": 0.06917068362236023, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034014429547823966, + "grad_norm": 7.991309642791748, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8512395620346069, + "num_tokens": 567717222.0, + "step": 14884 + }, + { + "epoch": 1.893524996819743, + "ewc_loss": 0.07133790850639343, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003618165501393378, + "grad_norm": 8.609210014343262, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8532716035842896, + "num_tokens": 567761050.0, + "step": 14885 + }, + { + "epoch": 1.8936522070983335, + "ewc_loss": 0.06862179189920425, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00033709683339111507, + "grad_norm": 7.94503116607666, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8587671518325806, + "num_tokens": 567801423.0, + "step": 14886 + }, + { + "epoch": 1.893779417376924, + "ewc_loss": 0.07129612565040588, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036384016857482493, + "grad_norm": 8.585049629211426, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8528910875320435, + "num_tokens": 567844614.0, + "step": 14887 + }, + { + "epoch": 1.8939066276555145, + "ewc_loss": 0.0694112777709961, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034255030914209783, + "grad_norm": 8.043646812438965, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8648096323013306, + "num_tokens": 567878803.0, + "step": 14888 + }, + { + "epoch": 1.894033837934105, + "ewc_loss": 0.07117176055908203, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003601551288738847, + "grad_norm": 8.489762306213379, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8751809597015381, + "num_tokens": 567909419.0, + "step": 14889 + }, + { + "epoch": 1.8941610482126956, + "ewc_loss": 0.06969119608402252, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003453494282439351, + "grad_norm": 8.13926887512207, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8631650805473328, + "num_tokens": 567949221.0, + "step": 14890 + }, + { + "epoch": 1.8942882584912861, + "ewc_loss": 0.07085412740707397, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035697873681783676, + "grad_norm": 8.42158031463623, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8789111375808716, + "num_tokens": 567985101.0, + "step": 14891 + }, + { + "epoch": 1.8944154687698767, + "ewc_loss": 0.06987026333808899, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034714018693193793, + "grad_norm": 8.182394981384277, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8467131853103638, + "num_tokens": 568030542.0, + "step": 14892 + }, + { + "epoch": 1.8945426790484672, + "ewc_loss": 0.07041576504707336, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003525952051859349, + "grad_norm": 9.36983871459961, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.863984227180481, + "num_tokens": 568070723.0, + "step": 14893 + }, + { + "epoch": 1.8946698893270577, + "ewc_loss": 0.06960295140743256, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034446705831214786, + "grad_norm": 8.02733039855957, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8556657433509827, + "num_tokens": 568113444.0, + "step": 14894 + }, + { + "epoch": 1.8947970996056482, + "ewc_loss": 0.0715407133102417, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036384459235705435, + "grad_norm": 8.617844581604004, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8649634122848511, + "num_tokens": 568152646.0, + "step": 14895 + }, + { + "epoch": 1.8949243098842388, + "ewc_loss": 0.06953903287649155, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034382782177999616, + "grad_norm": 8.088468551635742, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8619683384895325, + "num_tokens": 568187046.0, + "step": 14896 + }, + { + "epoch": 1.8950515201628293, + "ewc_loss": 0.07138650119304657, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003623025550041348, + "grad_norm": 8.54875373840332, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8578841686248779, + "num_tokens": 568225052.0, + "step": 14897 + }, + { + "epoch": 1.8951787304414198, + "ewc_loss": 0.06984606385231018, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003468981303740293, + "grad_norm": 8.216835975646973, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8553245067596436, + "num_tokens": 568258299.0, + "step": 14898 + }, + { + "epoch": 1.8953059407200101, + "ewc_loss": 0.07042156159877777, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003550945548340678, + "grad_norm": 8.363012313842773, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8693295121192932, + "num_tokens": 568297137.0, + "step": 14899 + }, + { + "epoch": 1.8954331509986007, + "ewc_loss": 0.07020018249750137, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035043933894485235, + "grad_norm": 8.326851844787598, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8831905126571655, + "num_tokens": 568331102.0, + "step": 14900 + }, + { + "epoch": 1.8955603612771912, + "ewc_loss": 0.07002191245555878, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003486566129140556, + "grad_norm": 8.311695098876953, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8597003221511841, + "num_tokens": 568373926.0, + "step": 14901 + }, + { + "epoch": 1.8956875715557817, + "ewc_loss": 0.06984653323888779, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003493442200124264, + "grad_norm": 8.358251571655273, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8722704648971558, + "num_tokens": 568406470.0, + "step": 14902 + }, + { + "epoch": 1.8958147818343722, + "ewc_loss": 0.06945794820785522, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003454583929851651, + "grad_norm": 8.211823463439941, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.869968056678772, + "num_tokens": 568443452.0, + "step": 14903 + }, + { + "epoch": 1.8959419921129628, + "ewc_loss": 0.07012095302343369, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034964701626449823, + "grad_norm": 8.320101737976074, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.856682300567627, + "num_tokens": 568480085.0, + "step": 14904 + }, + { + "epoch": 1.896069202391553, + "ewc_loss": 0.0696336179971695, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003447736380621791, + "grad_norm": 8.214357376098633, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8597310781478882, + "num_tokens": 568519191.0, + "step": 14905 + }, + { + "epoch": 1.8961964126701436, + "ewc_loss": 0.07000870257616043, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034852451062761247, + "grad_norm": 8.307892799377441, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8680185079574585, + "num_tokens": 568554224.0, + "step": 14906 + }, + { + "epoch": 1.8963236229487341, + "ewc_loss": 0.06943017244338989, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003451805969234556, + "grad_norm": 8.224979400634766, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8793982267379761, + "num_tokens": 568586102.0, + "step": 14907 + }, + { + "epoch": 1.8964508332273247, + "ewc_loss": 0.06962595880031586, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003471385280136019, + "grad_norm": 8.294182777404785, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8571445345878601, + "num_tokens": 568625224.0, + "step": 14908 + }, + { + "epoch": 1.8965780435059152, + "ewc_loss": 0.06969285011291504, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034536601742729545, + "grad_norm": 8.225688934326172, + "learning_rate": 1e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8423105478286743, + "num_tokens": 568665588.0, + "step": 14909 + }, + { + "epoch": 1.8967052537845057, + "ewc_loss": 0.0695740282535553, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034661914105527103, + "grad_norm": 8.204585075378418, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8793103694915771, + "num_tokens": 568698408.0, + "step": 14910 + }, + { + "epoch": 1.8968324640630962, + "ewc_loss": 0.06977742910385132, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034621174563653767, + "grad_norm": 8.264113426208496, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8513995409011841, + "num_tokens": 568731982.0, + "step": 14911 + }, + { + "epoch": 1.8969596743416868, + "ewc_loss": 0.06977221369743347, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034615962067618966, + "grad_norm": 8.198163032531738, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8624378442764282, + "num_tokens": 568769388.0, + "step": 14912 + }, + { + "epoch": 1.8970868846202773, + "ewc_loss": 0.06997155398130417, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003481530584394932, + "grad_norm": 8.23901653289795, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.869154691696167, + "num_tokens": 568810329.0, + "step": 14913 + }, + { + "epoch": 1.8972140948988678, + "ewc_loss": 0.06959058344364166, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034434336703270674, + "grad_norm": 8.16005802154541, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8665118217468262, + "num_tokens": 568852241.0, + "step": 14914 + }, + { + "epoch": 1.8973413051774584, + "ewc_loss": 0.07001596689224243, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003485972119960934, + "grad_norm": 8.23641586303711, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8584586381912231, + "num_tokens": 568889660.0, + "step": 14915 + }, + { + "epoch": 1.8974685154560489, + "ewc_loss": 0.06972616910934448, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003456992271821946, + "grad_norm": 8.158502578735352, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8599016666412354, + "num_tokens": 568934489.0, + "step": 14916 + }, + { + "epoch": 1.8975957257346394, + "ewc_loss": 0.07003005594015121, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003487380454316735, + "grad_norm": 8.246350288391113, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8837873339653015, + "num_tokens": 568971904.0, + "step": 14917 + }, + { + "epoch": 1.89772293601323, + "ewc_loss": 0.06979766488075256, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034641410456970334, + "grad_norm": 8.135244369506836, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8446802496910095, + "num_tokens": 569011863.0, + "step": 14918 + }, + { + "epoch": 1.8978501462918205, + "ewc_loss": 0.0701943039894104, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035038054920732975, + "grad_norm": 8.304969787597656, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8837624192237854, + "num_tokens": 569050295.0, + "step": 14919 + }, + { + "epoch": 1.897977356570411, + "ewc_loss": 0.06969200074672699, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003453575191088021, + "grad_norm": 8.14453411102295, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.857877790927887, + "num_tokens": 569087013.0, + "step": 14920 + }, + { + "epoch": 1.8981045668490015, + "ewc_loss": 0.07036658376455307, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035210332134738564, + "grad_norm": 8.366948127746582, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8569130897521973, + "num_tokens": 569125226.0, + "step": 14921 + }, + { + "epoch": 1.898231777127592, + "ewc_loss": 0.06969117373228073, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034534925362095237, + "grad_norm": 8.136767387390137, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8599003553390503, + "num_tokens": 569161910.0, + "step": 14922 + }, + { + "epoch": 1.8983589874061826, + "ewc_loss": 0.07035212963819504, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003519587917253375, + "grad_norm": 8.31897258758545, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.865324079990387, + "num_tokens": 569196519.0, + "step": 14923 + }, + { + "epoch": 1.898486197684773, + "ewc_loss": 0.06979301571846008, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034636768396012485, + "grad_norm": 8.158985137939453, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8596765995025635, + "num_tokens": 569234404.0, + "step": 14924 + }, + { + "epoch": 1.8986134079633634, + "ewc_loss": 0.07025521993637085, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003509897505864501, + "grad_norm": 8.282994270324707, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8547883033752441, + "num_tokens": 569272728.0, + "step": 14925 + }, + { + "epoch": 1.898740618241954, + "ewc_loss": 0.06982173025608063, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034665479324758053, + "grad_norm": 8.156048774719238, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8539038896560669, + "num_tokens": 569315434.0, + "step": 14926 + }, + { + "epoch": 1.8988678285205445, + "ewc_loss": 0.07030293345451355, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035146684967912734, + "grad_norm": 8.304664611816406, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8466368913650513, + "num_tokens": 569357473.0, + "step": 14927 + }, + { + "epoch": 1.898995038799135, + "ewc_loss": 0.06974644958972931, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034590199356898665, + "grad_norm": 8.055423736572266, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8501663208007812, + "num_tokens": 569398747.0, + "step": 14928 + }, + { + "epoch": 1.8991222490777255, + "ewc_loss": 0.07063568383455276, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035479431971907616, + "grad_norm": 8.393911361694336, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8706929087638855, + "num_tokens": 569434855.0, + "step": 14929 + }, + { + "epoch": 1.8992494593563158, + "ewc_loss": 0.06978130340576172, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003462504828348756, + "grad_norm": 8.16248893737793, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8628610372543335, + "num_tokens": 569475236.0, + "step": 14930 + }, + { + "epoch": 1.8993766696349064, + "ewc_loss": 0.07051564007997513, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003535939031280577, + "grad_norm": 8.287881851196289, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8625752925872803, + "num_tokens": 569513599.0, + "step": 14931 + }, + { + "epoch": 1.899503879913497, + "ewc_loss": 0.06982980668544769, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003467355272732675, + "grad_norm": 8.196589469909668, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8703190088272095, + "num_tokens": 569559059.0, + "step": 14932 + }, + { + "epoch": 1.8996310901920874, + "ewc_loss": 0.07031578570604324, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003515953430905938, + "grad_norm": 8.314475059509277, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8808127641677856, + "num_tokens": 569595240.0, + "step": 14933 + }, + { + "epoch": 1.899758300470678, + "ewc_loss": 0.0699944943189621, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003483824839349836, + "grad_norm": 8.24341869354248, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8518697619438171, + "num_tokens": 569629336.0, + "step": 14934 + }, + { + "epoch": 1.8998855107492685, + "ewc_loss": 0.07011181116104126, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034955560113303363, + "grad_norm": 8.318597793579102, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8742719292640686, + "num_tokens": 569663613.0, + "step": 14935 + }, + { + "epoch": 1.900012721027859, + "ewc_loss": 0.06992568075656891, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034769432386383414, + "grad_norm": 8.228096008300781, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8473199009895325, + "num_tokens": 569702884.0, + "step": 14936 + }, + { + "epoch": 1.9001399313064495, + "ewc_loss": 0.07020118087530136, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035044929245486856, + "grad_norm": 8.225255012512207, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8776953816413879, + "num_tokens": 569744618.0, + "step": 14937 + }, + { + "epoch": 1.90026714158504, + "ewc_loss": 0.06995385885238647, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034797610715031624, + "grad_norm": 8.280689239501953, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8707475662231445, + "num_tokens": 569777083.0, + "step": 14938 + }, + { + "epoch": 1.9003943518636306, + "ewc_loss": 0.06999608874320984, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034839840373024344, + "grad_norm": 8.268239974975586, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8692700862884521, + "num_tokens": 569809652.0, + "step": 14939 + }, + { + "epoch": 1.9005215621422211, + "ewc_loss": 0.06965155154466629, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003473944088909775, + "grad_norm": 8.256564140319824, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8666961193084717, + "num_tokens": 569846465.0, + "step": 14940 + }, + { + "epoch": 1.9006487724208116, + "ewc_loss": 0.06991557031869888, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000347593188052997, + "grad_norm": 8.203580856323242, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.855232834815979, + "num_tokens": 569883655.0, + "step": 14941 + }, + { + "epoch": 1.9007759826994022, + "ewc_loss": 0.06980596482753754, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003489385126158595, + "grad_norm": 8.235090255737305, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8617997765541077, + "num_tokens": 569923358.0, + "step": 14942 + }, + { + "epoch": 1.9009031929779927, + "ewc_loss": 0.06997671723365784, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034820465953089297, + "grad_norm": 8.19854736328125, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8592637181282043, + "num_tokens": 569961393.0, + "step": 14943 + }, + { + "epoch": 1.9010304032565832, + "ewc_loss": 0.07013154029846191, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034975289599969983, + "grad_norm": 8.240217208862305, + "learning_rate": 1e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8350183367729187, + "num_tokens": 570002092.0, + "step": 14944 + }, + { + "epoch": 1.9011576135351738, + "ewc_loss": 0.06998775899410248, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034831513767130673, + "grad_norm": 8.177199363708496, + "learning_rate": 1e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8465856313705444, + "num_tokens": 570037699.0, + "step": 14945 + }, + { + "epoch": 1.9012848238137643, + "ewc_loss": 0.07016987353563309, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035013625165447593, + "grad_norm": 8.19593620300293, + "learning_rate": 1e-06, + "loss": 0.542, + "mean_token_accuracy": 0.839530348777771, + "num_tokens": 570077693.0, + "step": 14946 + }, + { + "epoch": 1.9014120340923548, + "ewc_loss": 0.07015235722064972, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003499611048027873, + "grad_norm": 8.238293647766113, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8819518089294434, + "num_tokens": 570115354.0, + "step": 14947 + }, + { + "epoch": 1.9015392443709451, + "ewc_loss": 0.07006165385246277, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034905405482277274, + "grad_norm": 8.208807945251465, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8705472350120544, + "num_tokens": 570154387.0, + "step": 14948 + }, + { + "epoch": 1.9016664546495357, + "ewc_loss": 0.0701577365398407, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003500149177853018, + "grad_norm": 8.250844955444336, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8575316667556763, + "num_tokens": 570189743.0, + "step": 14949 + }, + { + "epoch": 1.9017936649281262, + "ewc_loss": 0.07003092765808105, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003487467474769801, + "grad_norm": 8.264446258544922, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8753800392150879, + "num_tokens": 570224009.0, + "step": 14950 + }, + { + "epoch": 1.9019208752067167, + "ewc_loss": 0.06986407935619354, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034707828308455646, + "grad_norm": 8.178930282592773, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8700798749923706, + "num_tokens": 570266703.0, + "step": 14951 + }, + { + "epoch": 1.9020480854853072, + "ewc_loss": 0.07017068564891815, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003501444007270038, + "grad_norm": 8.222122192382812, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8628535270690918, + "num_tokens": 570308241.0, + "step": 14952 + }, + { + "epoch": 1.9021752957638978, + "ewc_loss": 0.06988826394081116, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003473201650194824, + "grad_norm": 8.141674041748047, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8615639805793762, + "num_tokens": 570346931.0, + "step": 14953 + }, + { + "epoch": 1.902302506042488, + "ewc_loss": 0.07017092406749725, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003501467581372708, + "grad_norm": 8.198476791381836, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8764435648918152, + "num_tokens": 570380373.0, + "step": 14954 + }, + { + "epoch": 1.9024297163210786, + "ewc_loss": 0.07004496455192566, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003488871443551034, + "grad_norm": 8.104104042053223, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8744356632232666, + "num_tokens": 570420106.0, + "step": 14955 + }, + { + "epoch": 1.9025569265996691, + "ewc_loss": 0.07043062895536423, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003527437802404165, + "grad_norm": 8.24649715423584, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8846603035926819, + "num_tokens": 570464534.0, + "step": 14956 + }, + { + "epoch": 1.9026841368782597, + "ewc_loss": 0.0699530765414238, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003479682782199234, + "grad_norm": 8.125456809997559, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8526556491851807, + "num_tokens": 570505734.0, + "step": 14957 + }, + { + "epoch": 1.9028113471568502, + "ewc_loss": 0.07023225724697113, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003532014670781791, + "grad_norm": 8.256569862365723, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8724322319030762, + "num_tokens": 570545913.0, + "step": 14958 + }, + { + "epoch": 1.9029385574354407, + "ewc_loss": 0.06968983262777328, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003477772406768054, + "grad_norm": 8.160419464111328, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8708745241165161, + "num_tokens": 570584397.0, + "step": 14959 + }, + { + "epoch": 1.9030657677140312, + "ewc_loss": 0.07003249228000641, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003512038092594594, + "grad_norm": 8.20438289642334, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8691307306289673, + "num_tokens": 570622285.0, + "step": 14960 + }, + { + "epoch": 1.9031929779926218, + "ewc_loss": 0.07007275521755219, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034916502772830427, + "grad_norm": 8.226298332214355, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8475126028060913, + "num_tokens": 570656331.0, + "step": 14961 + }, + { + "epoch": 1.9033201882712123, + "ewc_loss": 0.06984272599220276, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003493062104098499, + "grad_norm": 8.134289741516113, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8690590858459473, + "num_tokens": 570699944.0, + "step": 14962 + }, + { + "epoch": 1.9034473985498028, + "ewc_loss": 0.07029801607131958, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003514176933094859, + "grad_norm": 8.248383522033691, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8582779765129089, + "num_tokens": 570737649.0, + "step": 14963 + }, + { + "epoch": 1.9035746088283934, + "ewc_loss": 0.07007724046707153, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003492098767310381, + "grad_norm": 8.218087196350098, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8620115518569946, + "num_tokens": 570773193.0, + "step": 14964 + }, + { + "epoch": 1.9037018191069839, + "ewc_loss": 0.070229172706604, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003507292130962014, + "grad_norm": 8.215417861938477, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8566138744354248, + "num_tokens": 570809117.0, + "step": 14965 + }, + { + "epoch": 1.9038290293855744, + "ewc_loss": 0.07007536292076111, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034919107565656304, + "grad_norm": 8.202126502990723, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8750231862068176, + "num_tokens": 570844268.0, + "step": 14966 + }, + { + "epoch": 1.903956239664165, + "ewc_loss": 0.07010477781295776, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003494852571748197, + "grad_norm": 8.15615177154541, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.854559063911438, + "num_tokens": 570890166.0, + "step": 14967 + }, + { + "epoch": 1.9040834499427555, + "ewc_loss": 0.06996996700763702, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003505786298774183, + "grad_norm": 8.259256362915039, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8570635318756104, + "num_tokens": 570921419.0, + "step": 14968 + }, + { + "epoch": 1.904210660221346, + "ewc_loss": 0.06977866590023041, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034866551868617535, + "grad_norm": 8.126981735229492, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8540700674057007, + "num_tokens": 570966444.0, + "step": 14969 + }, + { + "epoch": 1.9043378704999365, + "ewc_loss": 0.07039090991020203, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003523466002661735, + "grad_norm": 8.211223602294922, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8733277916908264, + "num_tokens": 571011656.0, + "step": 14970 + }, + { + "epoch": 1.904465080778527, + "ewc_loss": 0.07004067301750183, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003488442744128406, + "grad_norm": 8.16295051574707, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8687825202941895, + "num_tokens": 571047828.0, + "step": 14971 + }, + { + "epoch": 1.9045922910571176, + "ewc_loss": 0.07039555162191391, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035239302087575197, + "grad_norm": 8.22562026977539, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8608742952346802, + "num_tokens": 571084541.0, + "step": 14972 + }, + { + "epoch": 1.9047195013357079, + "ewc_loss": 0.07010738551616669, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003495113633107394, + "grad_norm": 8.147907257080078, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8566550016403198, + "num_tokens": 571120978.0, + "step": 14973 + }, + { + "epoch": 1.9048467116142984, + "ewc_loss": 0.07034675776958466, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035190509515814483, + "grad_norm": 8.245087623596191, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8753896951675415, + "num_tokens": 571160181.0, + "step": 14974 + }, + { + "epoch": 1.904973921892889, + "ewc_loss": 0.07023368775844574, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003507743531372398, + "grad_norm": 8.204224586486816, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8673980236053467, + "num_tokens": 571194648.0, + "step": 14975 + }, + { + "epoch": 1.9051011321714795, + "ewc_loss": 0.07030008733272552, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003514383570291102, + "grad_norm": 8.18187427520752, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8567136526107788, + "num_tokens": 571238449.0, + "step": 14976 + }, + { + "epoch": 1.90522834245007, + "ewc_loss": 0.07018537819385529, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035029128775931895, + "grad_norm": 8.245051383972168, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8631551861763, + "num_tokens": 571277266.0, + "step": 14977 + }, + { + "epoch": 1.9053555527286605, + "ewc_loss": 0.07018972933292389, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003503348270896822, + "grad_norm": 8.175721168518066, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8746738433837891, + "num_tokens": 571316891.0, + "step": 14978 + }, + { + "epoch": 1.9054827630072508, + "ewc_loss": 0.06998725235462189, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003507514193188399, + "grad_norm": 8.244637489318848, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8641146421432495, + "num_tokens": 571353097.0, + "step": 14979 + }, + { + "epoch": 1.9056099732858414, + "ewc_loss": 0.06977124512195587, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003485913039185107, + "grad_norm": 8.20838451385498, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8772904276847839, + "num_tokens": 571392217.0, + "step": 14980 + }, + { + "epoch": 1.905737183564432, + "ewc_loss": 0.06988869607448578, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034976587630808353, + "grad_norm": 8.273765563964844, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8720603585243225, + "num_tokens": 571430440.0, + "step": 14981 + }, + { + "epoch": 1.9058643938430224, + "ewc_loss": 0.06971397995948792, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003480187151581049, + "grad_norm": 8.15973949432373, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8597733974456787, + "num_tokens": 571472002.0, + "step": 14982 + }, + { + "epoch": 1.905991604121613, + "ewc_loss": 0.06997041404247284, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003505829954519868, + "grad_norm": 8.288040161132812, + "learning_rate": 1e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8440008163452148, + "num_tokens": 571509368.0, + "step": 14983 + }, + { + "epoch": 1.9061188144002035, + "ewc_loss": 0.06963610649108887, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034724001307040453, + "grad_norm": 8.19304370880127, + "learning_rate": 1e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8484743237495422, + "num_tokens": 571543898.0, + "step": 14984 + }, + { + "epoch": 1.906246024678794, + "ewc_loss": 0.07019273936748505, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035036486224271357, + "grad_norm": 8.196459770202637, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8576231002807617, + "num_tokens": 571584395.0, + "step": 14985 + }, + { + "epoch": 1.9063732349573845, + "ewc_loss": 0.06981252133846283, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003490041708573699, + "grad_norm": 8.192758560180664, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8635600209236145, + "num_tokens": 571624792.0, + "step": 14986 + }, + { + "epoch": 1.906500445235975, + "ewc_loss": 0.07018442451953888, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003502817125990987, + "grad_norm": 8.181540489196777, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8652864694595337, + "num_tokens": 571667197.0, + "step": 14987 + }, + { + "epoch": 1.9066276555145656, + "ewc_loss": 0.07029323279857635, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035136978840455413, + "grad_norm": 8.22320556640625, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8572160005569458, + "num_tokens": 571703648.0, + "step": 14988 + }, + { + "epoch": 1.9067548657931561, + "ewc_loss": 0.06992606818675995, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035013954038731754, + "grad_norm": 8.225042343139648, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8538370728492737, + "num_tokens": 571738980.0, + "step": 14989 + }, + { + "epoch": 1.9068820760717466, + "ewc_loss": 0.07017695903778076, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003502071194816381, + "grad_norm": 8.242769241333008, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8646810054779053, + "num_tokens": 571774540.0, + "step": 14990 + }, + { + "epoch": 1.9070092863503372, + "ewc_loss": 0.06986569613218307, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034953586873598397, + "grad_norm": 8.17270565032959, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.870011031627655, + "num_tokens": 571805286.0, + "step": 14991 + }, + { + "epoch": 1.9071364966289277, + "ewc_loss": 0.07028847932815552, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035132229095324874, + "grad_norm": 8.220741271972656, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8564890027046204, + "num_tokens": 571847936.0, + "step": 14992 + }, + { + "epoch": 1.9072637069075182, + "ewc_loss": 0.07010222971439362, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000349459849530831, + "grad_norm": 8.240039825439453, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8585835099220276, + "num_tokens": 571885454.0, + "step": 14993 + }, + { + "epoch": 1.9073909171861088, + "ewc_loss": 0.07011233270168304, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034956078161485493, + "grad_norm": 8.15284252166748, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8614472150802612, + "num_tokens": 571924870.0, + "step": 14994 + }, + { + "epoch": 1.9075181274646993, + "ewc_loss": 0.07035587728023529, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003519962483551353, + "grad_norm": 8.318930625915527, + "learning_rate": 1e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8404048681259155, + "num_tokens": 571957870.0, + "step": 14995 + }, + { + "epoch": 1.9076453377432898, + "ewc_loss": 0.06987299025058746, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003471674572210759, + "grad_norm": 8.19082260131836, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8545461297035217, + "num_tokens": 571993645.0, + "step": 14996 + }, + { + "epoch": 1.9077725480218801, + "ewc_loss": 0.07023955881595612, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035083305556327105, + "grad_norm": 8.232170104980469, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8713015913963318, + "num_tokens": 572028594.0, + "step": 14997 + }, + { + "epoch": 1.9078997583004706, + "ewc_loss": 0.07000037282705307, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003484412154648453, + "grad_norm": 8.231691360473633, + "learning_rate": 1e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.835128903388977, + "num_tokens": 572066971.0, + "step": 14998 + }, + { + "epoch": 1.9080269685790612, + "ewc_loss": 0.07014492154121399, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034988674451597035, + "grad_norm": 8.23989486694336, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.868179202079773, + "num_tokens": 572107609.0, + "step": 14999 + }, + { + "epoch": 1.9081541788576517, + "ewc_loss": 0.07003509253263474, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034878842416219413, + "grad_norm": 8.188674926757812, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8700879216194153, + "num_tokens": 572146940.0, + "step": 15000 + }, + { + "epoch": 1.9082813891362422, + "ewc_loss": 0.07017497718334198, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003501872415654361, + "grad_norm": 8.249312400817871, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8884856700897217, + "num_tokens": 572184010.0, + "step": 15001 + }, + { + "epoch": 1.9084085994148328, + "ewc_loss": 0.06958816945552826, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034676064387895167, + "grad_norm": 8.204787254333496, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8468199372291565, + "num_tokens": 572223184.0, + "step": 15002 + }, + { + "epoch": 1.908535809693423, + "ewc_loss": 0.07000569254159927, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003484944172669202, + "grad_norm": 8.257893562316895, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8628920912742615, + "num_tokens": 572261770.0, + "step": 15003 + }, + { + "epoch": 1.9086630199720136, + "ewc_loss": 0.06991468369960785, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003475843695923686, + "grad_norm": 8.16789436340332, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8650628328323364, + "num_tokens": 572303229.0, + "step": 15004 + }, + { + "epoch": 1.9087902302506041, + "ewc_loss": 0.07004350423812866, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034887256333604455, + "grad_norm": 8.256000518798828, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8497984409332275, + "num_tokens": 572337839.0, + "step": 15005 + }, + { + "epoch": 1.9089174405291947, + "ewc_loss": 0.06987376511096954, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003471751988399774, + "grad_norm": 8.235072135925293, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.861676812171936, + "num_tokens": 572373604.0, + "step": 15006 + }, + { + "epoch": 1.9090446508077852, + "ewc_loss": 0.07003673911094666, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034880489693023264, + "grad_norm": 8.239524841308594, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8694071769714355, + "num_tokens": 572410425.0, + "step": 15007 + }, + { + "epoch": 1.9091718610863757, + "ewc_loss": 0.06996745616197586, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034811205114237964, + "grad_norm": 8.207615852355957, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8718860745429993, + "num_tokens": 572443998.0, + "step": 15008 + }, + { + "epoch": 1.9092990713649662, + "ewc_loss": 0.06992772966623306, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003477148129604757, + "grad_norm": 8.215213775634766, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8499122858047485, + "num_tokens": 572485337.0, + "step": 15009 + }, + { + "epoch": 1.9094262816435568, + "ewc_loss": 0.06993694603443146, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003478070138953626, + "grad_norm": 8.219382286071777, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8522154092788696, + "num_tokens": 572528273.0, + "step": 15010 + }, + { + "epoch": 1.9095534919221473, + "ewc_loss": 0.07003790140151978, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003488164802547544, + "grad_norm": 8.214094161987305, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8662647604942322, + "num_tokens": 572569467.0, + "step": 15011 + }, + { + "epoch": 1.9096807022007378, + "ewc_loss": 0.0700325071811676, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003487625508569181, + "grad_norm": 8.300313949584961, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8703910708427429, + "num_tokens": 572609626.0, + "step": 15012 + }, + { + "epoch": 1.9098079124793284, + "ewc_loss": 0.06981471180915833, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034658456570468843, + "grad_norm": 8.144771575927734, + "learning_rate": 1e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8468062877655029, + "num_tokens": 572652907.0, + "step": 15013 + }, + { + "epoch": 1.9099351227579189, + "ewc_loss": 0.07014046609401703, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003498421283438802, + "grad_norm": 8.364568710327148, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8703495264053345, + "num_tokens": 572687460.0, + "step": 15014 + }, + { + "epoch": 1.9100623330365094, + "ewc_loss": 0.06968393921852112, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034527687239460647, + "grad_norm": 8.157089233398438, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8667619228363037, + "num_tokens": 572723937.0, + "step": 15015 + }, + { + "epoch": 1.9101895433151, + "ewc_loss": 0.07032664120197296, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000351703871274367, + "grad_norm": 8.252470970153809, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8575420379638672, + "num_tokens": 572764929.0, + "step": 15016 + }, + { + "epoch": 1.9103167535936905, + "ewc_loss": 0.06986552476882935, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003470927767921239, + "grad_norm": 8.2109375, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8611149787902832, + "num_tokens": 572802545.0, + "step": 15017 + }, + { + "epoch": 1.910443963872281, + "ewc_loss": 0.07033039629459381, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035174141521565616, + "grad_norm": 8.227093696594238, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8577991724014282, + "num_tokens": 572850008.0, + "step": 15018 + }, + { + "epoch": 1.9105711741508715, + "ewc_loss": 0.07014493644237518, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034988689003512263, + "grad_norm": 8.309558868408203, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.870448112487793, + "num_tokens": 572888968.0, + "step": 15019 + }, + { + "epoch": 1.910698384429462, + "ewc_loss": 0.07002301514148712, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034866767236962914, + "grad_norm": 8.23890495300293, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.860944390296936, + "num_tokens": 572926314.0, + "step": 15020 + }, + { + "epoch": 1.9108255947080524, + "ewc_loss": 0.07024377584457397, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035087522701360285, + "grad_norm": 8.327134132385254, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8671645522117615, + "num_tokens": 572961789.0, + "step": 15021 + }, + { + "epoch": 1.9109528049866429, + "ewc_loss": 0.06985233724117279, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003469608200248331, + "grad_norm": 8.370322227478027, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8645788431167603, + "num_tokens": 572993680.0, + "step": 15022 + }, + { + "epoch": 1.9110800152652334, + "ewc_loss": 0.0700063705444336, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034850119845941663, + "grad_norm": 8.276508331298828, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8644312620162964, + "num_tokens": 573024003.0, + "step": 15023 + }, + { + "epoch": 1.911207225543824, + "ewc_loss": 0.0699281394481659, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034771885839290917, + "grad_norm": 8.243176460266113, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8699034452438354, + "num_tokens": 573055138.0, + "step": 15024 + }, + { + "epoch": 1.9113344358224145, + "ewc_loss": 0.07008807361125946, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003493182302918285, + "grad_norm": 8.222206115722656, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8666863441467285, + "num_tokens": 573092373.0, + "step": 15025 + }, + { + "epoch": 1.911461646101005, + "ewc_loss": 0.0700877457857132, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003493149415589869, + "grad_norm": 8.269530296325684, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8594737648963928, + "num_tokens": 573131486.0, + "step": 15026 + }, + { + "epoch": 1.9115888563795955, + "ewc_loss": 0.06997835636138916, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034822107409127057, + "grad_norm": 8.170487403869629, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8728955984115601, + "num_tokens": 573170114.0, + "step": 15027 + }, + { + "epoch": 1.9117160666581858, + "ewc_loss": 0.07026651501655579, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035110264434479177, + "grad_norm": 8.354570388793945, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8657921552658081, + "num_tokens": 573200906.0, + "step": 15028 + }, + { + "epoch": 1.9118432769367764, + "ewc_loss": 0.06979627907276154, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003464002802502364, + "grad_norm": 8.15109920501709, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.851169228553772, + "num_tokens": 573243731.0, + "step": 15029 + }, + { + "epoch": 1.9119704872153669, + "ewc_loss": 0.07043300569057465, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003527675289660692, + "grad_norm": 8.294354438781738, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8691943883895874, + "num_tokens": 573283335.0, + "step": 15030 + }, + { + "epoch": 1.9120976974939574, + "ewc_loss": 0.0697856917977333, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034629442961886525, + "grad_norm": 8.152445793151855, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8503855466842651, + "num_tokens": 573326628.0, + "step": 15031 + }, + { + "epoch": 1.912224907772548, + "ewc_loss": 0.07042752206325531, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003527126682456583, + "grad_norm": 8.282369613647461, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.868782103061676, + "num_tokens": 573369025.0, + "step": 15032 + }, + { + "epoch": 1.9123521180511385, + "ewc_loss": 0.06991954892873764, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003476330020930618, + "grad_norm": 8.156871795654297, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.855157732963562, + "num_tokens": 573411923.0, + "step": 15033 + }, + { + "epoch": 1.912479328329729, + "ewc_loss": 0.07029585540294647, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003513960400596261, + "grad_norm": 8.289505004882812, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.858046293258667, + "num_tokens": 573450455.0, + "step": 15034 + }, + { + "epoch": 1.9126065386083195, + "ewc_loss": 0.06997403502464294, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034817782579921186, + "grad_norm": 8.148591995239258, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.854621171951294, + "num_tokens": 573492886.0, + "step": 15035 + }, + { + "epoch": 1.91273374888691, + "ewc_loss": 0.07033964991569519, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003518340236041695, + "grad_norm": 8.29038143157959, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8689770698547363, + "num_tokens": 573532073.0, + "step": 15036 + }, + { + "epoch": 1.9128609591655006, + "ewc_loss": 0.06997521221637726, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034818967105820775, + "grad_norm": 8.116107940673828, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8796882629394531, + "num_tokens": 573573632.0, + "step": 15037 + }, + { + "epoch": 1.9129881694440911, + "ewc_loss": 0.07065516710281372, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000354989169863984, + "grad_norm": 8.293697357177734, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8727037906646729, + "num_tokens": 573610194.0, + "step": 15038 + }, + { + "epoch": 1.9131153797226816, + "ewc_loss": 0.07001923024654388, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000348629750078544, + "grad_norm": 8.196975708007812, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8541035056114197, + "num_tokens": 573650237.0, + "step": 15039 + }, + { + "epoch": 1.9132425900012722, + "ewc_loss": 0.07055569440126419, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035399445914663374, + "grad_norm": 8.332715034484863, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.866384744644165, + "num_tokens": 573686859.0, + "step": 15040 + }, + { + "epoch": 1.9133698002798627, + "ewc_loss": 0.0699409693479538, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034784720628522336, + "grad_norm": 8.169034004211426, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8628079295158386, + "num_tokens": 573726770.0, + "step": 15041 + }, + { + "epoch": 1.9134970105584532, + "ewc_loss": 0.0705074593424797, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035351209226064384, + "grad_norm": 8.335549354553223, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8571867346763611, + "num_tokens": 573768668.0, + "step": 15042 + }, + { + "epoch": 1.9136242208370438, + "ewc_loss": 0.06975395977497101, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034841851447708905, + "grad_norm": 8.23182487487793, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8540956974029541, + "num_tokens": 573807880.0, + "step": 15043 + }, + { + "epoch": 1.9137514311156343, + "ewc_loss": 0.07010103017091751, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035188920446671546, + "grad_norm": 8.26541519165039, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8603895902633667, + "num_tokens": 573847018.0, + "step": 15044 + }, + { + "epoch": 1.9138786413942248, + "ewc_loss": 0.07011960446834564, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003496335120871663, + "grad_norm": 8.283021926879883, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.878975510597229, + "num_tokens": 573881193.0, + "step": 15045 + }, + { + "epoch": 1.9140058516728151, + "ewc_loss": 0.07012651115655899, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003497026045806706, + "grad_norm": 8.224029541015625, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8714902400970459, + "num_tokens": 573918457.0, + "step": 15046 + }, + { + "epoch": 1.9141330619514056, + "ewc_loss": 0.07017605006694794, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003501980390865356, + "grad_norm": 8.211241722106934, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8833281993865967, + "num_tokens": 573958818.0, + "step": 15047 + }, + { + "epoch": 1.9142602722299962, + "ewc_loss": 0.07018539309501648, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003502914623823017, + "grad_norm": 8.284585952758789, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8582993149757385, + "num_tokens": 573996735.0, + "step": 15048 + }, + { + "epoch": 1.9143874825085867, + "ewc_loss": 0.07018139958381653, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035025147371925414, + "grad_norm": 8.253843307495117, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.867431640625, + "num_tokens": 574035005.0, + "step": 15049 + }, + { + "epoch": 1.9145146927871772, + "ewc_loss": 0.07015124708414078, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003499499580357224, + "grad_norm": 8.218259811401367, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8619754314422607, + "num_tokens": 574077817.0, + "step": 15050 + }, + { + "epoch": 1.9146419030657678, + "ewc_loss": 0.07020388543605804, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003504763008095324, + "grad_norm": 8.305739402770996, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8704543113708496, + "num_tokens": 574113083.0, + "step": 15051 + }, + { + "epoch": 1.914769113344358, + "ewc_loss": 0.07007856667041779, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003492231189738959, + "grad_norm": 8.2050199508667, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8671125173568726, + "num_tokens": 574147324.0, + "step": 15052 + }, + { + "epoch": 1.9148963236229486, + "ewc_loss": 0.07041777670383453, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035261522862128913, + "grad_norm": 8.312063217163086, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8783926367759705, + "num_tokens": 574186108.0, + "step": 15053 + }, + { + "epoch": 1.9150235339015391, + "ewc_loss": 0.06985972821712494, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003470348019618541, + "grad_norm": 8.198579788208008, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8747304677963257, + "num_tokens": 574218076.0, + "step": 15054 + }, + { + "epoch": 1.9151507441801296, + "ewc_loss": 0.07036030292510986, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035204051528126, + "grad_norm": 8.276782035827637, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8738954067230225, + "num_tokens": 574256416.0, + "step": 15055 + }, + { + "epoch": 1.9152779544587202, + "ewc_loss": 0.06993818283081055, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034781938302330673, + "grad_norm": 8.16814136505127, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8764961957931519, + "num_tokens": 574295262.0, + "step": 15056 + }, + { + "epoch": 1.9154051647373107, + "ewc_loss": 0.07012587785720825, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035213763476349413, + "grad_norm": 8.328985214233398, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8528790473937988, + "num_tokens": 574329836.0, + "step": 15057 + }, + { + "epoch": 1.9155323750159012, + "ewc_loss": 0.06988289952278137, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003472665266599506, + "grad_norm": 8.17039966583252, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8653817176818848, + "num_tokens": 574363189.0, + "step": 15058 + }, + { + "epoch": 1.9156595852944918, + "ewc_loss": 0.07036316394805908, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003520690952427685, + "grad_norm": 8.322672843933105, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8722913265228271, + "num_tokens": 574396374.0, + "step": 15059 + }, + { + "epoch": 1.9157867955730823, + "ewc_loss": 0.06954121589660645, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003462911117821932, + "grad_norm": 8.159828186035156, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8653932809829712, + "num_tokens": 574435488.0, + "step": 15060 + }, + { + "epoch": 1.9159140058516728, + "ewc_loss": 0.0700942799448967, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003518217126838863, + "grad_norm": 8.273146629333496, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8626316785812378, + "num_tokens": 574475261.0, + "step": 15061 + }, + { + "epoch": 1.9160412161302633, + "ewc_loss": 0.07002955675125122, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003487330977804959, + "grad_norm": 8.16139030456543, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8837773203849792, + "num_tokens": 574513531.0, + "step": 15062 + }, + { + "epoch": 1.9161684264088539, + "ewc_loss": 0.07035696506500244, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035200713318772614, + "grad_norm": 8.282748222351074, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.850011944770813, + "num_tokens": 574551736.0, + "step": 15063 + }, + { + "epoch": 1.9162956366874444, + "ewc_loss": 0.070015087723732, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000348588393535465, + "grad_norm": 8.182902336120605, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8566190004348755, + "num_tokens": 574589978.0, + "step": 15064 + }, + { + "epoch": 1.916422846966035, + "ewc_loss": 0.0704900398850441, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003533379058353603, + "grad_norm": 8.309619903564453, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8433976173400879, + "num_tokens": 574628845.0, + "step": 15065 + }, + { + "epoch": 1.9165500572446255, + "ewc_loss": 0.0697007030248642, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00034788588527590036, + "grad_norm": 8.27189826965332, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8597935438156128, + "num_tokens": 574660700.0, + "step": 15066 + }, + { + "epoch": 1.916677267523216, + "ewc_loss": 0.07021315395832062, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035056908382102847, + "grad_norm": 8.22407341003418, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8553780913352966, + "num_tokens": 574692195.0, + "step": 15067 + }, + { + "epoch": 1.9168044778018065, + "ewc_loss": 0.07025963813066483, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003510338719934225, + "grad_norm": 8.256670951843262, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8524385690689087, + "num_tokens": 574732462.0, + "step": 15068 + }, + { + "epoch": 1.916931688080397, + "ewc_loss": 0.06994302570819855, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034786781179718673, + "grad_norm": 8.175339698791504, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8700271844863892, + "num_tokens": 574769353.0, + "step": 15069 + }, + { + "epoch": 1.9170588983589874, + "ewc_loss": 0.07019248604774475, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035036238841712475, + "grad_norm": 8.270048141479492, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8629506826400757, + "num_tokens": 574804958.0, + "step": 15070 + }, + { + "epoch": 1.9171861086375779, + "ewc_loss": 0.07000124454498291, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034844991751015186, + "grad_norm": 8.195837020874023, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8628829717636108, + "num_tokens": 574843966.0, + "step": 15071 + }, + { + "epoch": 1.9173133189161684, + "ewc_loss": 0.07028662413358688, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003513037518132478, + "grad_norm": 8.293021202087402, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.868327796459198, + "num_tokens": 574882916.0, + "step": 15072 + }, + { + "epoch": 1.917440529194759, + "ewc_loss": 0.06992614269256592, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003476989222690463, + "grad_norm": 8.131426811218262, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8715154528617859, + "num_tokens": 574922154.0, + "step": 15073 + }, + { + "epoch": 1.9175677394733495, + "ewc_loss": 0.07049401849508286, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035337769077159464, + "grad_norm": 8.3361177444458, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8596338033676147, + "num_tokens": 574956894.0, + "step": 15074 + }, + { + "epoch": 1.91769494975194, + "ewc_loss": 0.06988698989152908, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003473074175417423, + "grad_norm": 8.11388874053955, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8772029280662537, + "num_tokens": 574998343.0, + "step": 15075 + }, + { + "epoch": 1.9178221600305305, + "ewc_loss": 0.07059915363788605, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035442900843918324, + "grad_norm": 8.269043922424316, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8714550733566284, + "num_tokens": 575036689.0, + "step": 15076 + }, + { + "epoch": 1.9179493703091208, + "ewc_loss": 0.07006387412548065, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003490762028377503, + "grad_norm": 8.136670112609863, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8577311038970947, + "num_tokens": 575075510.0, + "step": 15077 + }, + { + "epoch": 1.9180765805877114, + "ewc_loss": 0.070506751537323, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035350496182218194, + "grad_norm": 8.229283332824707, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8594406843185425, + "num_tokens": 575115797.0, + "step": 15078 + }, + { + "epoch": 1.9182037908663019, + "ewc_loss": 0.07028673589229584, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003513048868626356, + "grad_norm": 8.180196762084961, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8494529724121094, + "num_tokens": 575155240.0, + "step": 15079 + }, + { + "epoch": 1.9183310011448924, + "ewc_loss": 0.070424884557724, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035268638748675585, + "grad_norm": 8.259906768798828, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.851593554019928, + "num_tokens": 575191048.0, + "step": 15080 + }, + { + "epoch": 1.918458211423483, + "ewc_loss": 0.07036250829696655, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035206254688091576, + "grad_norm": 8.200897216796875, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8522244095802307, + "num_tokens": 575227778.0, + "step": 15081 + }, + { + "epoch": 1.9185854217020735, + "ewc_loss": 0.07043509185314178, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035278842551633716, + "grad_norm": 8.21915054321289, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8641622066497803, + "num_tokens": 575269774.0, + "step": 15082 + }, + { + "epoch": 1.918712631980664, + "ewc_loss": 0.07043991982936859, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035283673787489533, + "grad_norm": 8.209527969360352, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8520896434783936, + "num_tokens": 575310710.0, + "step": 15083 + }, + { + "epoch": 1.9188398422592545, + "ewc_loss": 0.07047364115715027, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035317393485456705, + "grad_norm": 8.252995491027832, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.871218204498291, + "num_tokens": 575344073.0, + "step": 15084 + }, + { + "epoch": 1.918967052537845, + "ewc_loss": 0.07034385949373245, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035187607863917947, + "grad_norm": 8.276253700256348, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8535007834434509, + "num_tokens": 575386737.0, + "step": 15085 + }, + { + "epoch": 1.9190942628164356, + "ewc_loss": 0.07032030820846558, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035164059954695404, + "grad_norm": 8.19292163848877, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8673609495162964, + "num_tokens": 575424287.0, + "step": 15086 + }, + { + "epoch": 1.919221473095026, + "ewc_loss": 0.07055868208408356, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003540243487805128, + "grad_norm": 8.346451759338379, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8640674352645874, + "num_tokens": 575463859.0, + "step": 15087 + }, + { + "epoch": 1.9193486833736166, + "ewc_loss": 0.0702158659696579, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035059620859101415, + "grad_norm": 8.22148609161377, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8579281568527222, + "num_tokens": 575500838.0, + "step": 15088 + }, + { + "epoch": 1.9194758936522072, + "ewc_loss": 0.07041245698928833, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003525620559230447, + "grad_norm": 8.297025680541992, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8563835620880127, + "num_tokens": 575535164.0, + "step": 15089 + }, + { + "epoch": 1.9196031039307977, + "ewc_loss": 0.07012175023555756, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034965501981787384, + "grad_norm": 8.225011825561523, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.869081974029541, + "num_tokens": 575569493.0, + "step": 15090 + }, + { + "epoch": 1.9197303142093882, + "ewc_loss": 0.07048667222261429, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003533042035996914, + "grad_norm": 8.315046310424805, + "learning_rate": 1e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8461148142814636, + "num_tokens": 575603396.0, + "step": 15091 + }, + { + "epoch": 1.9198575244879788, + "ewc_loss": 0.07001544535160065, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034859200241044164, + "grad_norm": 8.177849769592285, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8694472312927246, + "num_tokens": 575638255.0, + "step": 15092 + }, + { + "epoch": 1.9199847347665693, + "ewc_loss": 0.0704057514667511, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003524950298015028, + "grad_norm": 8.207954406738281, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8535778522491455, + "num_tokens": 575685281.0, + "step": 15093 + }, + { + "epoch": 1.9201119450451598, + "ewc_loss": 0.07015660405158997, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003500035381875932, + "grad_norm": 8.226646423339844, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8694536685943604, + "num_tokens": 575725030.0, + "step": 15094 + }, + { + "epoch": 1.9202391553237501, + "ewc_loss": 0.07036831974983215, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035212072543799877, + "grad_norm": 8.251174926757812, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8571999669075012, + "num_tokens": 575764237.0, + "step": 15095 + }, + { + "epoch": 1.9203663656023406, + "ewc_loss": 0.0702296793460846, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035073430626653135, + "grad_norm": 8.212154388427734, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8612978458404541, + "num_tokens": 575796732.0, + "step": 15096 + }, + { + "epoch": 1.9204935758809312, + "ewc_loss": 0.07033978402614594, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035183533327654004, + "grad_norm": 8.3314208984375, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8676343560218811, + "num_tokens": 575833133.0, + "step": 15097 + }, + { + "epoch": 1.9206207861595217, + "ewc_loss": 0.07003524899482727, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034879002487286925, + "grad_norm": 8.179771423339844, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.858100414276123, + "num_tokens": 575870920.0, + "step": 15098 + }, + { + "epoch": 1.9207479964381122, + "ewc_loss": 0.07043375819921494, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035277509596198797, + "grad_norm": 8.292137145996094, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8658824563026428, + "num_tokens": 575909132.0, + "step": 15099 + }, + { + "epoch": 1.9208752067167028, + "ewc_loss": 0.0700615867972374, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003490533563308418, + "grad_norm": 8.271612167358398, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.863560676574707, + "num_tokens": 575947979.0, + "step": 15100 + }, + { + "epoch": 1.921002416995293, + "ewc_loss": 0.07022108137607574, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003506483626551926, + "grad_norm": 8.210121154785156, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8517072796821594, + "num_tokens": 575988888.0, + "step": 15101 + }, + { + "epoch": 1.9211296272738836, + "ewc_loss": 0.07029829174280167, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035142042906954885, + "grad_norm": 8.2604341506958, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8649852871894836, + "num_tokens": 576028614.0, + "step": 15102 + }, + { + "epoch": 1.9212568375524741, + "ewc_loss": 0.07010841369628906, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003495216369628906, + "grad_norm": 8.225313186645508, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8910272717475891, + "num_tokens": 576066648.0, + "step": 15103 + }, + { + "epoch": 1.9213840478310646, + "ewc_loss": 0.0703873559832573, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003523110644891858, + "grad_norm": 8.186418533325195, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8592081069946289, + "num_tokens": 576104045.0, + "step": 15104 + }, + { + "epoch": 1.9215112581096552, + "ewc_loss": 0.07023251056671143, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003507626533973962, + "grad_norm": 8.23950481414795, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.857086181640625, + "num_tokens": 576145419.0, + "step": 15105 + }, + { + "epoch": 1.9216384683882457, + "ewc_loss": 0.07029533386230469, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003513908595778048, + "grad_norm": 8.213895797729492, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.859921932220459, + "num_tokens": 576191410.0, + "step": 15106 + }, + { + "epoch": 1.9217656786668362, + "ewc_loss": 0.07035373151302338, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003519747988320887, + "grad_norm": 8.258801460266113, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8586980104446411, + "num_tokens": 576226383.0, + "step": 15107 + }, + { + "epoch": 1.9218928889454268, + "ewc_loss": 0.07037965953350067, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003522340557537973, + "grad_norm": 8.290413856506348, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.855402946472168, + "num_tokens": 576259958.0, + "step": 15108 + }, + { + "epoch": 1.9220200992240173, + "ewc_loss": 0.07025882601737976, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035102578112855554, + "grad_norm": 8.185005187988281, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8660612106323242, + "num_tokens": 576299467.0, + "step": 15109 + }, + { + "epoch": 1.9221473095026078, + "ewc_loss": 0.07059887051582336, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035442618536762893, + "grad_norm": 8.333868980407715, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8579505681991577, + "num_tokens": 576334137.0, + "step": 15110 + }, + { + "epoch": 1.9222745197811983, + "ewc_loss": 0.07014942169189453, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003499317681416869, + "grad_norm": 8.161487579345703, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8593558669090271, + "num_tokens": 576379582.0, + "step": 15111 + }, + { + "epoch": 1.9224017300597889, + "ewc_loss": 0.07080256193876266, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035646313335746527, + "grad_norm": 8.332261085510254, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8781507015228271, + "num_tokens": 576414590.0, + "step": 15112 + }, + { + "epoch": 1.9225289403383794, + "ewc_loss": 0.07004953920841217, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003489328664727509, + "grad_norm": 8.18486213684082, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8532938957214355, + "num_tokens": 576445023.0, + "step": 15113 + }, + { + "epoch": 1.92265615061697, + "ewc_loss": 0.070767343044281, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035611094790510833, + "grad_norm": 8.319023132324219, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8585221767425537, + "num_tokens": 576477861.0, + "step": 15114 + }, + { + "epoch": 1.9227833608955605, + "ewc_loss": 0.0703047439455986, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003514849522616714, + "grad_norm": 8.189701080322266, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8599395751953125, + "num_tokens": 576513121.0, + "step": 15115 + }, + { + "epoch": 1.922910571174151, + "ewc_loss": 0.07070530205965042, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003554905124474317, + "grad_norm": 8.294181823730469, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8688311576843262, + "num_tokens": 576551039.0, + "step": 15116 + }, + { + "epoch": 1.9230377814527415, + "ewc_loss": 0.07029280066490173, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035136548103764653, + "grad_norm": 8.216633796691895, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.85594642162323, + "num_tokens": 576590651.0, + "step": 15117 + }, + { + "epoch": 1.923164991731332, + "ewc_loss": 0.070633664727211, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003547741216607392, + "grad_norm": 8.342734336853027, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8542366027832031, + "num_tokens": 576629119.0, + "step": 15118 + }, + { + "epoch": 1.9232922020099223, + "ewc_loss": 0.07029108703136444, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035134839708916843, + "grad_norm": 8.212127685546875, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8721749186515808, + "num_tokens": 576664240.0, + "step": 15119 + }, + { + "epoch": 1.9234194122885129, + "ewc_loss": 0.07085970789194107, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003545931540429592, + "grad_norm": 8.3104248046875, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.857136607170105, + "num_tokens": 576701946.0, + "step": 15120 + }, + { + "epoch": 1.9235466225671034, + "ewc_loss": 0.07027897983789444, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035122729605063796, + "grad_norm": 8.244368553161621, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8740054368972778, + "num_tokens": 576733541.0, + "step": 15121 + }, + { + "epoch": 1.923673832845694, + "ewc_loss": 0.07054287195205688, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003538661985658109, + "grad_norm": 8.272774696350098, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8478778600692749, + "num_tokens": 576770930.0, + "step": 15122 + }, + { + "epoch": 1.9238010431242845, + "ewc_loss": 0.0703723356127739, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003521608596201986, + "grad_norm": 8.20335865020752, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8634739518165588, + "num_tokens": 576807455.0, + "step": 15123 + }, + { + "epoch": 1.923928253402875, + "ewc_loss": 0.07075127959251404, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00035350886173546314, + "grad_norm": 15.54147720336914, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8706734776496887, + "num_tokens": 576849577.0, + "step": 15124 + }, + { + "epoch": 1.9240554636814655, + "ewc_loss": 0.08246603608131409, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004730978107545525, + "grad_norm": 9.588905334472656, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8561415672302246, + "num_tokens": 576883506.0, + "step": 15125 + }, + { + "epoch": 1.9241826739600558, + "ewc_loss": 0.0694921687245369, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034335919190198183, + "grad_norm": 8.030060768127441, + "learning_rate": 1e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8464317321777344, + "num_tokens": 576923892.0, + "step": 15126 + }, + { + "epoch": 1.9243098842386464, + "ewc_loss": 0.07273348420858383, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003757723607122898, + "grad_norm": 8.720771789550781, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8799098134040833, + "num_tokens": 576970832.0, + "step": 15127 + }, + { + "epoch": 1.9244370945172369, + "ewc_loss": 0.07114699482917786, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035990748438052833, + "grad_norm": 8.200724601745605, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8446903824806213, + "num_tokens": 577006990.0, + "step": 15128 + }, + { + "epoch": 1.9245643047958274, + "ewc_loss": 0.07200352847576141, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000368472799891606, + "grad_norm": 8.609539985656738, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8681972622871399, + "num_tokens": 577044932.0, + "step": 15129 + }, + { + "epoch": 1.924691515074418, + "ewc_loss": 0.07056871056556702, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035412461147643626, + "grad_norm": 8.139115333557129, + "learning_rate": 1e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8456109762191772, + "num_tokens": 577076217.0, + "step": 15130 + }, + { + "epoch": 1.9248187253530085, + "ewc_loss": 0.07189641892910004, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036740166251547635, + "grad_norm": 8.513879776000977, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8791271448135376, + "num_tokens": 577112696.0, + "step": 15131 + }, + { + "epoch": 1.924945935631599, + "ewc_loss": 0.0708252489566803, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035669002681970596, + "grad_norm": 8.248095512390137, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8628709316253662, + "num_tokens": 577156397.0, + "step": 15132 + }, + { + "epoch": 1.9250731459101895, + "ewc_loss": 0.0713108628988266, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036154614645056427, + "grad_norm": 8.38711929321289, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8670405745506287, + "num_tokens": 577194192.0, + "step": 15133 + }, + { + "epoch": 1.92520035618878, + "ewc_loss": 0.07079338282346725, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035637133987620473, + "grad_norm": 8.227038383483887, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.857620358467102, + "num_tokens": 577232639.0, + "step": 15134 + }, + { + "epoch": 1.9253275664673706, + "ewc_loss": 0.07103362679481506, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003587738028727472, + "grad_norm": 8.353748321533203, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8712900876998901, + "num_tokens": 577267349.0, + "step": 15135 + }, + { + "epoch": 1.925454776745961, + "ewc_loss": 0.07046874612569809, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035312495310790837, + "grad_norm": 8.142157554626465, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8585548400878906, + "num_tokens": 577308329.0, + "step": 15136 + }, + { + "epoch": 1.9255819870245516, + "ewc_loss": 0.07121779024600983, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003606154350563884, + "grad_norm": 8.359375953674316, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8644912838935852, + "num_tokens": 577347825.0, + "step": 15137 + }, + { + "epoch": 1.9257091973031422, + "ewc_loss": 0.07031357288360596, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003515732241794467, + "grad_norm": 8.177348136901855, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8637489080429077, + "num_tokens": 577379981.0, + "step": 15138 + }, + { + "epoch": 1.9258364075817327, + "ewc_loss": 0.07107242941856384, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035916175693273544, + "grad_norm": 8.314818382263184, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8668580055236816, + "num_tokens": 577414134.0, + "step": 15139 + }, + { + "epoch": 1.9259636178603232, + "ewc_loss": 0.07040071487426758, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003524446801748127, + "grad_norm": 8.190083503723145, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8610923290252686, + "num_tokens": 577451170.0, + "step": 15140 + }, + { + "epoch": 1.9260908281389137, + "ewc_loss": 0.07076162844896317, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003560537879820913, + "grad_norm": 8.291794776916504, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8476718664169312, + "num_tokens": 577487447.0, + "step": 15141 + }, + { + "epoch": 1.9262180384175043, + "ewc_loss": 0.0704381987452507, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003528194793034345, + "grad_norm": 8.167774200439453, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8676425218582153, + "num_tokens": 577525732.0, + "step": 15142 + }, + { + "epoch": 1.9263452486960948, + "ewc_loss": 0.07071603089570999, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035559781827032566, + "grad_norm": 8.232473373413086, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.861962080001831, + "num_tokens": 577563303.0, + "step": 15143 + }, + { + "epoch": 1.926472458974685, + "ewc_loss": 0.07049354910850525, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003533729468472302, + "grad_norm": 8.175843238830566, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8769861459732056, + "num_tokens": 577607592.0, + "step": 15144 + }, + { + "epoch": 1.9265996692532756, + "ewc_loss": 0.0706990659236908, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003554281429387629, + "grad_norm": 8.23574161529541, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8538722991943359, + "num_tokens": 577651646.0, + "step": 15145 + }, + { + "epoch": 1.9267268795318662, + "ewc_loss": 0.07049749791622162, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003534124989528209, + "grad_norm": 8.160402297973633, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.871871829032898, + "num_tokens": 577689530.0, + "step": 15146 + }, + { + "epoch": 1.9268540898104567, + "ewc_loss": 0.07068520039319992, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003552895213942975, + "grad_norm": 8.239691734313965, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8565810918807983, + "num_tokens": 577732524.0, + "step": 15147 + }, + { + "epoch": 1.9269813000890472, + "ewc_loss": 0.07052892446517944, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003537267621140927, + "grad_norm": 8.216002464294434, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8668445348739624, + "num_tokens": 577767164.0, + "step": 15148 + }, + { + "epoch": 1.9271085103676378, + "ewc_loss": 0.0707576647400856, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035601414856500924, + "grad_norm": 8.247004508972168, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8698573112487793, + "num_tokens": 577802347.0, + "step": 15149 + }, + { + "epoch": 1.927235720646228, + "ewc_loss": 0.07060081511735916, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003544456558302045, + "grad_norm": 8.220559120178223, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8548411130905151, + "num_tokens": 577842899.0, + "step": 15150 + }, + { + "epoch": 1.9273629309248186, + "ewc_loss": 0.07067619264125824, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003551994450390339, + "grad_norm": 8.243292808532715, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8629378080368042, + "num_tokens": 577885137.0, + "step": 15151 + }, + { + "epoch": 1.9274901412034091, + "ewc_loss": 0.07063296437263489, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003547671949490905, + "grad_norm": 8.177084922790527, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8762203454971313, + "num_tokens": 577925727.0, + "step": 15152 + }, + { + "epoch": 1.9276173514819996, + "ewc_loss": 0.07087311148643494, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003571686684153974, + "grad_norm": 8.258499145507812, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8673025965690613, + "num_tokens": 577965399.0, + "step": 15153 + }, + { + "epoch": 1.9277445617605902, + "ewc_loss": 0.07054516673088074, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035388919059187174, + "grad_norm": 8.225831031799316, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.862084686756134, + "num_tokens": 578007608.0, + "step": 15154 + }, + { + "epoch": 1.9278717720391807, + "ewc_loss": 0.0707092434167862, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000355529977241531, + "grad_norm": 8.256237983703613, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8731184005737305, + "num_tokens": 578043425.0, + "step": 15155 + }, + { + "epoch": 1.9279989823177712, + "ewc_loss": 0.07072409987449646, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003556785231921822, + "grad_norm": 8.268373489379883, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8593763709068298, + "num_tokens": 578075838.0, + "step": 15156 + }, + { + "epoch": 1.9281261925963618, + "ewc_loss": 0.07052304595708847, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003536679723765701, + "grad_norm": 8.2249755859375, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8576167821884155, + "num_tokens": 578110113.0, + "step": 15157 + }, + { + "epoch": 1.9282534028749523, + "ewc_loss": 0.07057320326566696, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000354169518686831, + "grad_norm": 8.227791786193848, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.863220751285553, + "num_tokens": 578145743.0, + "step": 15158 + }, + { + "epoch": 1.9283806131535428, + "ewc_loss": 0.07038941979408264, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035477313213050365, + "grad_norm": 8.27911376953125, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8703623414039612, + "num_tokens": 578181588.0, + "step": 15159 + }, + { + "epoch": 1.9285078234321333, + "ewc_loss": 0.07053259015083313, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003537633747328073, + "grad_norm": 8.25440502166748, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8610361814498901, + "num_tokens": 578222048.0, + "step": 15160 + }, + { + "epoch": 1.9286350337107239, + "ewc_loss": 0.0706409364938736, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000354846881236881, + "grad_norm": 8.225198745727539, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8658945560455322, + "num_tokens": 578262664.0, + "step": 15161 + }, + { + "epoch": 1.9287622439893144, + "ewc_loss": 0.07063142955303192, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003547518281266093, + "grad_norm": 8.324127197265625, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8524681925773621, + "num_tokens": 578303431.0, + "step": 15162 + }, + { + "epoch": 1.928889454267905, + "ewc_loss": 0.0702647864818573, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003510854148771614, + "grad_norm": 8.194598197937012, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8529835939407349, + "num_tokens": 578338978.0, + "step": 15163 + }, + { + "epoch": 1.9290166645464955, + "ewc_loss": 0.07076849043369293, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035612238571047783, + "grad_norm": 8.319816589355469, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.858944833278656, + "num_tokens": 578376340.0, + "step": 15164 + }, + { + "epoch": 1.929143874825086, + "ewc_loss": 0.07029590010643005, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035139647661708295, + "grad_norm": 8.264294624328613, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8647657632827759, + "num_tokens": 578414714.0, + "step": 15165 + }, + { + "epoch": 1.9292710851036765, + "ewc_loss": 0.07071691751480103, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035560663673095405, + "grad_norm": 8.28065299987793, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8492063283920288, + "num_tokens": 578453668.0, + "step": 15166 + }, + { + "epoch": 1.929398295382267, + "ewc_loss": 0.07045198976993561, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003529573732521385, + "grad_norm": 8.405036926269531, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8554965853691101, + "num_tokens": 578494148.0, + "step": 15167 + }, + { + "epoch": 1.9295255056608573, + "ewc_loss": 0.07027800381183624, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035121754626743495, + "grad_norm": 8.292815208435059, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8424677848815918, + "num_tokens": 578530743.0, + "step": 15168 + }, + { + "epoch": 1.9296527159394479, + "ewc_loss": 0.07051680982112885, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035360557376407087, + "grad_norm": 8.256998062133789, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8846024870872498, + "num_tokens": 578567613.0, + "step": 15169 + }, + { + "epoch": 1.9297799262180384, + "ewc_loss": 0.07038463652133942, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003522838815115392, + "grad_norm": 8.361891746520996, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8612027168273926, + "num_tokens": 578602981.0, + "step": 15170 + }, + { + "epoch": 1.929907136496629, + "ewc_loss": 0.07027043402194977, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035114187630824745, + "grad_norm": 8.273186683654785, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8562085628509521, + "num_tokens": 578641027.0, + "step": 15171 + }, + { + "epoch": 1.9300343467752195, + "ewc_loss": 0.07037073373794556, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035214488161727786, + "grad_norm": 8.23350715637207, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8764435648918152, + "num_tokens": 578683006.0, + "step": 15172 + }, + { + "epoch": 1.93016155705381, + "ewc_loss": 0.07037296146154404, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003521671169437468, + "grad_norm": 8.373841285705566, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8659927248954773, + "num_tokens": 578716722.0, + "step": 15173 + }, + { + "epoch": 1.9302887673324005, + "ewc_loss": 0.07015220075845718, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003499595040921122, + "grad_norm": 8.242060661315918, + "learning_rate": 1e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8435165882110596, + "num_tokens": 578750740.0, + "step": 15174 + }, + { + "epoch": 1.9304159776109908, + "ewc_loss": 0.07047423720359802, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003531798720359802, + "grad_norm": 8.2705078125, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.864547610282898, + "num_tokens": 578791358.0, + "step": 15175 + }, + { + "epoch": 1.9305431878895813, + "ewc_loss": 0.07021570205688477, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003505945496726781, + "grad_norm": 8.215806007385254, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8463163375854492, + "num_tokens": 578834514.0, + "step": 15176 + }, + { + "epoch": 1.9306703981681719, + "ewc_loss": 0.07053268700838089, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003537643642630428, + "grad_norm": 8.344613075256348, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8552666306495667, + "num_tokens": 578875869.0, + "step": 15177 + }, + { + "epoch": 1.9307976084467624, + "ewc_loss": 0.07007750868797302, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034921258338727057, + "grad_norm": 8.12171459197998, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8664100170135498, + "num_tokens": 578921079.0, + "step": 15178 + }, + { + "epoch": 1.930924818725353, + "ewc_loss": 0.07083001732826233, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035673766979016364, + "grad_norm": 8.41865062713623, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8609645366668701, + "num_tokens": 578959837.0, + "step": 15179 + }, + { + "epoch": 1.9310520290039435, + "ewc_loss": 0.06998923420906067, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003483298060018569, + "grad_norm": 8.408721923828125, + "learning_rate": 1e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8340113162994385, + "num_tokens": 579001629.0, + "step": 15180 + }, + { + "epoch": 1.931179239282534, + "ewc_loss": 0.0702381357550621, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035081885289400816, + "grad_norm": 8.170402526855469, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.885446310043335, + "num_tokens": 579036762.0, + "step": 15181 + }, + { + "epoch": 1.9313064495611245, + "ewc_loss": 0.07047049701213837, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003531425027176738, + "grad_norm": 8.260113716125488, + "learning_rate": 1e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8411409854888916, + "num_tokens": 579078455.0, + "step": 15182 + }, + { + "epoch": 1.931433659839715, + "ewc_loss": 0.07007408142089844, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003491782699711621, + "grad_norm": 8.234705924987793, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8663445711135864, + "num_tokens": 579116942.0, + "step": 15183 + }, + { + "epoch": 1.9315608701183056, + "ewc_loss": 0.07046989351511002, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003531364200171083, + "grad_norm": 8.1826753616333, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8763908743858337, + "num_tokens": 579155180.0, + "step": 15184 + }, + { + "epoch": 1.931688080396896, + "ewc_loss": 0.07032796740531921, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035171714262105525, + "grad_norm": 8.25694751739502, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8555722236633301, + "num_tokens": 579194336.0, + "step": 15185 + }, + { + "epoch": 1.9318152906754866, + "ewc_loss": 0.07035785913467407, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035201606806367636, + "grad_norm": 8.265815734863281, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8576074838638306, + "num_tokens": 579226609.0, + "step": 15186 + }, + { + "epoch": 1.9319425009540772, + "ewc_loss": 0.07037615776062012, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035219904384575784, + "grad_norm": 8.251660346984863, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8666203022003174, + "num_tokens": 579259288.0, + "step": 15187 + }, + { + "epoch": 1.9320697112326677, + "ewc_loss": 0.07031703740358353, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035160788684152067, + "grad_norm": 8.179685592651367, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8711490631103516, + "num_tokens": 579294797.0, + "step": 15188 + }, + { + "epoch": 1.9321969215112582, + "ewc_loss": 0.07055562734603882, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003539937606547028, + "grad_norm": 8.277956008911133, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.872602105140686, + "num_tokens": 579334352.0, + "step": 15189 + }, + { + "epoch": 1.9323241317898487, + "ewc_loss": 0.07030953466892242, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035153282806277275, + "grad_norm": 8.18030834197998, + "learning_rate": 1e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8514655232429504, + "num_tokens": 579379618.0, + "step": 15190 + }, + { + "epoch": 1.9324513420684393, + "ewc_loss": 0.07060331851243973, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003544706851243973, + "grad_norm": 8.29592227935791, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.861034631729126, + "num_tokens": 579420230.0, + "step": 15191 + }, + { + "epoch": 1.9325785523470298, + "ewc_loss": 0.07030156254768372, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003514530835673213, + "grad_norm": 8.271671295166016, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8624415993690491, + "num_tokens": 579458236.0, + "step": 15192 + }, + { + "epoch": 1.93270576262562, + "ewc_loss": 0.07055029273033142, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035394044243730605, + "grad_norm": 8.277685165405273, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8612383604049683, + "num_tokens": 579495561.0, + "step": 15193 + }, + { + "epoch": 1.9328329729042106, + "ewc_loss": 0.07035797834396362, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003520172322168946, + "grad_norm": 8.246576309204102, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.860015869140625, + "num_tokens": 579530225.0, + "step": 15194 + }, + { + "epoch": 1.9329601831828012, + "ewc_loss": 0.07046757638454437, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035311325336806476, + "grad_norm": 8.253046989440918, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8539279103279114, + "num_tokens": 579570143.0, + "step": 15195 + }, + { + "epoch": 1.9330873934613917, + "ewc_loss": 0.07040373980998993, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003524749481584877, + "grad_norm": 8.202446937561035, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8765151500701904, + "num_tokens": 579607105.0, + "step": 15196 + }, + { + "epoch": 1.9332146037399822, + "ewc_loss": 0.07043163478374481, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000352753879269585, + "grad_norm": 8.287915229797363, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8789790868759155, + "num_tokens": 579640018.0, + "step": 15197 + }, + { + "epoch": 1.9333418140185727, + "ewc_loss": 0.07016511261463165, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035008866689167917, + "grad_norm": 8.184931755065918, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8578973412513733, + "num_tokens": 579681250.0, + "step": 15198 + }, + { + "epoch": 1.933469024297163, + "ewc_loss": 0.0705399215221405, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003538367454893887, + "grad_norm": 8.257486343383789, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8527130484580994, + "num_tokens": 579718535.0, + "step": 15199 + }, + { + "epoch": 1.9335962345757536, + "ewc_loss": 0.0701950415968895, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003503879124764353, + "grad_norm": 8.214656829833984, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8499754071235657, + "num_tokens": 579756959.0, + "step": 15200 + }, + { + "epoch": 1.933723444854344, + "ewc_loss": 0.07043814659118652, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003528190136421472, + "grad_norm": 8.244152069091797, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.854906439781189, + "num_tokens": 579798266.0, + "step": 15201 + }, + { + "epoch": 1.9338506551329346, + "ewc_loss": 0.07028135657310486, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035125104477629066, + "grad_norm": 8.243185043334961, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8819359540939331, + "num_tokens": 579831869.0, + "step": 15202 + }, + { + "epoch": 1.9339778654115252, + "ewc_loss": 0.07039329409599304, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003523704071994871, + "grad_norm": 8.281855583190918, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8640511631965637, + "num_tokens": 579874168.0, + "step": 15203 + }, + { + "epoch": 1.9341050756901157, + "ewc_loss": 0.07025472819805145, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000350984773831442, + "grad_norm": 8.161518096923828, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8678096532821655, + "num_tokens": 579916313.0, + "step": 15204 + }, + { + "epoch": 1.9342322859687062, + "ewc_loss": 0.07052396982908249, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003536771982908249, + "grad_norm": 8.249727249145508, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8743097186088562, + "num_tokens": 579959022.0, + "step": 15205 + }, + { + "epoch": 1.9343594962472968, + "ewc_loss": 0.07012836635112762, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034972114372067153, + "grad_norm": 8.220930099487305, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8474746942520142, + "num_tokens": 580003675.0, + "step": 15206 + }, + { + "epoch": 1.9344867065258873, + "ewc_loss": 0.0704815536737442, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000353253009961918, + "grad_norm": 8.364619255065918, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8725262880325317, + "num_tokens": 580044384.0, + "step": 15207 + }, + { + "epoch": 1.9346139168044778, + "ewc_loss": 0.0701354444026947, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003497919242363423, + "grad_norm": 8.184889793395996, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8528944253921509, + "num_tokens": 580078955.0, + "step": 15208 + }, + { + "epoch": 1.9347411270830683, + "ewc_loss": 0.07060602307319641, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003544977807905525, + "grad_norm": 8.313292503356934, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8607439398765564, + "num_tokens": 580116793.0, + "step": 15209 + }, + { + "epoch": 1.9348683373616589, + "ewc_loss": 0.07000862061977386, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003485236957203597, + "grad_norm": 8.1522798538208, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8629190921783447, + "num_tokens": 580153622.0, + "step": 15210 + }, + { + "epoch": 1.9349955476402494, + "ewc_loss": 0.070694699883461, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003553845454007387, + "grad_norm": 8.329347610473633, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8600015640258789, + "num_tokens": 580195287.0, + "step": 15211 + }, + { + "epoch": 1.93512275791884, + "ewc_loss": 0.0701129287481308, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034956683521158993, + "grad_norm": 8.200020790100098, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8492528200149536, + "num_tokens": 580229601.0, + "step": 15212 + }, + { + "epoch": 1.9352499681974304, + "ewc_loss": 0.0706346333026886, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035478384234011173, + "grad_norm": 8.319605827331543, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8657406568527222, + "num_tokens": 580268079.0, + "step": 15213 + }, + { + "epoch": 1.935377178476021, + "ewc_loss": 0.07013175636529922, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003497550787869841, + "grad_norm": 8.18184757232666, + "learning_rate": 1e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.8450061678886414, + "num_tokens": 580305379.0, + "step": 15214 + }, + { + "epoch": 1.9355043887546115, + "ewc_loss": 0.07065173238515854, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035495482734404504, + "grad_norm": 8.374303817749023, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8615554571151733, + "num_tokens": 580338438.0, + "step": 15215 + }, + { + "epoch": 1.935631599033202, + "ewc_loss": 0.07019945979118347, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035043206298723817, + "grad_norm": 8.223294258117676, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8629086017608643, + "num_tokens": 580370017.0, + "step": 15216 + }, + { + "epoch": 1.9357588093117923, + "ewc_loss": 0.07052221149206161, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000353659619577229, + "grad_norm": 8.329465866088867, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8738095760345459, + "num_tokens": 580409572.0, + "step": 15217 + }, + { + "epoch": 1.9358860195903829, + "ewc_loss": 0.07021491229534149, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003505866334307939, + "grad_norm": 8.14456558227539, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8676731586456299, + "num_tokens": 580448481.0, + "step": 15218 + }, + { + "epoch": 1.9360132298689734, + "ewc_loss": 0.070652075111866, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035495826159603894, + "grad_norm": 8.34549331665039, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8741289973258972, + "num_tokens": 580481919.0, + "step": 15219 + }, + { + "epoch": 1.936140440147564, + "ewc_loss": 0.07013379782438278, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034977548057213426, + "grad_norm": 8.206953048706055, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8576928377151489, + "num_tokens": 580520758.0, + "step": 15220 + }, + { + "epoch": 1.9362676504261545, + "ewc_loss": 0.0704980194568634, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035341770853847265, + "grad_norm": 8.296822547912598, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8548250794410706, + "num_tokens": 580559839.0, + "step": 15221 + }, + { + "epoch": 1.936394860704745, + "ewc_loss": 0.07013452053070068, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000349782727425918, + "grad_norm": 8.213592529296875, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8891056776046753, + "num_tokens": 580591767.0, + "step": 15222 + }, + { + "epoch": 1.9365220709833355, + "ewc_loss": 0.07033009827136993, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003517384466249496, + "grad_norm": 8.320704460144043, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8710380792617798, + "num_tokens": 580633180.0, + "step": 15223 + }, + { + "epoch": 1.9366492812619258, + "ewc_loss": 0.0702100321650505, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035053782630711794, + "grad_norm": 8.197720527648926, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8643503189086914, + "num_tokens": 580677045.0, + "step": 15224 + }, + { + "epoch": 1.9367764915405163, + "ewc_loss": 0.07037511467933655, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035218868288211524, + "grad_norm": 8.272037506103516, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8757349252700806, + "num_tokens": 580717423.0, + "step": 15225 + }, + { + "epoch": 1.9369037018191069, + "ewc_loss": 0.07021297514438629, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003505672502797097, + "grad_norm": 8.254861831665039, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8661912679672241, + "num_tokens": 580756677.0, + "step": 15226 + }, + { + "epoch": 1.9370309120976974, + "ewc_loss": 0.0703037679195404, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035147517337463796, + "grad_norm": 8.261282920837402, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8667376041412354, + "num_tokens": 580796993.0, + "step": 15227 + }, + { + "epoch": 1.937158122376288, + "ewc_loss": 0.07045857608318329, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000353023293428123, + "grad_norm": 8.294301986694336, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8654366135597229, + "num_tokens": 580834616.0, + "step": 15228 + }, + { + "epoch": 1.9372853326548785, + "ewc_loss": 0.07020504772663116, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003504879423417151, + "grad_norm": 8.230069160461426, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8654412031173706, + "num_tokens": 580873718.0, + "step": 15229 + }, + { + "epoch": 1.937412542933469, + "ewc_loss": 0.0703909695148468, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035234715323895216, + "grad_norm": 8.327096939086914, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8593575954437256, + "num_tokens": 580911678.0, + "step": 15230 + }, + { + "epoch": 1.9375397532120595, + "ewc_loss": 0.06997190415859222, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035059789661318064, + "grad_norm": 8.264981269836426, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8688998222351074, + "num_tokens": 580953282.0, + "step": 15231 + }, + { + "epoch": 1.93766696349065, + "ewc_loss": 0.07030972838401794, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003515348071232438, + "grad_norm": 8.26494026184082, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8682747483253479, + "num_tokens": 580997954.0, + "step": 15232 + }, + { + "epoch": 1.9377941737692406, + "ewc_loss": 0.07018432021141052, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035028072306886315, + "grad_norm": 8.342597007751465, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8637416362762451, + "num_tokens": 581035623.0, + "step": 15233 + }, + { + "epoch": 1.937921384047831, + "ewc_loss": 0.07009823620319366, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003494198899716139, + "grad_norm": 8.331786155700684, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8733142614364624, + "num_tokens": 581068727.0, + "step": 15234 + }, + { + "epoch": 1.9380485943264216, + "ewc_loss": 0.07003805786371231, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034881808096542954, + "grad_norm": 8.284077644348145, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8627591729164124, + "num_tokens": 581111180.0, + "step": 15235 + }, + { + "epoch": 1.9381758046050122, + "ewc_loss": 0.07013309001922607, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003497684374451637, + "grad_norm": 8.356181144714355, + "learning_rate": 1e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8440513014793396, + "num_tokens": 581152373.0, + "step": 15236 + }, + { + "epoch": 1.9383030148836027, + "ewc_loss": 0.06994716823101044, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034790922654792666, + "grad_norm": 8.239385604858398, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8606950044631958, + "num_tokens": 581195854.0, + "step": 15237 + }, + { + "epoch": 1.9384302251621932, + "ewc_loss": 0.07034879922866821, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035192546783946455, + "grad_norm": 8.463006019592285, + "learning_rate": 1e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8352384567260742, + "num_tokens": 581236414.0, + "step": 15238 + }, + { + "epoch": 1.9385574354407837, + "ewc_loss": 0.0698394775390625, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003468322684057057, + "grad_norm": 8.24692440032959, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8470429182052612, + "num_tokens": 581275349.0, + "step": 15239 + }, + { + "epoch": 1.9386846457193743, + "ewc_loss": 0.07045701146125793, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003530076064635068, + "grad_norm": 8.383485794067383, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.870410144329071, + "num_tokens": 581313231.0, + "step": 15240 + }, + { + "epoch": 1.9388118559979648, + "ewc_loss": 0.06991170346736908, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003475545672699809, + "grad_norm": 8.259347915649414, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8629595041275024, + "num_tokens": 581355295.0, + "step": 15241 + }, + { + "epoch": 1.938939066276555, + "ewc_loss": 0.07041803002357483, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003526178188621998, + "grad_norm": 8.336108207702637, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8668795228004456, + "num_tokens": 581392633.0, + "step": 15242 + }, + { + "epoch": 1.9390662765551456, + "ewc_loss": 0.07003176212310791, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034875510027632117, + "grad_norm": 8.279973030090332, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8766391277313232, + "num_tokens": 581425552.0, + "step": 15243 + }, + { + "epoch": 1.9391934868337362, + "ewc_loss": 0.0702531710267067, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035096920328214765, + "grad_norm": 8.38158893585205, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8496295809745789, + "num_tokens": 581467707.0, + "step": 15244 + }, + { + "epoch": 1.9393206971123267, + "ewc_loss": 0.07003012299537659, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003487387439236045, + "grad_norm": 8.24422550201416, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8692915439605713, + "num_tokens": 581504888.0, + "step": 15245 + }, + { + "epoch": 1.9394479073909172, + "ewc_loss": 0.07038965821266174, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003523340274114162, + "grad_norm": 8.36106014251709, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8628566265106201, + "num_tokens": 581537800.0, + "step": 15246 + }, + { + "epoch": 1.9395751176695077, + "ewc_loss": 0.07010652869939804, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034950277768075466, + "grad_norm": 8.280603408813477, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.883408784866333, + "num_tokens": 581570594.0, + "step": 15247 + }, + { + "epoch": 1.939702327948098, + "ewc_loss": 0.07042611390352249, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035269864019937813, + "grad_norm": 14.041484832763672, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8547837734222412, + "num_tokens": 581610764.0, + "step": 15248 + }, + { + "epoch": 1.9398295382266886, + "ewc_loss": 0.07845911383628845, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00043302858830429614, + "grad_norm": 9.219217300415039, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8636279106140137, + "num_tokens": 581652341.0, + "step": 15249 + }, + { + "epoch": 1.939956748505279, + "ewc_loss": 0.07058948278427124, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003543323546182364, + "grad_norm": 8.502232551574707, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8551393151283264, + "num_tokens": 581689978.0, + "step": 15250 + }, + { + "epoch": 1.9400839587838696, + "ewc_loss": 0.07128283381462097, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003612658765632659, + "grad_norm": 8.538040161132812, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.872512936592102, + "num_tokens": 581728811.0, + "step": 15251 + }, + { + "epoch": 1.9402111690624602, + "ewc_loss": 0.07163067907094955, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003647443081717938, + "grad_norm": 8.499039649963379, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8729804754257202, + "num_tokens": 581768016.0, + "step": 15252 + }, + { + "epoch": 1.9403383793410507, + "ewc_loss": 0.07082223892211914, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035665984614752233, + "grad_norm": 8.42667293548584, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8657506704330444, + "num_tokens": 581805405.0, + "step": 15253 + }, + { + "epoch": 1.9404655896196412, + "ewc_loss": 0.07094397395849228, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035787723027169704, + "grad_norm": 8.485533714294434, + "learning_rate": 1e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8428338170051575, + "num_tokens": 581845302.0, + "step": 15254 + }, + { + "epoch": 1.9405927998982317, + "ewc_loss": 0.07055017352104187, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003563806531019509, + "grad_norm": 8.440452575683594, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8717731833457947, + "num_tokens": 581875391.0, + "step": 15255 + }, + { + "epoch": 1.9407200101768223, + "ewc_loss": 0.07089617848396301, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000357399316271767, + "grad_norm": 8.439340591430664, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8623156547546387, + "num_tokens": 581916580.0, + "step": 15256 + }, + { + "epoch": 1.9408472204554128, + "ewc_loss": 0.07057081162929535, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000354145624442026, + "grad_norm": 8.354818344116211, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8646333813667297, + "num_tokens": 581958539.0, + "step": 15257 + }, + { + "epoch": 1.9409744307340033, + "ewc_loss": 0.07074640691280365, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003559015749488026, + "grad_norm": 8.446524620056152, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8613299131393433, + "num_tokens": 581996981.0, + "step": 15258 + }, + { + "epoch": 1.9411016410125939, + "ewc_loss": 0.07017627358436584, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003526416840031743, + "grad_norm": 8.37130355834961, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8726410269737244, + "num_tokens": 582036068.0, + "step": 15259 + }, + { + "epoch": 1.9412288512911844, + "ewc_loss": 0.07038585841655731, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035229610512033105, + "grad_norm": 14.064554214477539, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8641349077224731, + "num_tokens": 582073322.0, + "step": 15260 + }, + { + "epoch": 1.941356061569775, + "ewc_loss": 0.07865305989980698, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004349680966697633, + "grad_norm": 9.297654151916504, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8664795756340027, + "num_tokens": 582110736.0, + "step": 15261 + }, + { + "epoch": 1.9414832718483654, + "ewc_loss": 0.07030609995126724, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035149851464666426, + "grad_norm": 8.40855884552002, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8456265926361084, + "num_tokens": 582153431.0, + "step": 15262 + }, + { + "epoch": 1.941610482126956, + "ewc_loss": 0.07147970050573349, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003632345178630203, + "grad_norm": 8.602771759033203, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8441665172576904, + "num_tokens": 582191793.0, + "step": 15263 + }, + { + "epoch": 1.9417376924055465, + "ewc_loss": 0.07128143310546875, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036125187762081623, + "grad_norm": 8.42021369934082, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8540225028991699, + "num_tokens": 582227923.0, + "step": 15264 + }, + { + "epoch": 1.941864902684137, + "ewc_loss": 0.07106444984674454, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000359082012437284, + "grad_norm": 8.487483978271484, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8465388417243958, + "num_tokens": 582268029.0, + "step": 15265 + }, + { + "epoch": 1.9419921129627273, + "ewc_loss": 0.07070689648389816, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000355506461346522, + "grad_norm": 8.342512130737305, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8562265634536743, + "num_tokens": 582302173.0, + "step": 15266 + }, + { + "epoch": 1.9421193232413179, + "ewc_loss": 0.07047294080257416, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00035804975777864456, + "grad_norm": 8.442632675170898, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8787924647331238, + "num_tokens": 582341642.0, + "step": 15267 + }, + { + "epoch": 1.9422465335199084, + "ewc_loss": 0.07039826363325119, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003548615495674312, + "grad_norm": 8.281085014343262, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.864690899848938, + "num_tokens": 582378960.0, + "step": 15268 + }, + { + "epoch": 1.942373743798499, + "ewc_loss": 0.07047367095947266, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035561565891839564, + "grad_norm": 8.378999710083008, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8704203367233276, + "num_tokens": 582422149.0, + "step": 15269 + }, + { + "epoch": 1.9425009540770894, + "ewc_loss": 0.07023045420646667, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003531834518071264, + "grad_norm": 8.287406921386719, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8489307761192322, + "num_tokens": 582462657.0, + "step": 15270 + }, + { + "epoch": 1.94262816435568, + "ewc_loss": 0.07085075229406357, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035694503458216786, + "grad_norm": 8.472662925720215, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8752401471138, + "num_tokens": 582499264.0, + "step": 15271 + }, + { + "epoch": 1.9427553746342705, + "ewc_loss": 0.07001568377017975, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003510357637424022, + "grad_norm": 8.290132522583008, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8662736415863037, + "num_tokens": 582534744.0, + "step": 15272 + }, + { + "epoch": 1.9428825849128608, + "ewc_loss": 0.07074123620986938, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000358291290467605, + "grad_norm": 8.39203929901123, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8586040735244751, + "num_tokens": 582577010.0, + "step": 15273 + }, + { + "epoch": 1.9430097951914513, + "ewc_loss": 0.07019840180873871, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035286296042613685, + "grad_norm": 8.308117866516113, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.857476532459259, + "num_tokens": 582617103.0, + "step": 15274 + }, + { + "epoch": 1.9431370054700419, + "ewc_loss": 0.07070328295230865, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003554703143890947, + "grad_norm": 8.333468437194824, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8563360571861267, + "num_tokens": 582661754.0, + "step": 15275 + }, + { + "epoch": 1.9432642157486324, + "ewc_loss": 0.07033787667751312, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003542577032931149, + "grad_norm": 8.284480094909668, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8642808198928833, + "num_tokens": 582701701.0, + "step": 15276 + }, + { + "epoch": 1.943391426027223, + "ewc_loss": 0.07042086124420166, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035508754081092775, + "grad_norm": 8.373024940490723, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8547786474227905, + "num_tokens": 582741977.0, + "step": 15277 + }, + { + "epoch": 1.9435186363058135, + "ewc_loss": 0.07019856572151184, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035286459024064243, + "grad_norm": 8.268437385559082, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8729924559593201, + "num_tokens": 582783420.0, + "step": 15278 + }, + { + "epoch": 1.943645846584404, + "ewc_loss": 0.0706244558095932, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035712344106286764, + "grad_norm": 8.388240814208984, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8703038096427917, + "num_tokens": 582819464.0, + "step": 15279 + }, + { + "epoch": 1.9437730568629945, + "ewc_loss": 0.07010321319103241, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035191100323572755, + "grad_norm": 8.238636016845703, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8672873973846436, + "num_tokens": 582860757.0, + "step": 15280 + }, + { + "epoch": 1.943900267141585, + "ewc_loss": 0.07055138796567917, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035639278939925134, + "grad_norm": 8.33323860168457, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.857597827911377, + "num_tokens": 582910300.0, + "step": 15281 + }, + { + "epoch": 1.9440274774201756, + "ewc_loss": 0.07049021124839783, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003533396520651877, + "grad_norm": 8.259520530700684, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8566405773162842, + "num_tokens": 582956333.0, + "step": 15282 + }, + { + "epoch": 1.944154687698766, + "ewc_loss": 0.07053539901971817, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035623289295472205, + "grad_norm": 8.31562328338623, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8719763159751892, + "num_tokens": 582994068.0, + "step": 15283 + }, + { + "epoch": 1.9442818979773566, + "ewc_loss": 0.07033750414848328, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000354253948898986, + "grad_norm": 8.314977645874023, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8607237935066223, + "num_tokens": 583027829.0, + "step": 15284 + }, + { + "epoch": 1.9444091082559471, + "ewc_loss": 0.0704735741019249, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035561464028432965, + "grad_norm": 8.218880653381348, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8607512712478638, + "num_tokens": 583068881.0, + "step": 15285 + }, + { + "epoch": 1.9445363185345377, + "ewc_loss": 0.07064500451087952, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003573289141058922, + "grad_norm": 8.35971450805664, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8667436242103577, + "num_tokens": 583100097.0, + "step": 15286 + }, + { + "epoch": 1.9446635288131282, + "ewc_loss": 0.07042627036571503, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003551416448317468, + "grad_norm": 8.286689758300781, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8628485798835754, + "num_tokens": 583135340.0, + "step": 15287 + }, + { + "epoch": 1.9447907390917187, + "ewc_loss": 0.07066094875335693, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003574883739929646, + "grad_norm": 8.28164291381836, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8621572256088257, + "num_tokens": 583172032.0, + "step": 15288 + }, + { + "epoch": 1.9449179493703093, + "ewc_loss": 0.0707625299692154, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000356062751961872, + "grad_norm": 8.208824157714844, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8670223951339722, + "num_tokens": 583205244.0, + "step": 15289 + }, + { + "epoch": 1.9450451596488998, + "ewc_loss": 0.07075123488903046, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003583912330213934, + "grad_norm": 8.293679237365723, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8577336072921753, + "num_tokens": 583247537.0, + "step": 15290 + }, + { + "epoch": 1.94517236992749, + "ewc_loss": 0.07041693478822708, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035504825063981116, + "grad_norm": 8.250473976135254, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.856905460357666, + "num_tokens": 583285349.0, + "step": 15291 + }, + { + "epoch": 1.9452995802060806, + "ewc_loss": 0.0706835389137268, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035771424882113934, + "grad_norm": 8.240660667419434, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8652868270874023, + "num_tokens": 583323810.0, + "step": 15292 + }, + { + "epoch": 1.9454267904846712, + "ewc_loss": 0.07074753940105438, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003559128672350198, + "grad_norm": 8.216567993164062, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.87440025806427, + "num_tokens": 583354244.0, + "step": 15293 + }, + { + "epoch": 1.9455540007632617, + "ewc_loss": 0.07071465253829956, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003580253978725523, + "grad_norm": 8.290267944335938, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8780338764190674, + "num_tokens": 583391764.0, + "step": 15294 + }, + { + "epoch": 1.9456812110418522, + "ewc_loss": 0.07063723355531693, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035725123598240316, + "grad_norm": 8.224647521972656, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8508115410804749, + "num_tokens": 583426305.0, + "step": 15295 + }, + { + "epoch": 1.9458084213204427, + "ewc_loss": 0.07081026583909988, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003589815751183778, + "grad_norm": 8.282000541687012, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8591592311859131, + "num_tokens": 583465252.0, + "step": 15296 + }, + { + "epoch": 1.945935631599033, + "ewc_loss": 0.07067136466503143, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003575925948098302, + "grad_norm": 8.255105018615723, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8538432717323303, + "num_tokens": 583510001.0, + "step": 15297 + }, + { + "epoch": 1.9460628418776236, + "ewc_loss": 0.07067986577749252, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003576775488909334, + "grad_norm": 8.220842361450195, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8645992279052734, + "num_tokens": 583547380.0, + "step": 15298 + }, + { + "epoch": 1.946190052156214, + "ewc_loss": 0.07061073184013367, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035698627470992506, + "grad_norm": 8.24956226348877, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8639431595802307, + "num_tokens": 583588852.0, + "step": 15299 + }, + { + "epoch": 1.9463172624348046, + "ewc_loss": 0.07065007090568542, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035737958387471735, + "grad_norm": 8.30009937286377, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8689529895782471, + "num_tokens": 583634006.0, + "step": 15300 + }, + { + "epoch": 1.9464444727133952, + "ewc_loss": 0.07064226269721985, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035730149829760194, + "grad_norm": 8.262001037597656, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8481605052947998, + "num_tokens": 583679430.0, + "step": 15301 + }, + { + "epoch": 1.9465716829919857, + "ewc_loss": 0.07068541646003723, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035773302079178393, + "grad_norm": 8.283361434936523, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8690675497055054, + "num_tokens": 583708856.0, + "step": 15302 + }, + { + "epoch": 1.9466988932705762, + "ewc_loss": 0.07060296088457108, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035690850927494466, + "grad_norm": 8.273685455322266, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8608692288398743, + "num_tokens": 583750737.0, + "step": 15303 + }, + { + "epoch": 1.9468261035491667, + "ewc_loss": 0.07071449607610703, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003580238844733685, + "grad_norm": 8.289567947387695, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8675342798233032, + "num_tokens": 583786342.0, + "step": 15304 + }, + { + "epoch": 1.9469533138277573, + "ewc_loss": 0.07025127112865448, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003558330063242465, + "grad_norm": 8.196226119995117, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8687857389450073, + "num_tokens": 583821681.0, + "step": 15305 + }, + { + "epoch": 1.9470805241063478, + "ewc_loss": 0.07056945562362671, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00035901489900425076, + "grad_norm": 8.354870796203613, + "learning_rate": 1e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8440717458724976, + "num_tokens": 583855640.0, + "step": 15306 + }, + { + "epoch": 1.9472077343849383, + "ewc_loss": 0.07066772878170013, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035755624412558973, + "grad_norm": 8.226024627685547, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8723031878471375, + "num_tokens": 583888047.0, + "step": 15307 + }, + { + "epoch": 1.9473349446635289, + "ewc_loss": 0.0709826648235321, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003607055405154824, + "grad_norm": 8.317721366882324, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.862740695476532, + "num_tokens": 583923220.0, + "step": 15308 + }, + { + "epoch": 1.9474621549421194, + "ewc_loss": 0.07082711160182953, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003567085659597069, + "grad_norm": 8.239632606506348, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8582861423492432, + "num_tokens": 583965718.0, + "step": 15309 + }, + { + "epoch": 1.94758936522071, + "ewc_loss": 0.07121453434228897, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036058283876627684, + "grad_norm": 8.317041397094727, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8679845333099365, + "num_tokens": 584014027.0, + "step": 15310 + }, + { + "epoch": 1.9477165754993004, + "ewc_loss": 0.07090838253498077, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003575212904252112, + "grad_norm": 8.3095703125, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8657438158988953, + "num_tokens": 584045019.0, + "step": 15311 + }, + { + "epoch": 1.947843785777891, + "ewc_loss": 0.07100211828947067, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003584586957003921, + "grad_norm": 8.279990196228027, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8467459678649902, + "num_tokens": 584086839.0, + "step": 15312 + }, + { + "epoch": 1.9479709960564815, + "ewc_loss": 0.07094718515872955, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035790930269286036, + "grad_norm": 8.299941062927246, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8778411149978638, + "num_tokens": 584123785.0, + "step": 15313 + }, + { + "epoch": 1.948098206335072, + "ewc_loss": 0.07099258899688721, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003583633806556463, + "grad_norm": 8.271257400512695, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8542375564575195, + "num_tokens": 584163768.0, + "step": 15314 + }, + { + "epoch": 1.9482254166136623, + "ewc_loss": 0.07097582519054413, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035819580079987645, + "grad_norm": 8.285470962524414, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8586474657058716, + "num_tokens": 584201668.0, + "step": 15315 + }, + { + "epoch": 1.9483526268922529, + "ewc_loss": 0.07094359397888184, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035787338856607676, + "grad_norm": 8.323243141174316, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.869930624961853, + "num_tokens": 584236626.0, + "step": 15316 + }, + { + "epoch": 1.9484798371708434, + "ewc_loss": 0.07087570428848267, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000357194512616843, + "grad_norm": 8.219462394714355, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8463565111160278, + "num_tokens": 584279811.0, + "step": 15317 + }, + { + "epoch": 1.948607047449434, + "ewc_loss": 0.07112791389226913, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035971662146039307, + "grad_norm": 8.359362602233887, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.863599956035614, + "num_tokens": 584317314.0, + "step": 15318 + }, + { + "epoch": 1.9487342577280244, + "ewc_loss": 0.07090628147125244, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003550589317455888, + "grad_norm": 8.44696044921875, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8683562278747559, + "num_tokens": 584355768.0, + "step": 15319 + }, + { + "epoch": 1.948861468006615, + "ewc_loss": 0.07074940949678421, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003559315809980035, + "grad_norm": 8.297131538391113, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.85930997133255, + "num_tokens": 584386205.0, + "step": 15320 + }, + { + "epoch": 1.9489886782852053, + "ewc_loss": 0.07088686525821686, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035730618401430547, + "grad_norm": 8.301541328430176, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8747744560241699, + "num_tokens": 584419721.0, + "step": 15321 + }, + { + "epoch": 1.9491158885637958, + "ewc_loss": 0.07063709199428558, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035480843507684767, + "grad_norm": 8.286839485168457, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8605611324310303, + "num_tokens": 584451029.0, + "step": 15322 + }, + { + "epoch": 1.9492430988423863, + "ewc_loss": 0.07096593081951141, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003580967604648322, + "grad_norm": 8.312222480773926, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8769552707672119, + "num_tokens": 584483991.0, + "step": 15323 + }, + { + "epoch": 1.9493703091209769, + "ewc_loss": 0.07089477777481079, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003573852591216564, + "grad_norm": 8.312710762023926, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.861706018447876, + "num_tokens": 584517652.0, + "step": 15324 + }, + { + "epoch": 1.9494975193995674, + "ewc_loss": 0.07077478617429733, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003561853663995862, + "grad_norm": 8.288999557495117, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8713357448577881, + "num_tokens": 584561058.0, + "step": 15325 + }, + { + "epoch": 1.949624729678158, + "ewc_loss": 0.07094777375459671, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035791523987427354, + "grad_norm": 8.363609313964844, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8662241101264954, + "num_tokens": 584595097.0, + "step": 15326 + }, + { + "epoch": 1.9497519399567484, + "ewc_loss": 0.07066059857606888, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035504347761161625, + "grad_norm": 8.220452308654785, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8709871768951416, + "num_tokens": 584639073.0, + "step": 15327 + }, + { + "epoch": 1.949879150235339, + "ewc_loss": 0.07107621431350708, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003591995919123292, + "grad_norm": 8.34567642211914, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8535876274108887, + "num_tokens": 584680517.0, + "step": 15328 + }, + { + "epoch": 1.9500063605139295, + "ewc_loss": 0.07054126262664795, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035385016235522926, + "grad_norm": 8.206354141235352, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8619013428688049, + "num_tokens": 584717796.0, + "step": 15329 + }, + { + "epoch": 1.95013357079252, + "ewc_loss": 0.07113420218229294, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035977951483801007, + "grad_norm": 8.38090705871582, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.875316858291626, + "num_tokens": 584755595.0, + "step": 15330 + }, + { + "epoch": 1.9502607810711106, + "ewc_loss": 0.0706062912940979, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035450042923912406, + "grad_norm": 8.226758003234863, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8527781367301941, + "num_tokens": 584798681.0, + "step": 15331 + }, + { + "epoch": 1.950387991349701, + "ewc_loss": 0.07109974324703217, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035943492548540235, + "grad_norm": 8.402032852172852, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8648090362548828, + "num_tokens": 584835424.0, + "step": 15332 + }, + { + "epoch": 1.9505152016282916, + "ewc_loss": 0.0704672634601593, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003531101392582059, + "grad_norm": 8.240867614746094, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8763510584831238, + "num_tokens": 584872063.0, + "step": 15333 + }, + { + "epoch": 1.9506424119068821, + "ewc_loss": 0.07102499902248383, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035868751001544297, + "grad_norm": 8.367558479309082, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.878768265247345, + "num_tokens": 584905850.0, + "step": 15334 + }, + { + "epoch": 1.9507696221854727, + "ewc_loss": 0.07058022916316986, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035423983354121447, + "grad_norm": 8.261756896972656, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8715119361877441, + "num_tokens": 584946480.0, + "step": 15335 + }, + { + "epoch": 1.9508968324640632, + "ewc_loss": 0.0709112137556076, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035754963755607605, + "grad_norm": 8.390597343444824, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8699638843536377, + "num_tokens": 584977382.0, + "step": 15336 + }, + { + "epoch": 1.9510240427426537, + "ewc_loss": 0.07050663232803345, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035350382677279413, + "grad_norm": 8.27719783782959, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8651789426803589, + "num_tokens": 585010885.0, + "step": 15337 + }, + { + "epoch": 1.9511512530212443, + "ewc_loss": 0.07082943618297577, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003567318490240723, + "grad_norm": 8.317874908447266, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8603829145431519, + "num_tokens": 585048208.0, + "step": 15338 + }, + { + "epoch": 1.9512784632998348, + "ewc_loss": 0.07057184725999832, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035415595630183816, + "grad_norm": 8.292506217956543, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8589135408401489, + "num_tokens": 585087477.0, + "step": 15339 + }, + { + "epoch": 1.951405673578425, + "ewc_loss": 0.07078789919614792, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003563164791557938, + "grad_norm": 8.694703102111816, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8625249862670898, + "num_tokens": 585120860.0, + "step": 15340 + }, + { + "epoch": 1.9515328838570156, + "ewc_loss": 0.07013146579265594, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034975219750776887, + "grad_norm": 8.159171104431152, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8659672737121582, + "num_tokens": 585161887.0, + "step": 15341 + }, + { + "epoch": 1.9516600941356061, + "ewc_loss": 0.07131728529930115, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003616103786043823, + "grad_norm": 8.463150024414062, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8718286752700806, + "num_tokens": 585200496.0, + "step": 15342 + }, + { + "epoch": 1.9517873044141967, + "ewc_loss": 0.07007172703742981, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00034915475407615304, + "grad_norm": 8.243876457214355, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8648337125778198, + "num_tokens": 585243976.0, + "step": 15343 + }, + { + "epoch": 1.9519145146927872, + "ewc_loss": 0.07110937684774399, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035953125916421413, + "grad_norm": 8.403489112854004, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8588626980781555, + "num_tokens": 585282366.0, + "step": 15344 + }, + { + "epoch": 1.9520417249713777, + "ewc_loss": 0.07031844556331635, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003516220021992922, + "grad_norm": 8.213347434997559, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8633911609649658, + "num_tokens": 585325224.0, + "step": 15345 + }, + { + "epoch": 1.952168935249968, + "ewc_loss": 0.07096539437770844, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003580914344638586, + "grad_norm": 8.46187973022461, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8723855018615723, + "num_tokens": 585361954.0, + "step": 15346 + }, + { + "epoch": 1.9522961455285586, + "ewc_loss": 0.07031761109828949, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003516136493999511, + "grad_norm": 8.235675811767578, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8604499101638794, + "num_tokens": 585406033.0, + "step": 15347 + }, + { + "epoch": 1.952423355807149, + "ewc_loss": 0.07093952596187592, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003578327305149287, + "grad_norm": 8.408617973327637, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.862571120262146, + "num_tokens": 585440457.0, + "step": 15348 + }, + { + "epoch": 1.9525505660857396, + "ewc_loss": 0.07055407762527466, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003539782774168998, + "grad_norm": 8.317891120910645, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8865787982940674, + "num_tokens": 585473451.0, + "step": 15349 + }, + { + "epoch": 1.9526777763643302, + "ewc_loss": 0.07097875326871872, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035822505014948547, + "grad_norm": 8.4249849319458, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8676179647445679, + "num_tokens": 585509652.0, + "step": 15350 + }, + { + "epoch": 1.9528049866429207, + "ewc_loss": 0.07052461057901382, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035368360113352537, + "grad_norm": 8.282764434814453, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8681750297546387, + "num_tokens": 585545591.0, + "step": 15351 + }, + { + "epoch": 1.9529321969215112, + "ewc_loss": 0.07097776234149933, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003582151257432997, + "grad_norm": 8.401623725891113, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8524705171585083, + "num_tokens": 585583757.0, + "step": 15352 + }, + { + "epoch": 1.9530594072001017, + "ewc_loss": 0.07180853188037872, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000354315823642537, + "grad_norm": 53.6325798034668, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8528715372085571, + "num_tokens": 585623897.0, + "step": 15353 + }, + { + "epoch": 1.9531866174786923, + "ewc_loss": 0.11474428325891495, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0007958803325891495, + "grad_norm": 13.122106552124023, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8665200471878052, + "num_tokens": 585661043.0, + "step": 15354 + }, + { + "epoch": 1.9533138277572828, + "ewc_loss": 0.06878608465194702, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00033629839890636504, + "grad_norm": 6.99373722076416, + "learning_rate": 1e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8368241786956787, + "num_tokens": 585699201.0, + "step": 15355 + }, + { + "epoch": 1.9534410380358733, + "ewc_loss": 0.09890203177928925, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0006374578224495053, + "grad_norm": 12.354403495788574, + "learning_rate": 1e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8567662239074707, + "num_tokens": 585735325.0, + "step": 15356 + }, + { + "epoch": 1.9535682483144639, + "ewc_loss": 0.10336041450500488, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0006820416310802102, + "grad_norm": 11.917952537536621, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8710887432098389, + "num_tokens": 585769909.0, + "step": 15357 + }, + { + "epoch": 1.9536954585930544, + "ewc_loss": 0.0802701935172081, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004511394363362342, + "grad_norm": 8.78751277923584, + "learning_rate": 1e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.852838397026062, + "num_tokens": 585804590.0, + "step": 15358 + }, + { + "epoch": 1.953822668871645, + "ewc_loss": 0.08414718508720398, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004899093182757497, + "grad_norm": 10.386381149291992, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8601962924003601, + "num_tokens": 585843349.0, + "step": 15359 + }, + { + "epoch": 1.9539498791502354, + "ewc_loss": 0.08607256412506104, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0005091631319373846, + "grad_norm": 9.833307266235352, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8760550022125244, + "num_tokens": 585881846.0, + "step": 15360 + }, + { + "epoch": 1.954077089428826, + "ewc_loss": 0.07794924080371857, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004279298591427505, + "grad_norm": 9.058760643005371, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8608233332633972, + "num_tokens": 585917848.0, + "step": 15361 + }, + { + "epoch": 1.9542042997074165, + "ewc_loss": 0.07903476059436798, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004387850931379944, + "grad_norm": 9.468706130981445, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.869358241558075, + "num_tokens": 585961249.0, + "step": 15362 + }, + { + "epoch": 1.954331509986007, + "ewc_loss": 0.07733836770057678, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00042182113975286484, + "grad_norm": 8.871879577636719, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8724555969238281, + "num_tokens": 586005795.0, + "step": 15363 + }, + { + "epoch": 1.9544587202645973, + "ewc_loss": 0.07602546364068985, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00040869213989935815, + "grad_norm": 9.059906005859375, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8731013536453247, + "num_tokens": 586041523.0, + "step": 15364 + }, + { + "epoch": 1.9545859305431879, + "ewc_loss": 0.07526208460330963, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00040105832158587873, + "grad_norm": 8.818877220153809, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8551415801048279, + "num_tokens": 586077800.0, + "step": 15365 + }, + { + "epoch": 1.9547131408217784, + "ewc_loss": 0.0742502212524414, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003933811385650188, + "grad_norm": 8.796531677246094, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8591136336326599, + "num_tokens": 586116010.0, + "step": 15366 + }, + { + "epoch": 1.954840351100369, + "ewc_loss": 0.07321161776781082, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00038299508742056787, + "grad_norm": 8.617657661437988, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.875766396522522, + "num_tokens": 586155359.0, + "step": 15367 + }, + { + "epoch": 1.9549675613789594, + "ewc_loss": 0.07332087308168411, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00038408764521591365, + "grad_norm": 8.704732894897461, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8672254681587219, + "num_tokens": 586188190.0, + "step": 15368 + }, + { + "epoch": 1.95509477165755, + "ewc_loss": 0.07229825854301453, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003738615196198225, + "grad_norm": 8.53840160369873, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8631336092948914, + "num_tokens": 586225971.0, + "step": 15369 + }, + { + "epoch": 1.9552219819361403, + "ewc_loss": 0.07265334576368332, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003749709576368332, + "grad_norm": 8.622870445251465, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.860141932964325, + "num_tokens": 586263295.0, + "step": 15370 + }, + { + "epoch": 1.9553491922147308, + "ewc_loss": 0.07130296528339386, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003663499956019223, + "grad_norm": 8.400583267211914, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8853524923324585, + "num_tokens": 586293547.0, + "step": 15371 + }, + { + "epoch": 1.9554764024933213, + "ewc_loss": 0.07175550609827042, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00037087537930347025, + "grad_norm": 8.591815948486328, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8597061634063721, + "num_tokens": 586333180.0, + "step": 15372 + }, + { + "epoch": 1.9556036127719119, + "ewc_loss": 0.07097858190536499, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00036310614086687565, + "grad_norm": 8.421226501464844, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8649156093597412, + "num_tokens": 586368780.0, + "step": 15373 + }, + { + "epoch": 1.9557308230505024, + "ewc_loss": 0.07126413285732269, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00036596160498447716, + "grad_norm": 8.50173282623291, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8483101725578308, + "num_tokens": 586404177.0, + "step": 15374 + }, + { + "epoch": 1.955858033329093, + "ewc_loss": 0.07067060470581055, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00036002640263177454, + "grad_norm": 8.3631010055542, + "learning_rate": 1e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8495632410049438, + "num_tokens": 586446917.0, + "step": 15375 + }, + { + "epoch": 1.9559852436076834, + "ewc_loss": 0.07119931280612946, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036287205875851214, + "grad_norm": 8.454431533813477, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8637465238571167, + "num_tokens": 586485832.0, + "step": 15376 + }, + { + "epoch": 1.956112453886274, + "ewc_loss": 0.07081913948059082, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035907034180127084, + "grad_norm": 8.416081428527832, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8746327757835388, + "num_tokens": 586524089.0, + "step": 15377 + }, + { + "epoch": 1.9562396641648645, + "ewc_loss": 0.07096832990646362, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036056223325431347, + "grad_norm": 8.396953582763672, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.867693305015564, + "num_tokens": 586564097.0, + "step": 15378 + }, + { + "epoch": 1.956366874443455, + "ewc_loss": 0.07072370499372482, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035811596899293363, + "grad_norm": 8.343635559082031, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8762626647949219, + "num_tokens": 586600653.0, + "step": 15379 + }, + { + "epoch": 1.9564940847220456, + "ewc_loss": 0.07086050510406494, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003594839945435524, + "grad_norm": 8.366175651550293, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8696644306182861, + "num_tokens": 586639527.0, + "step": 15380 + }, + { + "epoch": 1.956621295000636, + "ewc_loss": 0.07064419984817505, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035732085234485567, + "grad_norm": 8.380843162536621, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8671606779098511, + "num_tokens": 586670885.0, + "step": 15381 + }, + { + "epoch": 1.9567485052792266, + "ewc_loss": 0.07074292749166489, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003583081706892699, + "grad_norm": 8.330002784729004, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8766053915023804, + "num_tokens": 586709875.0, + "step": 15382 + }, + { + "epoch": 1.9568757155578171, + "ewc_loss": 0.07069622725248337, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035784117062576115, + "grad_norm": 8.264053344726562, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8582091927528381, + "num_tokens": 586753164.0, + "step": 15383 + }, + { + "epoch": 1.9570029258364077, + "ewc_loss": 0.07101984322071075, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003586359671317041, + "grad_norm": 8.319839477539062, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8713260889053345, + "num_tokens": 586793737.0, + "step": 15384 + }, + { + "epoch": 1.9571301361149982, + "ewc_loss": 0.07058917731046677, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035677067353390157, + "grad_norm": 8.28783893585205, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8593926429748535, + "num_tokens": 586833428.0, + "step": 15385 + }, + { + "epoch": 1.9572573463935887, + "ewc_loss": 0.07081734389066696, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003590523556340486, + "grad_norm": 8.378496170043945, + "learning_rate": 1e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8416249752044678, + "num_tokens": 586871202.0, + "step": 15386 + }, + { + "epoch": 1.9573845566721793, + "ewc_loss": 0.07062087953090668, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035708773066289723, + "grad_norm": 8.310213088989258, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8754947185516357, + "num_tokens": 586905537.0, + "step": 15387 + }, + { + "epoch": 1.9575117669507698, + "ewc_loss": 0.07080581784248352, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000358937046257779, + "grad_norm": 8.33389949798584, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8525714874267578, + "num_tokens": 586940378.0, + "step": 15388 + }, + { + "epoch": 1.95763897722936, + "ewc_loss": 0.0705806165933609, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035668505006469786, + "grad_norm": 8.25180721282959, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8745068311691284, + "num_tokens": 586978806.0, + "step": 15389 + }, + { + "epoch": 1.9577661875079506, + "ewc_loss": 0.07087653875350952, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003596442984417081, + "grad_norm": 8.433777809143066, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8585734367370605, + "num_tokens": 587007367.0, + "step": 15390 + }, + { + "epoch": 1.9578933977865411, + "ewc_loss": 0.07052819430828094, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003561608027666807, + "grad_norm": 8.232024192810059, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8865668177604675, + "num_tokens": 587040422.0, + "step": 15391 + }, + { + "epoch": 1.9580206080651317, + "ewc_loss": 0.07131856679916382, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003616231260821223, + "grad_norm": 8.428301811218262, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.882166862487793, + "num_tokens": 587070238.0, + "step": 15392 + }, + { + "epoch": 1.9581478183437222, + "ewc_loss": 0.0706675723195076, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003551132103893906, + "grad_norm": 8.269465446472168, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.871371328830719, + "num_tokens": 587105973.0, + "step": 15393 + }, + { + "epoch": 1.9582750286223127, + "ewc_loss": 0.07118375599384308, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003602750366553664, + "grad_norm": 8.360758781433105, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8765312433242798, + "num_tokens": 587137975.0, + "step": 15394 + }, + { + "epoch": 1.958402238900903, + "ewc_loss": 0.07064469158649445, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003548844251781702, + "grad_norm": 8.279738426208496, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8611217737197876, + "num_tokens": 587169176.0, + "step": 15395 + }, + { + "epoch": 1.9585294491794936, + "ewc_loss": 0.07093669474124908, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003578044706955552, + "grad_norm": 8.337789535522461, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8600383996963501, + "num_tokens": 587205570.0, + "step": 15396 + }, + { + "epoch": 1.958656659458084, + "ewc_loss": 0.07077687978744507, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003562063502613455, + "grad_norm": 8.246811866760254, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.855716347694397, + "num_tokens": 587246729.0, + "step": 15397 + }, + { + "epoch": 1.9587838697366746, + "ewc_loss": 0.0710831731557846, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003592692664824426, + "grad_norm": 8.31181526184082, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8676530718803406, + "num_tokens": 587284537.0, + "step": 15398 + }, + { + "epoch": 1.9589110800152651, + "ewc_loss": 0.07070861756801605, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035552369081415236, + "grad_norm": 8.213851928710938, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8691167831420898, + "num_tokens": 587324394.0, + "step": 15399 + }, + { + "epoch": 1.9590382902938557, + "ewc_loss": 0.07108079642057419, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000359245459549129, + "grad_norm": 8.291041374206543, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8725943565368652, + "num_tokens": 587359935.0, + "step": 15400 + }, + { + "epoch": 1.9591655005724462, + "ewc_loss": 0.0707307979464531, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035574549110606313, + "grad_norm": 8.228353500366211, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8750245571136475, + "num_tokens": 587402081.0, + "step": 15401 + }, + { + "epoch": 1.9592927108510367, + "ewc_loss": 0.07107852399349213, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003592227876652032, + "grad_norm": 8.324685096740723, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8615764379501343, + "num_tokens": 587442939.0, + "step": 15402 + }, + { + "epoch": 1.9594199211296273, + "ewc_loss": 0.07074938714504242, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035593134816735983, + "grad_norm": 8.32342529296875, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8566222190856934, + "num_tokens": 587479155.0, + "step": 15403 + }, + { + "epoch": 1.9595471314082178, + "ewc_loss": 0.07086018472909927, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003570393309928477, + "grad_norm": 8.268869400024414, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8524556159973145, + "num_tokens": 587519285.0, + "step": 15404 + }, + { + "epoch": 1.9596743416868083, + "ewc_loss": 0.07100658118724823, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003585033118724823, + "grad_norm": 8.303624153137207, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8459558486938477, + "num_tokens": 587563978.0, + "step": 15405 + }, + { + "epoch": 1.9598015519653988, + "ewc_loss": 0.07077119499444962, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003561494522728026, + "grad_norm": 8.230372428894043, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8587707877159119, + "num_tokens": 587599052.0, + "step": 15406 + }, + { + "epoch": 1.9599287622439894, + "ewc_loss": 0.07097608596086502, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035819836193695664, + "grad_norm": 8.321920394897461, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8678946495056152, + "num_tokens": 587639980.0, + "step": 15407 + }, + { + "epoch": 1.96005597252258, + "ewc_loss": 0.07080386579036713, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003564761718735099, + "grad_norm": 8.297670364379883, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.85968017578125, + "num_tokens": 587673836.0, + "step": 15408 + }, + { + "epoch": 1.9601831828011704, + "ewc_loss": 0.07096980512142181, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035813558497466147, + "grad_norm": 8.326457023620605, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.864391565322876, + "num_tokens": 587712896.0, + "step": 15409 + }, + { + "epoch": 1.960310393079761, + "ewc_loss": 0.07080209255218506, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003564583894331008, + "grad_norm": 8.335966110229492, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8574020862579346, + "num_tokens": 587747352.0, + "step": 15410 + }, + { + "epoch": 1.9604376033583515, + "ewc_loss": 0.07089084386825562, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035734593984670937, + "grad_norm": 8.355100631713867, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8626932501792908, + "num_tokens": 587790234.0, + "step": 15411 + }, + { + "epoch": 1.960564813636942, + "ewc_loss": 0.07081282883882523, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035656578256748617, + "grad_norm": 8.337964057922363, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8779312372207642, + "num_tokens": 587830185.0, + "step": 15412 + }, + { + "epoch": 1.9606920239155323, + "ewc_loss": 0.07078371942043304, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003562747151590884, + "grad_norm": 8.339606285095215, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8654052019119263, + "num_tokens": 587866265.0, + "step": 15413 + }, + { + "epoch": 1.9608192341941229, + "ewc_loss": 0.07061642408370972, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003546017687767744, + "grad_norm": 8.21644401550293, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8756223917007446, + "num_tokens": 587904956.0, + "step": 15414 + }, + { + "epoch": 1.9609464444727134, + "ewc_loss": 0.07106772810220718, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003591147833503783, + "grad_norm": 14.125926971435547, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8714460134506226, + "num_tokens": 587942369.0, + "step": 15415 + }, + { + "epoch": 1.961073654751304, + "ewc_loss": 0.07942692935466766, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004427068342920393, + "grad_norm": 9.2181978225708, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8693307638168335, + "num_tokens": 587979340.0, + "step": 15416 + }, + { + "epoch": 1.9612008650298944, + "ewc_loss": 0.07145338505506516, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036297133192420006, + "grad_norm": 8.426127433776855, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8596440553665161, + "num_tokens": 588021812.0, + "step": 15417 + }, + { + "epoch": 1.961328075308485, + "ewc_loss": 0.07197970896959305, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003682345850393176, + "grad_norm": 8.52757453918457, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8606781959533691, + "num_tokens": 588064978.0, + "step": 15418 + }, + { + "epoch": 1.9614552855870753, + "ewc_loss": 0.0724436566233635, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00037287408486008644, + "grad_norm": 8.549478530883789, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8597007393836975, + "num_tokens": 588098142.0, + "step": 15419 + }, + { + "epoch": 1.9615824958656658, + "ewc_loss": 0.07151942700147629, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003636317851487547, + "grad_norm": 8.413688659667969, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8708838820457458, + "num_tokens": 588141808.0, + "step": 15420 + }, + { + "epoch": 1.9617097061442563, + "ewc_loss": 0.07170821726322174, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003655197215266526, + "grad_norm": 8.451642990112305, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.869798481464386, + "num_tokens": 588178137.0, + "step": 15421 + }, + { + "epoch": 1.9618369164228469, + "ewc_loss": 0.07144735008478165, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036291099968366325, + "grad_norm": 8.520745277404785, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.857347846031189, + "num_tokens": 588215646.0, + "step": 15422 + }, + { + "epoch": 1.9619641267014374, + "ewc_loss": 0.07140421867370605, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003624797100201249, + "grad_norm": 8.380578994750977, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.854820728302002, + "num_tokens": 588251345.0, + "step": 15423 + }, + { + "epoch": 1.962091336980028, + "ewc_loss": 0.0713447630405426, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036188517697155476, + "grad_norm": 8.448472023010254, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8698439598083496, + "num_tokens": 588291222.0, + "step": 15424 + }, + { + "epoch": 1.9622185472586184, + "ewc_loss": 0.07107323408126831, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035916981869377196, + "grad_norm": 8.361222267150879, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8545277118682861, + "num_tokens": 588329480.0, + "step": 15425 + }, + { + "epoch": 1.962345757537209, + "ewc_loss": 0.07138629257678986, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036230043042451143, + "grad_norm": 8.386996269226074, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8560813665390015, + "num_tokens": 588367143.0, + "step": 15426 + }, + { + "epoch": 1.9624729678157995, + "ewc_loss": 0.07108937203884125, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000359331228537485, + "grad_norm": 8.342939376831055, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8735889792442322, + "num_tokens": 588408363.0, + "step": 15427 + }, + { + "epoch": 1.96260017809439, + "ewc_loss": 0.07135205715894699, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003619580820668489, + "grad_norm": 8.375068664550781, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8697837591171265, + "num_tokens": 588445397.0, + "step": 15428 + }, + { + "epoch": 1.9627273883729806, + "ewc_loss": 0.07121943682432175, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000360631849616766, + "grad_norm": 8.359219551086426, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8693498969078064, + "num_tokens": 588480436.0, + "step": 15429 + }, + { + "epoch": 1.962854598651571, + "ewc_loss": 0.07120053470134735, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003604428784456104, + "grad_norm": 8.316492080688477, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8638057708740234, + "num_tokens": 588526918.0, + "step": 15430 + }, + { + "epoch": 1.9629818089301616, + "ewc_loss": 0.07096248120069504, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036050373455509543, + "grad_norm": 8.378621101379395, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8638519048690796, + "num_tokens": 588561300.0, + "step": 15431 + }, + { + "epoch": 1.9631090192087521, + "ewc_loss": 0.07048454880714417, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.00035816585295833647, + "grad_norm": 8.305191040039062, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8556673526763916, + "num_tokens": 588603393.0, + "step": 15432 + }, + { + "epoch": 1.9632362294873427, + "ewc_loss": 0.07078014314174652, + "ewc_loss_diag": 3.457069396972656e-05, + "ewc_loss_parallel": 0.0003611217252910137, + "grad_norm": 8.412524223327637, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8714780807495117, + "num_tokens": 588643806.0, + "step": 15433 + }, + { + "epoch": 1.9633634397659332, + "ewc_loss": 0.07037396728992462, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003546185907907784, + "grad_norm": 8.231637001037598, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.862736701965332, + "num_tokens": 588685090.0, + "step": 15434 + }, + { + "epoch": 1.9634906500445237, + "ewc_loss": 0.07116742432117462, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003601117350626737, + "grad_norm": 8.361283302307129, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8508893847465515, + "num_tokens": 588720983.0, + "step": 15435 + }, + { + "epoch": 1.9636178603231143, + "ewc_loss": 0.07078561186790466, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035629357444122434, + "grad_norm": 8.210797309875488, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8582254648208618, + "num_tokens": 588760143.0, + "step": 15436 + }, + { + "epoch": 1.9637450706017048, + "ewc_loss": 0.0711677074432373, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003625559329520911, + "grad_norm": 8.405584335327148, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8650996685028076, + "num_tokens": 588796636.0, + "step": 15437 + }, + { + "epoch": 1.963872280880295, + "ewc_loss": 0.07078462839126587, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035628373734652996, + "grad_norm": 8.214179992675781, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8607398867607117, + "num_tokens": 588831681.0, + "step": 15438 + }, + { + "epoch": 1.9639994911588856, + "ewc_loss": 0.07130888104438782, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036396775976754725, + "grad_norm": 8.318862915039062, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8752331733703613, + "num_tokens": 588875408.0, + "step": 15439 + }, + { + "epoch": 1.9641267014374761, + "ewc_loss": 0.07075484097003937, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035842726356349885, + "grad_norm": 13.986856460571289, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8690123558044434, + "num_tokens": 588913583.0, + "step": 15440 + }, + { + "epoch": 1.9642539117160667, + "ewc_loss": 0.0788700059056282, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00043713755439966917, + "grad_norm": 9.054341316223145, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8731878399848938, + "num_tokens": 588949833.0, + "step": 15441 + }, + { + "epoch": 1.9643811219946572, + "ewc_loss": 0.0723826065659523, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00037226357380859554, + "grad_norm": 8.57048511505127, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8670485615730286, + "num_tokens": 588985678.0, + "step": 15442 + }, + { + "epoch": 1.9645083322732477, + "ewc_loss": 0.07155527174472809, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003664316318463534, + "grad_norm": 8.464272499084473, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8617532849311829, + "num_tokens": 589018276.0, + "step": 15443 + }, + { + "epoch": 1.964635542551838, + "ewc_loss": 0.07346102595329285, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000383047794457525, + "grad_norm": 8.656227111816406, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8620655536651611, + "num_tokens": 589061900.0, + "step": 15444 + }, + { + "epoch": 1.9647627528304286, + "ewc_loss": 0.07151784002780914, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003636159235611558, + "grad_norm": 8.383235931396484, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8658734560012817, + "num_tokens": 589102740.0, + "step": 15445 + }, + { + "epoch": 1.964889963109019, + "ewc_loss": 0.07237769663333893, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00037221447564661503, + "grad_norm": 8.542261123657227, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.873217761516571, + "num_tokens": 589135276.0, + "step": 15446 + }, + { + "epoch": 1.9650171733876096, + "ewc_loss": 0.07158252596855164, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003642628144007176, + "grad_norm": 8.440323829650879, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8635687828063965, + "num_tokens": 589173390.0, + "step": 15447 + }, + { + "epoch": 1.9651443836662001, + "ewc_loss": 0.07193755358457565, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003678130451589823, + "grad_norm": 8.474089622497559, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.856957197189331, + "num_tokens": 589214592.0, + "step": 15448 + }, + { + "epoch": 1.9652715939447907, + "ewc_loss": 0.07139308750629425, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000362368329660967, + "grad_norm": 8.376072883605957, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8643895387649536, + "num_tokens": 589250682.0, + "step": 15449 + }, + { + "epoch": 1.9653988042233812, + "ewc_loss": 0.07159806787967682, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003644182288553566, + "grad_norm": 8.471256256103516, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8583962917327881, + "num_tokens": 589288326.0, + "step": 15450 + }, + { + "epoch": 1.9655260145019717, + "ewc_loss": 0.07138137519359589, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036225130315870047, + "grad_norm": 8.402430534362793, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8610425591468811, + "num_tokens": 589321632.0, + "step": 15451 + }, + { + "epoch": 1.9656532247805623, + "ewc_loss": 0.07140330970287323, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036247060052119195, + "grad_norm": 8.378072738647461, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8530271053314209, + "num_tokens": 589361134.0, + "step": 15452 + }, + { + "epoch": 1.9657804350591528, + "ewc_loss": 0.07139334827661514, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036237097810953856, + "grad_norm": 8.358414649963379, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8670008182525635, + "num_tokens": 589402338.0, + "step": 15453 + }, + { + "epoch": 1.9659076453377433, + "ewc_loss": 0.07124355435371399, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036087303305976093, + "grad_norm": 8.354911804199219, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8571114540100098, + "num_tokens": 589441080.0, + "step": 15454 + }, + { + "epoch": 1.9660348556163338, + "ewc_loss": 0.07122670114040375, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003607045509852469, + "grad_norm": 8.314617156982422, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8614068627357483, + "num_tokens": 589484019.0, + "step": 15455 + }, + { + "epoch": 1.9661620658949244, + "ewc_loss": 0.07131160795688629, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036155356792733073, + "grad_norm": 8.329672813415527, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8593194484710693, + "num_tokens": 589525890.0, + "step": 15456 + }, + { + "epoch": 1.966289276173515, + "ewc_loss": 0.07125155627727509, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003609530394896865, + "grad_norm": 8.356063842773438, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8702045679092407, + "num_tokens": 589560644.0, + "step": 15457 + }, + { + "epoch": 1.9664164864521054, + "ewc_loss": 0.07123039662837982, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036074151284992695, + "grad_norm": 8.301641464233398, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8839815855026245, + "num_tokens": 589600863.0, + "step": 15458 + }, + { + "epoch": 1.966543696730696, + "ewc_loss": 0.07127663493156433, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003612037980929017, + "grad_norm": 8.325069427490234, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8625234365463257, + "num_tokens": 589637948.0, + "step": 15459 + }, + { + "epoch": 1.9666709070092865, + "ewc_loss": 0.07123644649982452, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036080199060961604, + "grad_norm": 8.324951171875, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8777586221694946, + "num_tokens": 589678061.0, + "step": 15460 + }, + { + "epoch": 1.966798117287877, + "ewc_loss": 0.07121817022562027, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003606191894505173, + "grad_norm": 15.597458839416504, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8529871702194214, + "num_tokens": 589718050.0, + "step": 15461 + }, + { + "epoch": 1.9669253275664673, + "ewc_loss": 0.08303980529308319, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0004788355145137757, + "grad_norm": 9.636094093322754, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8667019009590149, + "num_tokens": 589759206.0, + "step": 15462 + }, + { + "epoch": 1.9670525378450578, + "ewc_loss": 0.07043199241161346, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003527574590407312, + "grad_norm": 8.208714485168457, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8520240783691406, + "num_tokens": 589800218.0, + "step": 15463 + }, + { + "epoch": 1.9671797481236484, + "ewc_loss": 0.07322008162736893, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003806383174378425, + "grad_norm": 8.730598449707031, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8644468784332275, + "num_tokens": 589843105.0, + "step": 15464 + }, + { + "epoch": 1.967306958402239, + "ewc_loss": 0.07220672070980072, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003705046547111124, + "grad_norm": 8.344082832336426, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8554404973983765, + "num_tokens": 589881560.0, + "step": 15465 + }, + { + "epoch": 1.9674341686808294, + "ewc_loss": 0.0726325660943985, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000374763214495033, + "grad_norm": 8.65842056274414, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8523288369178772, + "num_tokens": 589922755.0, + "step": 15466 + }, + { + "epoch": 1.96756137895942, + "ewc_loss": 0.07156869769096375, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003641245129983872, + "grad_norm": 8.325677871704102, + "learning_rate": 1e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8450589776039124, + "num_tokens": 589964982.0, + "step": 15467 + }, + { + "epoch": 1.9676885892380103, + "ewc_loss": 0.07252642512321472, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00037370173959061503, + "grad_norm": 8.554402351379395, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8501888513565063, + "num_tokens": 590003371.0, + "step": 15468 + }, + { + "epoch": 1.9678157995166008, + "ewc_loss": 0.0714922696352005, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000363360159099102, + "grad_norm": 8.354902267456055, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8608484864234924, + "num_tokens": 590048361.0, + "step": 15469 + }, + { + "epoch": 1.9679430097951913, + "ewc_loss": 0.07200246304273605, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003709035227075219, + "grad_norm": 8.586771011352539, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8610358238220215, + "num_tokens": 590083141.0, + "step": 15470 + }, + { + "epoch": 1.9680702200737819, + "ewc_loss": 0.07101739943027496, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003610528656281531, + "grad_norm": 8.23580551147461, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8586310744285583, + "num_tokens": 590128161.0, + "step": 15471 + }, + { + "epoch": 1.9681974303523724, + "ewc_loss": 0.07216206192970276, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00037249954766593874, + "grad_norm": 8.630498886108398, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8796600103378296, + "num_tokens": 590166101.0, + "step": 15472 + }, + { + "epoch": 1.968324640630963, + "ewc_loss": 0.07079055905342102, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00035878451308235526, + "grad_norm": 8.223756790161133, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8619997501373291, + "num_tokens": 590207643.0, + "step": 15473 + }, + { + "epoch": 1.9684518509095534, + "ewc_loss": 0.0724383145570755, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00037282067933119833, + "grad_norm": 8.600187301635742, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8561611175537109, + "num_tokens": 590247567.0, + "step": 15474 + }, + { + "epoch": 1.968579061188144, + "ewc_loss": 0.07092900574207306, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036016901140101254, + "grad_norm": 8.28890323638916, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8593875169754028, + "num_tokens": 590287940.0, + "step": 15475 + }, + { + "epoch": 1.9687062714667345, + "ewc_loss": 0.07189375162124634, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036981640732847154, + "grad_norm": 8.552754402160645, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8571489453315735, + "num_tokens": 590322558.0, + "step": 15476 + }, + { + "epoch": 1.968833481745325, + "ewc_loss": 0.0707944929599762, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003588238905649632, + "grad_norm": 8.237207412719727, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8629235029220581, + "num_tokens": 590362913.0, + "step": 15477 + }, + { + "epoch": 1.9689606920239155, + "ewc_loss": 0.07199528068304062, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00037083172355778515, + "grad_norm": 8.53992748260498, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8545714616775513, + "num_tokens": 590405877.0, + "step": 15478 + }, + { + "epoch": 1.969087902302506, + "ewc_loss": 0.07091726362705231, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036005154834128916, + "grad_norm": 8.291065216064453, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8535621762275696, + "num_tokens": 590445373.0, + "step": 15479 + }, + { + "epoch": 1.9692151125810966, + "ewc_loss": 0.07232477515935898, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003692438476718962, + "grad_norm": 8.492015838623047, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8531915545463562, + "num_tokens": 590483319.0, + "step": 15480 + }, + { + "epoch": 1.9693423228596871, + "ewc_loss": 0.07113388180732727, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036221774644218385, + "grad_norm": 8.345407485961914, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8500885367393494, + "num_tokens": 590522433.0, + "step": 15481 + }, + { + "epoch": 1.9694695331382777, + "ewc_loss": 0.071904256939888, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003674800682347268, + "grad_norm": 8.48281192779541, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8670434951782227, + "num_tokens": 590564342.0, + "step": 15482 + }, + { + "epoch": 1.9695967434168682, + "ewc_loss": 0.07134562730789185, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036189373349770904, + "grad_norm": 8.344436645507812, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8648595213890076, + "num_tokens": 590594527.0, + "step": 15483 + }, + { + "epoch": 1.9697239536954587, + "ewc_loss": 0.07195284962654114, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036796595668420196, + "grad_norm": 8.472370147705078, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8747193813323975, + "num_tokens": 590630573.0, + "step": 15484 + }, + { + "epoch": 1.9698511639740492, + "ewc_loss": 0.07125353813171387, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000360972888302058, + "grad_norm": 8.35352611541748, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8645781874656677, + "num_tokens": 590669333.0, + "step": 15485 + }, + { + "epoch": 1.9699783742526398, + "ewc_loss": 0.07157567143440247, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036419424577616155, + "grad_norm": 8.339423179626465, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8793163299560547, + "num_tokens": 590709884.0, + "step": 15486 + }, + { + "epoch": 1.97010558453123, + "ewc_loss": 0.07158105075359344, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003642479714471847, + "grad_norm": 8.376935005187988, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8697324395179749, + "num_tokens": 590746109.0, + "step": 15487 + }, + { + "epoch": 1.9702327948098206, + "ewc_loss": 0.07133863121271133, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036182382609695196, + "grad_norm": 8.364310264587402, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.846959114074707, + "num_tokens": 590787881.0, + "step": 15488 + }, + { + "epoch": 1.9703600050884111, + "ewc_loss": 0.07145547866821289, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036299231578595936, + "grad_norm": 8.37266731262207, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8488016724586487, + "num_tokens": 590826861.0, + "step": 15489 + }, + { + "epoch": 1.9704872153670017, + "ewc_loss": 0.07143564522266388, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036279394407756627, + "grad_norm": 8.401175498962402, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8706672787666321, + "num_tokens": 590863104.0, + "step": 15490 + }, + { + "epoch": 1.9706144256455922, + "ewc_loss": 0.07136868685483932, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036212438135407865, + "grad_norm": 8.351249694824219, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8703241348266602, + "num_tokens": 590900582.0, + "step": 15491 + }, + { + "epoch": 1.9707416359241827, + "ewc_loss": 0.07135851681232452, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003620227216742933, + "grad_norm": 8.307608604431152, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8702808618545532, + "num_tokens": 590937735.0, + "step": 15492 + }, + { + "epoch": 1.970868846202773, + "ewc_loss": 0.07133719325065613, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036180944880470634, + "grad_norm": 8.334728240966797, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8785411715507507, + "num_tokens": 590973089.0, + "step": 15493 + }, + { + "epoch": 1.9709960564813636, + "ewc_loss": 0.07127804309129715, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003612179134506732, + "grad_norm": 8.349087715148926, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8495480418205261, + "num_tokens": 591012117.0, + "step": 15494 + }, + { + "epoch": 1.971123266759954, + "ewc_loss": 0.07131731510162354, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003616106405388564, + "grad_norm": 8.354545593261719, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.844639003276825, + "num_tokens": 591044978.0, + "step": 15495 + }, + { + "epoch": 1.9712504770385446, + "ewc_loss": 0.0712602287530899, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036103976890444756, + "grad_norm": 8.317476272583008, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8594039082527161, + "num_tokens": 591085562.0, + "step": 15496 + }, + { + "epoch": 1.9713776873171351, + "ewc_loss": 0.07151861488819122, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036362369428388774, + "grad_norm": 8.439715385437012, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8537905216217041, + "num_tokens": 591120140.0, + "step": 15497 + }, + { + "epoch": 1.9715048975957257, + "ewc_loss": 0.07112405449151993, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035967802978120744, + "grad_norm": 8.311121940612793, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8532691597938538, + "num_tokens": 591153952.0, + "step": 15498 + }, + { + "epoch": 1.9716321078743162, + "ewc_loss": 0.07146292924880981, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003630667633842677, + "grad_norm": 8.329598426818848, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.880297064781189, + "num_tokens": 591192983.0, + "step": 15499 + }, + { + "epoch": 1.9717593181529067, + "ewc_loss": 0.07122409343719482, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003606783866416663, + "grad_norm": 8.37863540649414, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8639932870864868, + "num_tokens": 591226260.0, + "step": 15500 + }, + { + "epoch": 1.9718865284314973, + "ewc_loss": 0.07127489149570465, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003611863940022886, + "grad_norm": 8.370028495788574, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8490673303604126, + "num_tokens": 591259361.0, + "step": 15501 + }, + { + "epoch": 1.9720137387100878, + "ewc_loss": 0.07114028930664062, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003598403709474951, + "grad_norm": 8.36383056640625, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8621083498001099, + "num_tokens": 591299753.0, + "step": 15502 + }, + { + "epoch": 1.9721409489886783, + "ewc_loss": 0.07109663635492325, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000359403871698305, + "grad_norm": 8.304923057556152, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8650976419448853, + "num_tokens": 591335008.0, + "step": 15503 + }, + { + "epoch": 1.9722681592672688, + "ewc_loss": 0.071042999625206, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035886751720681787, + "grad_norm": 8.3421630859375, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8701098561286926, + "num_tokens": 591373001.0, + "step": 15504 + }, + { + "epoch": 1.9723953695458594, + "ewc_loss": 0.07099861651659012, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003584236837923527, + "grad_norm": 8.329705238342285, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8664030432701111, + "num_tokens": 591408637.0, + "step": 15505 + }, + { + "epoch": 1.97252257982445, + "ewc_loss": 0.07105402648448944, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035897776251658797, + "grad_norm": 8.36872673034668, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8591979742050171, + "num_tokens": 591444908.0, + "step": 15506 + }, + { + "epoch": 1.9726497901030404, + "ewc_loss": 0.07089361548423767, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035737367579713464, + "grad_norm": 8.30252456665039, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8648223876953125, + "num_tokens": 591484641.0, + "step": 15507 + }, + { + "epoch": 1.972777000381631, + "ewc_loss": 0.07119548320770264, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003603923541959375, + "grad_norm": 8.373809814453125, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8594987392425537, + "num_tokens": 591521859.0, + "step": 15508 + }, + { + "epoch": 1.9729042106602215, + "ewc_loss": 0.07092802226543427, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003577177121769637, + "grad_norm": 8.384011268615723, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8629072904586792, + "num_tokens": 591559124.0, + "step": 15509 + }, + { + "epoch": 1.973031420938812, + "ewc_loss": 0.07100987434387207, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003585361992008984, + "grad_norm": 8.326372146606445, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8616688251495361, + "num_tokens": 591597170.0, + "step": 15510 + }, + { + "epoch": 1.9731586312174023, + "ewc_loss": 0.07111327350139618, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035957020008936524, + "grad_norm": 8.35139274597168, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8687682747840881, + "num_tokens": 591634784.0, + "step": 15511 + }, + { + "epoch": 1.9732858414959928, + "ewc_loss": 0.07090708613395691, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035750839742831886, + "grad_norm": 8.346658706665039, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8674982786178589, + "num_tokens": 591671904.0, + "step": 15512 + }, + { + "epoch": 1.9734130517745834, + "ewc_loss": 0.071022629737854, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035866376128979027, + "grad_norm": 8.34221076965332, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8652663230895996, + "num_tokens": 591712990.0, + "step": 15513 + }, + { + "epoch": 1.973540262053174, + "ewc_loss": 0.0709790289402008, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035822775680571795, + "grad_norm": 8.345226287841797, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8605250120162964, + "num_tokens": 591748579.0, + "step": 15514 + }, + { + "epoch": 1.9736674723317644, + "ewc_loss": 0.07087073475122452, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035714483237825334, + "grad_norm": 8.344231605529785, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8562167286872864, + "num_tokens": 591785865.0, + "step": 15515 + }, + { + "epoch": 1.973794682610355, + "ewc_loss": 0.07094405591487885, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035787810338661075, + "grad_norm": 8.333372116088867, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8717395067214966, + "num_tokens": 591820087.0, + "step": 15516 + }, + { + "epoch": 1.9739218928889453, + "ewc_loss": 0.07083961367607117, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003568336251191795, + "grad_norm": 8.312333106994629, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8635962009429932, + "num_tokens": 591859713.0, + "step": 15517 + }, + { + "epoch": 1.9740491031675358, + "ewc_loss": 0.07102663815021515, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003587038372643292, + "grad_norm": 8.397672653198242, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8709186315536499, + "num_tokens": 591893142.0, + "step": 15518 + }, + { + "epoch": 1.9741763134461263, + "ewc_loss": 0.0707077607512474, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003555151051841676, + "grad_norm": 8.281941413879395, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8586694002151489, + "num_tokens": 591931529.0, + "step": 15519 + }, + { + "epoch": 1.9743035237247168, + "ewc_loss": 0.0710628554224968, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003590660635381937, + "grad_norm": 8.363105773925781, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8670247197151184, + "num_tokens": 591976248.0, + "step": 15520 + }, + { + "epoch": 1.9744307340033074, + "ewc_loss": 0.0708041861653328, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035647934419102967, + "grad_norm": 8.317052841186523, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8612073659896851, + "num_tokens": 592018008.0, + "step": 15521 + }, + { + "epoch": 1.974557944281898, + "ewc_loss": 0.07089100778102875, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003573475987650454, + "grad_norm": 8.359904289245605, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8489029407501221, + "num_tokens": 592062934.0, + "step": 15522 + }, + { + "epoch": 1.9746851545604884, + "ewc_loss": 0.07088980078697205, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035733546246774495, + "grad_norm": 8.349958419799805, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8620101809501648, + "num_tokens": 592103129.0, + "step": 15523 + }, + { + "epoch": 1.974812364839079, + "ewc_loss": 0.07079172134399414, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003563546924851835, + "grad_norm": 8.327610969543457, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8597227931022644, + "num_tokens": 592142116.0, + "step": 15524 + }, + { + "epoch": 1.9749395751176695, + "ewc_loss": 0.07096093893051147, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003580469056032598, + "grad_norm": 8.321423530578613, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8648277521133423, + "num_tokens": 592182507.0, + "step": 15525 + }, + { + "epoch": 1.97506678539626, + "ewc_loss": 0.07099997997283936, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003584372461773455, + "grad_norm": 8.352645874023438, + "learning_rate": 1e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8418123722076416, + "num_tokens": 592225593.0, + "step": 15526 + }, + { + "epoch": 1.9751939956748505, + "ewc_loss": 0.07083998620510101, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003568373213056475, + "grad_norm": 8.3458833694458, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8879094123840332, + "num_tokens": 592270676.0, + "step": 15527 + }, + { + "epoch": 1.975321205953441, + "ewc_loss": 0.07089580595493317, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035739559098146856, + "grad_norm": 8.40064811706543, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8829377889633179, + "num_tokens": 592305537.0, + "step": 15528 + }, + { + "epoch": 1.9754484162320316, + "ewc_loss": 0.07081129401922226, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035655044484883547, + "grad_norm": 8.320025444030762, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8689267039299011, + "num_tokens": 592338289.0, + "step": 15529 + }, + { + "epoch": 1.9755756265106221, + "ewc_loss": 0.07096990942955017, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003581366327125579, + "grad_norm": 8.357516288757324, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8661753535270691, + "num_tokens": 592379189.0, + "step": 15530 + }, + { + "epoch": 1.9757028367892127, + "ewc_loss": 0.070872962474823, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035716709680855274, + "grad_norm": 8.326765060424805, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8778215646743774, + "num_tokens": 592421875.0, + "step": 15531 + }, + { + "epoch": 1.9758300470678032, + "ewc_loss": 0.07104086875915527, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003588461840990931, + "grad_norm": 8.410531997680664, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8446172475814819, + "num_tokens": 592457438.0, + "step": 15532 + }, + { + "epoch": 1.9759572573463937, + "ewc_loss": 0.07075703144073486, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003560078330338001, + "grad_norm": 8.27560043334961, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.87384432554245, + "num_tokens": 592493230.0, + "step": 15533 + }, + { + "epoch": 1.9760844676249842, + "ewc_loss": 0.0711212158203125, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003596496826503426, + "grad_norm": 8.407394409179688, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8474265336990356, + "num_tokens": 592533656.0, + "step": 15534 + }, + { + "epoch": 1.9762116779035748, + "ewc_loss": 0.07073292136192322, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000355766766006127, + "grad_norm": 8.313472747802734, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8759651184082031, + "num_tokens": 592571622.0, + "step": 15535 + }, + { + "epoch": 1.976338888182165, + "ewc_loss": 0.07113893330097198, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035982680856250226, + "grad_norm": 8.40445613861084, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.858141303062439, + "num_tokens": 592613260.0, + "step": 15536 + }, + { + "epoch": 1.9764660984607556, + "ewc_loss": 0.07082676142454147, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035670510260388255, + "grad_norm": 8.297755241394043, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8615046143531799, + "num_tokens": 592648872.0, + "step": 15537 + }, + { + "epoch": 1.9765933087393461, + "ewc_loss": 0.0712088942527771, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003605264355428517, + "grad_norm": 8.42686939239502, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8649346828460693, + "num_tokens": 592682336.0, + "step": 15538 + }, + { + "epoch": 1.9767205190179367, + "ewc_loss": 0.07081407308578491, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035657818079926074, + "grad_norm": 8.285375595092773, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8771418333053589, + "num_tokens": 592716412.0, + "step": 15539 + }, + { + "epoch": 1.9768477292965272, + "ewc_loss": 0.07129518687725067, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003613893350120634, + "grad_norm": 8.38585090637207, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8647956252098083, + "num_tokens": 592753936.0, + "step": 15540 + }, + { + "epoch": 1.9769749395751177, + "ewc_loss": 0.07088211178779602, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035725862835533917, + "grad_norm": 8.290390968322754, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8628225326538086, + "num_tokens": 592792731.0, + "step": 15541 + }, + { + "epoch": 1.977102149853708, + "ewc_loss": 0.07128739356994629, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036131139495410025, + "grad_norm": 8.336758613586426, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8664512038230896, + "num_tokens": 592832035.0, + "step": 15542 + }, + { + "epoch": 1.9772293601322986, + "ewc_loss": 0.07108893990516663, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003593269211705774, + "grad_norm": 8.341327667236328, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8812674880027771, + "num_tokens": 592869693.0, + "step": 15543 + }, + { + "epoch": 1.977356570410889, + "ewc_loss": 0.0711330771446228, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035976830986328423, + "grad_norm": 8.292867660522461, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8666395545005798, + "num_tokens": 592909648.0, + "step": 15544 + }, + { + "epoch": 1.9774837806894796, + "ewc_loss": 0.07130607217550278, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000361498212441802, + "grad_norm": 8.395153999328613, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8661760687828064, + "num_tokens": 592942539.0, + "step": 15545 + }, + { + "epoch": 1.9776109909680701, + "ewc_loss": 0.07100573182106018, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003584947844501585, + "grad_norm": 8.290451049804688, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.862375020980835, + "num_tokens": 592983680.0, + "step": 15546 + }, + { + "epoch": 1.9777382012466607, + "ewc_loss": 0.07142221927642822, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003626597172114998, + "grad_norm": 8.420393943786621, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8529125452041626, + "num_tokens": 593023161.0, + "step": 15547 + }, + { + "epoch": 1.9778654115252512, + "ewc_loss": 0.07082167267799377, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003566542000044137, + "grad_norm": 8.253000259399414, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8453137874603271, + "num_tokens": 593066612.0, + "step": 15548 + }, + { + "epoch": 1.9779926218038417, + "ewc_loss": 0.07142794132232666, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036271687713451684, + "grad_norm": 8.444588661193848, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8502517342567444, + "num_tokens": 593105224.0, + "step": 15549 + }, + { + "epoch": 1.9781198320824323, + "ewc_loss": 0.07094945013523102, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035793197457678616, + "grad_norm": 8.300429344177246, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8633490204811096, + "num_tokens": 593143683.0, + "step": 15550 + }, + { + "epoch": 1.9782470423610228, + "ewc_loss": 0.07129871100187302, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036142460885457695, + "grad_norm": 8.400918006896973, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8489817380905151, + "num_tokens": 593179827.0, + "step": 15551 + }, + { + "epoch": 1.9783742526396133, + "ewc_loss": 0.07109281420707703, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003593656874727458, + "grad_norm": 14.075481414794922, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8651820421218872, + "num_tokens": 593220936.0, + "step": 15552 + }, + { + "epoch": 1.9785014629182038, + "ewc_loss": 0.07925538718700409, + "ewc_loss_diag": 3.5762786865234375e-05, + "ewc_loss_parallel": 0.00043610858847387135, + "grad_norm": 9.264546394348145, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8597298860549927, + "num_tokens": 593258503.0, + "step": 15553 + }, + { + "epoch": 1.9786286731967944, + "ewc_loss": 0.07150924205780029, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036352992174215615, + "grad_norm": 8.56118392944336, + "learning_rate": 1e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8435683250427246, + "num_tokens": 593293738.0, + "step": 15554 + }, + { + "epoch": 1.978755883475385, + "ewc_loss": 0.0718383714556694, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036682121572084725, + "grad_norm": 8.508535385131836, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8544654846191406, + "num_tokens": 593338037.0, + "step": 15555 + }, + { + "epoch": 1.9788830937539754, + "ewc_loss": 0.07253792881965637, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00037381681613624096, + "grad_norm": 8.579633712768555, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8491203784942627, + "num_tokens": 593383509.0, + "step": 15556 + }, + { + "epoch": 1.979010304032566, + "ewc_loss": 0.07154570519924164, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036389450542628765, + "grad_norm": 8.468589782714844, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8662322759628296, + "num_tokens": 593417734.0, + "step": 15557 + }, + { + "epoch": 1.9791375143111565, + "ewc_loss": 0.0717504471540451, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003659419307950884, + "grad_norm": 8.521559715270996, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8745733499526978, + "num_tokens": 593454502.0, + "step": 15558 + }, + { + "epoch": 1.979264724589747, + "ewc_loss": 0.07139469683170319, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036238451139070094, + "grad_norm": 8.450630187988281, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8590037822723389, + "num_tokens": 593498864.0, + "step": 15559 + }, + { + "epoch": 1.9793919348683373, + "ewc_loss": 0.07160401344299316, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036447762977331877, + "grad_norm": 8.457003593444824, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8611117005348206, + "num_tokens": 593540125.0, + "step": 15560 + }, + { + "epoch": 1.9795191451469278, + "ewc_loss": 0.0712280124425888, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003607176768127829, + "grad_norm": 8.498177528381348, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.871141791343689, + "num_tokens": 593566937.0, + "step": 15561 + }, + { + "epoch": 1.9796463554255184, + "ewc_loss": 0.0713248997926712, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003616865142248571, + "grad_norm": 8.432392120361328, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8583850264549255, + "num_tokens": 593600109.0, + "step": 15562 + }, + { + "epoch": 1.979773565704109, + "ewc_loss": 0.07132109999656677, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036164853372611105, + "grad_norm": 8.461760520935059, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.862518310546875, + "num_tokens": 593638982.0, + "step": 15563 + }, + { + "epoch": 1.9799007759826994, + "ewc_loss": 0.07092253863811493, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035766285145655274, + "grad_norm": 8.349784851074219, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8523997068405151, + "num_tokens": 593677270.0, + "step": 15564 + }, + { + "epoch": 1.98002798626129, + "ewc_loss": 0.0712498277425766, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036093578091822565, + "grad_norm": 8.413095474243164, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8717048764228821, + "num_tokens": 593714056.0, + "step": 15565 + }, + { + "epoch": 1.9801551965398803, + "ewc_loss": 0.07095295190811157, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035796701558865607, + "grad_norm": 8.364036560058594, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8508654832839966, + "num_tokens": 593753516.0, + "step": 15566 + }, + { + "epoch": 1.9802824068184708, + "ewc_loss": 0.07136087119579315, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003620462666731328, + "grad_norm": 8.44990348815918, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8639563322067261, + "num_tokens": 593789276.0, + "step": 15567 + }, + { + "epoch": 1.9804096170970613, + "ewc_loss": 0.07076114416122437, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035604892764240503, + "grad_norm": 8.357345581054688, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8662482500076294, + "num_tokens": 593828112.0, + "step": 15568 + }, + { + "epoch": 1.9805368273756518, + "ewc_loss": 0.07121887803077698, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003606262616813183, + "grad_norm": 8.415432929992676, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8514142632484436, + "num_tokens": 593866370.0, + "step": 15569 + }, + { + "epoch": 1.9806640376542424, + "ewc_loss": 0.07082005590200424, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003566380764823407, + "grad_norm": 8.259334564208984, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8681049346923828, + "num_tokens": 593907863.0, + "step": 15570 + }, + { + "epoch": 1.980791247932833, + "ewc_loss": 0.07130681723356247, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036150566302239895, + "grad_norm": 8.386106491088867, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8749088644981384, + "num_tokens": 593945254.0, + "step": 15571 + }, + { + "epoch": 1.9809184582114234, + "ewc_loss": 0.07089964300394058, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003574339207261801, + "grad_norm": 8.322061538696289, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8696877956390381, + "num_tokens": 593987105.0, + "step": 15572 + }, + { + "epoch": 1.981045668490014, + "ewc_loss": 0.07130874693393707, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003615249879658222, + "grad_norm": 8.370716094970703, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8745575547218323, + "num_tokens": 594024091.0, + "step": 15573 + }, + { + "epoch": 1.9811728787686045, + "ewc_loss": 0.07101301848888397, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003585676313377917, + "grad_norm": 8.359886169433594, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8741493225097656, + "num_tokens": 594056018.0, + "step": 15574 + }, + { + "epoch": 1.981300089047195, + "ewc_loss": 0.07118581235408783, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036029567127116024, + "grad_norm": 8.298744201660156, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8668462038040161, + "num_tokens": 594102297.0, + "step": 15575 + }, + { + "epoch": 1.9814272993257855, + "ewc_loss": 0.0711851567029953, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003602890355978161, + "grad_norm": 8.306124687194824, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8645642399787903, + "num_tokens": 594140278.0, + "step": 15576 + }, + { + "epoch": 1.981554509604376, + "ewc_loss": 0.0711568146944046, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003600056516006589, + "grad_norm": 8.40699577331543, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8687887787818909, + "num_tokens": 594175025.0, + "step": 15577 + }, + { + "epoch": 1.9816817198829666, + "ewc_loss": 0.0710906982421875, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035934452898800373, + "grad_norm": 8.306218147277832, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8632652759552002, + "num_tokens": 594217906.0, + "step": 15578 + }, + { + "epoch": 1.9818089301615571, + "ewc_loss": 0.07133612036705017, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003617987094912678, + "grad_norm": 8.380231857299805, + "learning_rate": 1e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8457989692687988, + "num_tokens": 594264331.0, + "step": 15579 + }, + { + "epoch": 1.9819361404401477, + "ewc_loss": 0.0709594190120697, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003580317134037614, + "grad_norm": 8.342185020446777, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8793012499809265, + "num_tokens": 594300220.0, + "step": 15580 + }, + { + "epoch": 1.9820633507187382, + "ewc_loss": 0.07130111753940582, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036144867772236466, + "grad_norm": 8.346781730651855, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8694018721580505, + "num_tokens": 594337849.0, + "step": 15581 + }, + { + "epoch": 1.9821905609973287, + "ewc_loss": 0.07105955481529236, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003590330888982862, + "grad_norm": 8.297855377197266, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8564465641975403, + "num_tokens": 594379295.0, + "step": 15582 + }, + { + "epoch": 1.9823177712759192, + "ewc_loss": 0.07124941051006317, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003609315899666399, + "grad_norm": 8.452352523803711, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8590220212936401, + "num_tokens": 594408350.0, + "step": 15583 + }, + { + "epoch": 1.9824449815545098, + "ewc_loss": 0.07094317674636841, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003578692558221519, + "grad_norm": 8.279808044433594, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8537687063217163, + "num_tokens": 594455110.0, + "step": 15584 + }, + { + "epoch": 1.9825721918331, + "ewc_loss": 0.07144837081432343, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036292121512815356, + "grad_norm": 8.42098617553711, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8675298094749451, + "num_tokens": 594499231.0, + "step": 15585 + }, + { + "epoch": 1.9826994021116906, + "ewc_loss": 0.07081540673971176, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035659156856127083, + "grad_norm": 8.292587280273438, + "learning_rate": 1e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8451504707336426, + "num_tokens": 594541478.0, + "step": 15586 + }, + { + "epoch": 1.9828266123902811, + "ewc_loss": 0.07115744799375534, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.00036245337105356157, + "grad_norm": 8.417705535888672, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8660173416137695, + "num_tokens": 594579075.0, + "step": 15587 + }, + { + "epoch": 1.9829538226688717, + "ewc_loss": 0.07060270011425018, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.000356905919034034, + "grad_norm": 8.291650772094727, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8797824382781982, + "num_tokens": 594616189.0, + "step": 15588 + }, + { + "epoch": 1.9830810329474622, + "ewc_loss": 0.07142043113708496, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003626418474595994, + "grad_norm": 8.397049903869629, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8708726167678833, + "num_tokens": 594656633.0, + "step": 15589 + }, + { + "epoch": 1.9832082432260527, + "ewc_loss": 0.07095227390527725, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035796023439615965, + "grad_norm": 8.314742088317871, + "learning_rate": 1e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8451090455055237, + "num_tokens": 594696833.0, + "step": 15590 + }, + { + "epoch": 1.983335453504643, + "ewc_loss": 0.07085837423801422, + "ewc_loss_diag": 3.4809112548828125e-05, + "ewc_loss_parallel": 0.0003594626614358276, + "grad_norm": 8.386605262756348, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8513860702514648, + "num_tokens": 594735138.0, + "step": 15591 + }, + { + "epoch": 1.9834626637832335, + "ewc_loss": 0.0710410624742508, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003588481340557337, + "grad_norm": 8.346721649169922, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.857874870300293, + "num_tokens": 594769587.0, + "step": 15592 + }, + { + "epoch": 1.983589874061824, + "ewc_loss": 0.07117290049791336, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036016650847159326, + "grad_norm": 8.32667064666748, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8690991997718811, + "num_tokens": 594803635.0, + "step": 15593 + }, + { + "epoch": 1.9837170843404146, + "ewc_loss": 0.07115399837493896, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035997750819660723, + "grad_norm": 8.352704048156738, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8796715140342712, + "num_tokens": 594839872.0, + "step": 15594 + }, + { + "epoch": 1.9838442946190051, + "ewc_loss": 0.07127797603607178, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036121721495874226, + "grad_norm": 8.337726593017578, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8521056175231934, + "num_tokens": 594879208.0, + "step": 15595 + }, + { + "epoch": 1.9839715048975957, + "ewc_loss": 0.07134070992469788, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036184454802423716, + "grad_norm": 8.38017749786377, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8705211877822876, + "num_tokens": 594910377.0, + "step": 15596 + }, + { + "epoch": 1.9840987151761862, + "ewc_loss": 0.07120411098003387, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036047855974175036, + "grad_norm": 8.267244338989258, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8662359714508057, + "num_tokens": 594956574.0, + "step": 15597 + }, + { + "epoch": 1.9842259254547767, + "ewc_loss": 0.07141785323619843, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003626160032581538, + "grad_norm": 8.38771915435791, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8491435050964355, + "num_tokens": 594998036.0, + "step": 15598 + }, + { + "epoch": 1.9843531357333672, + "ewc_loss": 0.07115726917982101, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036001019179821014, + "grad_norm": 8.346662521362305, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.85845947265625, + "num_tokens": 595031404.0, + "step": 15599 + }, + { + "epoch": 1.9844803460119578, + "ewc_loss": 0.07138100266456604, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003622475196607411, + "grad_norm": 8.384167671203613, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8722784519195557, + "num_tokens": 595069800.0, + "step": 15600 + }, + { + "epoch": 1.9846075562905483, + "ewc_loss": 0.07135923951864243, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036202988121658564, + "grad_norm": 8.37971019744873, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8696832060813904, + "num_tokens": 595110987.0, + "step": 15601 + }, + { + "epoch": 1.9847347665691388, + "ewc_loss": 0.07141366600990295, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036257418105378747, + "grad_norm": 8.394291877746582, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8678010702133179, + "num_tokens": 595149192.0, + "step": 15602 + }, + { + "epoch": 1.9848619768477294, + "ewc_loss": 0.0711490660905838, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003599281481001526, + "grad_norm": 8.391473770141602, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8603050112724304, + "num_tokens": 595182993.0, + "step": 15603 + }, + { + "epoch": 1.9849891871263199, + "ewc_loss": 0.07127824425697327, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036121992161497474, + "grad_norm": 8.423462867736816, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8609585165977478, + "num_tokens": 595216177.0, + "step": 15604 + }, + { + "epoch": 1.9851163974049104, + "ewc_loss": 0.07117243111133575, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003601618518587202, + "grad_norm": 8.389911651611328, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8642071485519409, + "num_tokens": 595253379.0, + "step": 15605 + }, + { + "epoch": 1.985243607683501, + "ewc_loss": 0.0710933580994606, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035937107168138027, + "grad_norm": 8.39426326751709, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.86809903383255, + "num_tokens": 595292878.0, + "step": 15606 + }, + { + "epoch": 1.9853708179620915, + "ewc_loss": 0.0710814893245697, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035925241536460817, + "grad_norm": 8.381354331970215, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8646783232688904, + "num_tokens": 595329082.0, + "step": 15607 + }, + { + "epoch": 1.985498028240682, + "ewc_loss": 0.07114938646554947, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003599313786253333, + "grad_norm": 8.495882034301758, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8552577495574951, + "num_tokens": 595367246.0, + "step": 15608 + }, + { + "epoch": 1.9856252385192723, + "ewc_loss": 0.07064434885978699, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035488102003000677, + "grad_norm": 8.256814956665039, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8705462217330933, + "num_tokens": 595404450.0, + "step": 15609 + }, + { + "epoch": 1.9857524487978628, + "ewc_loss": 0.07148608565330505, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036329840077087283, + "grad_norm": 8.523687362670898, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8765808939933777, + "num_tokens": 595439718.0, + "step": 15610 + }, + { + "epoch": 1.9858796590764534, + "ewc_loss": 0.07066838443279266, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003551213594619185, + "grad_norm": 8.309755325317383, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8667376041412354, + "num_tokens": 595473617.0, + "step": 15611 + }, + { + "epoch": 1.986006869355044, + "ewc_loss": 0.07144585251808167, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036289601121097803, + "grad_norm": 8.429676055908203, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8606041669845581, + "num_tokens": 595516432.0, + "step": 15612 + }, + { + "epoch": 1.9861340796336344, + "ewc_loss": 0.07090131938457489, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035745074274018407, + "grad_norm": 8.397554397583008, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8646849393844604, + "num_tokens": 595556278.0, + "step": 15613 + }, + { + "epoch": 1.986261289912225, + "ewc_loss": 0.07104755192995071, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035891300649382174, + "grad_norm": 8.355830192565918, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8694764375686646, + "num_tokens": 595598713.0, + "step": 15614 + }, + { + "epoch": 1.9863885001908153, + "ewc_loss": 0.07118724286556244, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003603099612519145, + "grad_norm": 8.51778793334961, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8670334815979004, + "num_tokens": 595630597.0, + "step": 15615 + }, + { + "epoch": 1.9865157104694058, + "ewc_loss": 0.07080289721488953, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003564664220903069, + "grad_norm": 8.268327713012695, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.882583737373352, + "num_tokens": 595664163.0, + "step": 15616 + }, + { + "epoch": 1.9866429207479963, + "ewc_loss": 0.07146283984184265, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036306591937318444, + "grad_norm": 8.471084594726562, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8735373020172119, + "num_tokens": 595698809.0, + "step": 15617 + }, + { + "epoch": 1.9867701310265868, + "ewc_loss": 0.07067763805389404, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035521388053894043, + "grad_norm": 8.308236122131348, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8588197827339172, + "num_tokens": 595741166.0, + "step": 15618 + }, + { + "epoch": 1.9868973413051774, + "ewc_loss": 0.07141374051570892, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003625749086495489, + "grad_norm": 8.430636405944824, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8581584692001343, + "num_tokens": 595787079.0, + "step": 15619 + }, + { + "epoch": 1.987024551583768, + "ewc_loss": 0.07089066505432129, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035734413540922105, + "grad_norm": 8.357938766479492, + "learning_rate": 1e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8396696448326111, + "num_tokens": 595829085.0, + "step": 15620 + }, + { + "epoch": 1.9871517618623584, + "ewc_loss": 0.0711594820022583, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003600323689170182, + "grad_norm": 8.390615463256836, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8624652624130249, + "num_tokens": 595870646.0, + "step": 15621 + }, + { + "epoch": 1.987278972140949, + "ewc_loss": 0.07099906355142593, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035842813667841256, + "grad_norm": 8.408535957336426, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8533824682235718, + "num_tokens": 595918473.0, + "step": 15622 + }, + { + "epoch": 1.9874061824195395, + "ewc_loss": 0.07109331339597702, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003593706351239234, + "grad_norm": 8.428545951843262, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8520983457565308, + "num_tokens": 595954097.0, + "step": 15623 + }, + { + "epoch": 1.98753339269813, + "ewc_loss": 0.07098343968391418, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035827187821269035, + "grad_norm": 8.385409355163574, + "learning_rate": 1e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8460322618484497, + "num_tokens": 595990191.0, + "step": 15624 + }, + { + "epoch": 1.9876606029767205, + "ewc_loss": 0.07122468203306198, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036068432382307947, + "grad_norm": 8.402817726135254, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8564038276672363, + "num_tokens": 596027109.0, + "step": 15625 + }, + { + "epoch": 1.987787813255311, + "ewc_loss": 0.07099796831607819, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003584171936381608, + "grad_norm": 8.392925262451172, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8720305562019348, + "num_tokens": 596067930.0, + "step": 15626 + }, + { + "epoch": 1.9879150235339016, + "ewc_loss": 0.07105988264083862, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003590363485272974, + "grad_norm": 8.346137046813965, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.874931275844574, + "num_tokens": 596105809.0, + "step": 15627 + }, + { + "epoch": 1.9880422338124921, + "ewc_loss": 0.07124283909797668, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003608659317251295, + "grad_norm": 8.445232391357422, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8681447505950928, + "num_tokens": 596144220.0, + "step": 15628 + }, + { + "epoch": 1.9881694440910826, + "ewc_loss": 0.07097762823104858, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003582137869670987, + "grad_norm": 8.407426834106445, + "learning_rate": 1e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8407543897628784, + "num_tokens": 596183825.0, + "step": 15629 + }, + { + "epoch": 1.9882966543696732, + "ewc_loss": 0.07120375335216522, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036047506728209555, + "grad_norm": 8.372580528259277, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8723940253257751, + "num_tokens": 596232333.0, + "step": 15630 + }, + { + "epoch": 1.9884238646482637, + "ewc_loss": 0.07114651799201965, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003599027113523334, + "grad_norm": 8.359321594238281, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8722113966941833, + "num_tokens": 596272138.0, + "step": 15631 + }, + { + "epoch": 1.9885510749268542, + "ewc_loss": 0.07117599248886108, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003601974749471992, + "grad_norm": 8.429750442504883, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8736313581466675, + "num_tokens": 596310254.0, + "step": 15632 + }, + { + "epoch": 1.9886782852054448, + "ewc_loss": 0.07104372978210449, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003588747640606016, + "grad_norm": 8.363216400146484, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8610106110572815, + "num_tokens": 596350396.0, + "step": 15633 + }, + { + "epoch": 1.988805495484035, + "ewc_loss": 0.07114254683256149, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035986295551992953, + "grad_norm": 8.428995132446289, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8677823543548584, + "num_tokens": 596388809.0, + "step": 15634 + }, + { + "epoch": 1.9889327057626256, + "ewc_loss": 0.07101918756961823, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035862933145835996, + "grad_norm": 8.306477546691895, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8707084059715271, + "num_tokens": 596427366.0, + "step": 15635 + }, + { + "epoch": 1.9890599160412161, + "ewc_loss": 0.07138349115848541, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003622724616434425, + "grad_norm": 8.465468406677246, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8559554815292358, + "num_tokens": 596469999.0, + "step": 15636 + }, + { + "epoch": 1.9891871263198067, + "ewc_loss": 0.07089371979236603, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035737475263886154, + "grad_norm": 8.329050064086914, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8488256931304932, + "num_tokens": 596506773.0, + "step": 15637 + }, + { + "epoch": 1.9893143365983972, + "ewc_loss": 0.07149460166692734, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003633835003711283, + "grad_norm": 8.423181533813477, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8668115139007568, + "num_tokens": 596547440.0, + "step": 15638 + }, + { + "epoch": 1.9894415468769877, + "ewc_loss": 0.07110665738582611, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035950407618656754, + "grad_norm": 8.346609115600586, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.870952308177948, + "num_tokens": 596580212.0, + "step": 15639 + }, + { + "epoch": 1.989568757155578, + "ewc_loss": 0.07135525345802307, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003619900089688599, + "grad_norm": 8.423012733459473, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8858765363693237, + "num_tokens": 596611790.0, + "step": 15640 + }, + { + "epoch": 1.9896959674341685, + "ewc_loss": 0.07103218138217926, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003587593382690102, + "grad_norm": 8.34305477142334, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8800150156021118, + "num_tokens": 596646984.0, + "step": 15641 + }, + { + "epoch": 1.989823177712759, + "ewc_loss": 0.071383997797966, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003622774383984506, + "grad_norm": 8.422675132751465, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8605594635009766, + "num_tokens": 596688362.0, + "step": 15642 + }, + { + "epoch": 1.9899503879913496, + "ewc_loss": 0.07113533467054367, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003597908653318882, + "grad_norm": 8.330410957336426, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8667789101600647, + "num_tokens": 596725298.0, + "step": 15643 + }, + { + "epoch": 1.9900775982699401, + "ewc_loss": 0.07143605500459671, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036279804771766067, + "grad_norm": 8.43868637084961, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8695785403251648, + "num_tokens": 596757400.0, + "step": 15644 + }, + { + "epoch": 1.9902048085485307, + "ewc_loss": 0.07110141217708588, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003594516310840845, + "grad_norm": 8.356019020080566, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8759826421737671, + "num_tokens": 596790533.0, + "step": 15645 + }, + { + "epoch": 1.9903320188271212, + "ewc_loss": 0.07149994373321533, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036343696410767734, + "grad_norm": 8.435881614685059, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8583964109420776, + "num_tokens": 596834467.0, + "step": 15646 + }, + { + "epoch": 1.9904592291057117, + "ewc_loss": 0.07114860415458679, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003599235787987709, + "grad_norm": 8.402949333190918, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8520640134811401, + "num_tokens": 596873262.0, + "step": 15647 + }, + { + "epoch": 1.9905864393843022, + "ewc_loss": 0.07140607386827469, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003624982200562954, + "grad_norm": 8.473337173461914, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8645179271697998, + "num_tokens": 596912833.0, + "step": 15648 + }, + { + "epoch": 1.9907136496628928, + "ewc_loss": 0.07109633088111877, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003594008448999375, + "grad_norm": 8.335299491882324, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.87126624584198, + "num_tokens": 596953554.0, + "step": 15649 + }, + { + "epoch": 1.9908408599414833, + "ewc_loss": 0.07148928940296173, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003633304440882057, + "grad_norm": 8.449325561523438, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8590989112854004, + "num_tokens": 596991206.0, + "step": 15650 + }, + { + "epoch": 1.9909680702200738, + "ewc_loss": 0.07091768085956573, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003576143062673509, + "grad_norm": 8.356369972229004, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8663785457611084, + "num_tokens": 597027222.0, + "step": 15651 + }, + { + "epoch": 1.9910952804986644, + "ewc_loss": 0.07147303968667984, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036316789919510484, + "grad_norm": 8.445016860961914, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8732128143310547, + "num_tokens": 597069040.0, + "step": 15652 + }, + { + "epoch": 1.9912224907772549, + "ewc_loss": 0.07104489207267761, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035888643469661474, + "grad_norm": 8.39256477355957, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8562402725219727, + "num_tokens": 597112529.0, + "step": 15653 + }, + { + "epoch": 1.9913497010558454, + "ewc_loss": 0.07123932242393494, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003608306869864464, + "grad_norm": 8.47709846496582, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8645637035369873, + "num_tokens": 597148304.0, + "step": 15654 + }, + { + "epoch": 1.991476911334436, + "ewc_loss": 0.07096607983112335, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035809827386401594, + "grad_norm": 8.345879554748535, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8726927042007446, + "num_tokens": 597184244.0, + "step": 15655 + }, + { + "epoch": 1.9916041216130265, + "ewc_loss": 0.07133254408836365, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000361762911779806, + "grad_norm": 8.498693466186523, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8575773239135742, + "num_tokens": 597218178.0, + "step": 15656 + }, + { + "epoch": 1.991731331891617, + "ewc_loss": 0.07086049020290375, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035704238689504564, + "grad_norm": 8.325356483459473, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8624848127365112, + "num_tokens": 597259256.0, + "step": 15657 + }, + { + "epoch": 1.9918585421702073, + "ewc_loss": 0.07123316824436188, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036076921969652176, + "grad_norm": 8.42635440826416, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.873947024345398, + "num_tokens": 597296584.0, + "step": 15658 + }, + { + "epoch": 1.9919857524487978, + "ewc_loss": 0.07100380957126617, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035847563412971795, + "grad_norm": 8.376177787780762, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8480024337768555, + "num_tokens": 597341116.0, + "step": 15659 + }, + { + "epoch": 1.9921129627273884, + "ewc_loss": 0.07132501900196075, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003616876492742449, + "grad_norm": 8.485030174255371, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8623350858688354, + "num_tokens": 597375595.0, + "step": 15660 + }, + { + "epoch": 1.9922401730059789, + "ewc_loss": 0.07098139822483063, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035825150553137064, + "grad_norm": 8.355380058288574, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8496801853179932, + "num_tokens": 597419226.0, + "step": 15661 + }, + { + "epoch": 1.9923673832845694, + "ewc_loss": 0.07126224040985107, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003610599087551236, + "grad_norm": 8.441523551940918, + "learning_rate": 1e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8451272249221802, + "num_tokens": 597461242.0, + "step": 15662 + }, + { + "epoch": 1.99249459356316, + "ewc_loss": 0.07103247195482254, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003587622195482254, + "grad_norm": 8.338902473449707, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8637863993644714, + "num_tokens": 597500030.0, + "step": 15663 + }, + { + "epoch": 1.9926218038417502, + "ewc_loss": 0.0713282898068428, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036172039108350873, + "grad_norm": 8.490699768066406, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8475426435470581, + "num_tokens": 597540428.0, + "step": 15664 + }, + { + "epoch": 1.9927490141203408, + "ewc_loss": 0.0709926187992096, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003583636716939509, + "grad_norm": 8.376274108886719, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8748271465301514, + "num_tokens": 597575070.0, + "step": 15665 + }, + { + "epoch": 1.9928762243989313, + "ewc_loss": 0.071451336145401, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003629508428275585, + "grad_norm": 8.465386390686035, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8617786169052124, + "num_tokens": 597617425.0, + "step": 15666 + }, + { + "epoch": 1.9930034346775218, + "ewc_loss": 0.07086308300495148, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035706834751181304, + "grad_norm": 8.351411819458008, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8706305027008057, + "num_tokens": 597660489.0, + "step": 15667 + }, + { + "epoch": 1.9931306449561124, + "ewc_loss": 0.07130403816699982, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003614779270719737, + "grad_norm": 8.41278076171875, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8697433471679688, + "num_tokens": 597703626.0, + "step": 15668 + }, + { + "epoch": 1.993257855234703, + "ewc_loss": 0.07103638350963593, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035880133509635925, + "grad_norm": 8.36925983428955, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8596403002738953, + "num_tokens": 597741176.0, + "step": 15669 + }, + { + "epoch": 1.9933850655132934, + "ewc_loss": 0.07101811468601227, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035861865035258234, + "grad_norm": 8.40051555633545, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8518452048301697, + "num_tokens": 597779763.0, + "step": 15670 + }, + { + "epoch": 1.993512275791884, + "ewc_loss": 0.07105174660682678, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035895491600967944, + "grad_norm": 8.354741096496582, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8675622940063477, + "num_tokens": 597813300.0, + "step": 15671 + }, + { + "epoch": 1.9936394860704745, + "ewc_loss": 0.07111296057701111, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003595671441871673, + "grad_norm": 8.39928913116455, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8532466888427734, + "num_tokens": 597849062.0, + "step": 15672 + }, + { + "epoch": 1.993766696349065, + "ewc_loss": 0.07104760408401489, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035891353036276996, + "grad_norm": 8.386992454528809, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8648566007614136, + "num_tokens": 597886565.0, + "step": 15673 + }, + { + "epoch": 1.9938939066276555, + "ewc_loss": 0.07103677093982697, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035880517680197954, + "grad_norm": 8.330453872680664, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8658469319343567, + "num_tokens": 597925097.0, + "step": 15674 + }, + { + "epoch": 1.994021116906246, + "ewc_loss": 0.07119914889335632, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036042893771082163, + "grad_norm": 8.428074836730957, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8762608766555786, + "num_tokens": 597960786.0, + "step": 15675 + }, + { + "epoch": 1.9941483271848366, + "ewc_loss": 0.07100958377122879, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035853334702551365, + "grad_norm": 8.392094612121582, + "learning_rate": 1e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8456515073776245, + "num_tokens": 598000442.0, + "step": 15676 + }, + { + "epoch": 1.9942755374634271, + "ewc_loss": 0.07110932469367981, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035953076439909637, + "grad_norm": 8.390483856201172, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.86138916015625, + "num_tokens": 598042946.0, + "step": 15677 + }, + { + "epoch": 1.9944027477420176, + "ewc_loss": 0.07112814486026764, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035971897887066007, + "grad_norm": 8.372615814208984, + "learning_rate": 1e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8429456949234009, + "num_tokens": 598081659.0, + "step": 15678 + }, + { + "epoch": 1.9945299580206082, + "ewc_loss": 0.07098042964935303, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035824175574816763, + "grad_norm": 8.375772476196289, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8580120801925659, + "num_tokens": 598120203.0, + "step": 15679 + }, + { + "epoch": 1.9946571682991987, + "ewc_loss": 0.07115372270345688, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003599747142288834, + "grad_norm": 8.424859046936035, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8493099212646484, + "num_tokens": 598154779.0, + "step": 15680 + }, + { + "epoch": 1.9947843785777892, + "ewc_loss": 0.07093237340450287, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035776125150732696, + "grad_norm": 8.277807235717773, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8639545440673828, + "num_tokens": 598197075.0, + "step": 15681 + }, + { + "epoch": 1.9949115888563798, + "ewc_loss": 0.07167280465364456, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036272412398830056, + "grad_norm": 8.443839073181152, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8550465106964111, + "num_tokens": 598240539.0, + "step": 15682 + }, + { + "epoch": 1.99503879913497, + "ewc_loss": 0.07093590497970581, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035779658355750144, + "grad_norm": 8.26993465423584, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8649163842201233, + "num_tokens": 598281776.0, + "step": 15683 + }, + { + "epoch": 1.9951660094135606, + "ewc_loss": 0.0716066062450409, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003645035612862557, + "grad_norm": 8.453784942626953, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8697695732116699, + "num_tokens": 598322715.0, + "step": 15684 + }, + { + "epoch": 1.9952932196921511, + "ewc_loss": 0.07102999091148376, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003587373939808458, + "grad_norm": 8.317975044250488, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8572520613670349, + "num_tokens": 598364495.0, + "step": 15685 + }, + { + "epoch": 1.9954204299707416, + "ewc_loss": 0.07156713306903839, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003641088551376015, + "grad_norm": 8.457306861877441, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8609477281570435, + "num_tokens": 598407102.0, + "step": 15686 + }, + { + "epoch": 1.9955476402493322, + "ewc_loss": 0.071017324924469, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003586107341106981, + "grad_norm": 8.350199699401855, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8487423658370972, + "num_tokens": 598448600.0, + "step": 15687 + }, + { + "epoch": 1.9956748505279227, + "ewc_loss": 0.07154546678066254, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036389220622368157, + "grad_norm": 8.431157112121582, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8534855842590332, + "num_tokens": 598491140.0, + "step": 15688 + }, + { + "epoch": 1.995802060806513, + "ewc_loss": 0.07113265246152878, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003597640316002071, + "grad_norm": 8.346996307373047, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8587116599082947, + "num_tokens": 598527831.0, + "step": 15689 + }, + { + "epoch": 1.9959292710851035, + "ewc_loss": 0.07143669575452805, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036280445056036115, + "grad_norm": 8.432393074035645, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8503851294517517, + "num_tokens": 598568680.0, + "step": 15690 + }, + { + "epoch": 1.996056481363694, + "ewc_loss": 0.07117055356502533, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003601429925765842, + "grad_norm": 8.378413200378418, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8520534038543701, + "num_tokens": 598606254.0, + "step": 15691 + }, + { + "epoch": 1.9961836916422846, + "ewc_loss": 0.07143190503120422, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003627565747592598, + "grad_norm": 8.415538787841797, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8651638031005859, + "num_tokens": 598642400.0, + "step": 15692 + }, + { + "epoch": 1.9963109019208751, + "ewc_loss": 0.07141733169555664, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003601693897508085, + "grad_norm": 8.342731475830078, + "learning_rate": 1e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8409284353256226, + "num_tokens": 598678379.0, + "step": 15693 + }, + { + "epoch": 1.9964381121994657, + "ewc_loss": 0.07132269442081451, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036166448262520134, + "grad_norm": 8.376169204711914, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8619550466537476, + "num_tokens": 598718234.0, + "step": 15694 + }, + { + "epoch": 1.9965653224780562, + "ewc_loss": 0.0712643638253212, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003610811254475266, + "grad_norm": 8.38215446472168, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8670822381973267, + "num_tokens": 598754897.0, + "step": 15695 + }, + { + "epoch": 1.9966925327566467, + "ewc_loss": 0.0711948499083519, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003603859804570675, + "grad_norm": 8.355609893798828, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.856986403465271, + "num_tokens": 598785268.0, + "step": 15696 + }, + { + "epoch": 1.9968197430352372, + "ewc_loss": 0.07132074236869812, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003616449539549649, + "grad_norm": 8.367051124572754, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8684328198432922, + "num_tokens": 598827131.0, + "step": 15697 + }, + { + "epoch": 1.9969469533138278, + "ewc_loss": 0.07144314050674438, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036042751162312925, + "grad_norm": 8.355940818786621, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8646601438522339, + "num_tokens": 598859507.0, + "step": 15698 + }, + { + "epoch": 1.9970741635924183, + "ewc_loss": 0.0716034397482872, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036203049239702523, + "grad_norm": 8.404666900634766, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8547034859657288, + "num_tokens": 598893280.0, + "step": 15699 + }, + { + "epoch": 1.9972013738710088, + "ewc_loss": 0.0713910162448883, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003599062329158187, + "grad_norm": 8.254281044006348, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8762401938438416, + "num_tokens": 598932759.0, + "step": 15700 + }, + { + "epoch": 1.9973285841495994, + "ewc_loss": 0.0718754380941391, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036475047818385065, + "grad_norm": 8.438844680786133, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8722899556159973, + "num_tokens": 598972899.0, + "step": 15701 + }, + { + "epoch": 1.9974557944281899, + "ewc_loss": 0.07152709364891052, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036126701161265373, + "grad_norm": 8.337849617004395, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8654226660728455, + "num_tokens": 599012811.0, + "step": 15702 + }, + { + "epoch": 1.9975830047067804, + "ewc_loss": 0.07188428938388824, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003648389538284391, + "grad_norm": 8.462105751037598, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8543350100517273, + "num_tokens": 599055632.0, + "step": 15703 + }, + { + "epoch": 1.997710214985371, + "ewc_loss": 0.07141812890768051, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003601773933041841, + "grad_norm": 8.35246753692627, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8467391133308411, + "num_tokens": 599090924.0, + "step": 15704 + }, + { + "epoch": 1.9978374252639615, + "ewc_loss": 0.07189849019050598, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003649809514172375, + "grad_norm": 8.467886924743652, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8766118288040161, + "num_tokens": 599130434.0, + "step": 15705 + }, + { + "epoch": 1.997964635542552, + "ewc_loss": 0.07148593664169312, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.000360855512553826, + "grad_norm": 8.425581932067871, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8622946739196777, + "num_tokens": 599171018.0, + "step": 15706 + }, + { + "epoch": 1.9980918458211423, + "ewc_loss": 0.07147612422704697, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000363198749255389, + "grad_norm": 8.5103178024292, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8547903299331665, + "num_tokens": 599212445.0, + "step": 15707 + }, + { + "epoch": 1.9982190560997328, + "ewc_loss": 0.07123097032308578, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000360747188096866, + "grad_norm": 8.463574409484863, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.866155207157135, + "num_tokens": 599250591.0, + "step": 15708 + }, + { + "epoch": 1.9983462663783234, + "ewc_loss": 0.07118190824985504, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003602565557230264, + "grad_norm": 8.406134605407715, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8589075207710266, + "num_tokens": 599284987.0, + "step": 15709 + }, + { + "epoch": 1.9984734766569139, + "ewc_loss": 0.07125469297170639, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003609844425227493, + "grad_norm": 8.498289108276367, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.858331024646759, + "num_tokens": 599324656.0, + "step": 15710 + }, + { + "epoch": 1.9986006869355044, + "ewc_loss": 0.07117076218128204, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036014511715620756, + "grad_norm": 8.490643501281738, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8739404678344727, + "num_tokens": 599364014.0, + "step": 15711 + }, + { + "epoch": 1.998727897214095, + "ewc_loss": 0.07108087837696075, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035924630356021225, + "grad_norm": 8.383220672607422, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8563119173049927, + "num_tokens": 599396781.0, + "step": 15712 + }, + { + "epoch": 1.9988551074926852, + "ewc_loss": 0.07119036465883255, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036034113145433366, + "grad_norm": 8.42624568939209, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8733173608779907, + "num_tokens": 599433240.0, + "step": 15713 + }, + { + "epoch": 1.9989823177712758, + "ewc_loss": 0.07119612395763397, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036039872793480754, + "grad_norm": 8.493117332458496, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8703333139419556, + "num_tokens": 599474883.0, + "step": 15714 + }, + { + "epoch": 1.9991095280498663, + "ewc_loss": 0.07127620279788971, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003587581741157919, + "grad_norm": 8.402230262756348, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8641572594642639, + "num_tokens": 599508492.0, + "step": 15715 + }, + { + "epoch": 1.9992367383284568, + "ewc_loss": 0.07124599069356918, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003608973929658532, + "grad_norm": 8.535475730895996, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8653202652931213, + "num_tokens": 599545946.0, + "step": 15716 + }, + { + "epoch": 1.9993639486070474, + "ewc_loss": 0.07084451615810394, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003568826650734991, + "grad_norm": 8.442770957946777, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8759828805923462, + "num_tokens": 599586217.0, + "step": 15717 + }, + { + "epoch": 1.9994911588856379, + "ewc_loss": 0.07132406532764435, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.000359236728399992, + "grad_norm": 8.435953140258789, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8722982406616211, + "num_tokens": 599620431.0, + "step": 15718 + }, + { + "epoch": 1.9996183691642284, + "ewc_loss": 0.07117235660552979, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003577196621336043, + "grad_norm": 8.50786304473877, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.847834050655365, + "num_tokens": 599659959.0, + "step": 15719 + }, + { + "epoch": 1.999745579442819, + "ewc_loss": 0.07089557498693466, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.000357393262675032, + "grad_norm": 8.385787010192871, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8601266145706177, + "num_tokens": 599696558.0, + "step": 15720 + }, + { + "epoch": 1.9998727897214095, + "ewc_loss": 0.07111607491970062, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035959825618192554, + "grad_norm": 8.481073379516602, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8523432612419128, + "num_tokens": 599734925.0, + "step": 15721 + }, + { + "epoch": 2.0, + "ewc_loss": 0.0707530826330185, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003559682809282094, + "grad_norm": 8.366965293884277, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8669222593307495, + "num_tokens": 599772613.0, + "step": 15722 + }, + { + "epoch": 2.0001272102785905, + "ewc_loss": 0.07124969363212585, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003609344130381942, + "grad_norm": 8.410492897033691, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8667187094688416, + "num_tokens": 599813892.0, + "step": 15723 + }, + { + "epoch": 2.000254420557181, + "ewc_loss": 0.0709356963634491, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035779442987404764, + "grad_norm": 8.378288269042969, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8650394678115845, + "num_tokens": 599852807.0, + "step": 15724 + }, + { + "epoch": 2.0003816308357716, + "ewc_loss": 0.07121314108371735, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003605688689276576, + "grad_norm": 8.42378044128418, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8715095520019531, + "num_tokens": 599893712.0, + "step": 15725 + }, + { + "epoch": 2.000508841114362, + "ewc_loss": 0.07138779759407043, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.000359874073183164, + "grad_norm": 8.411627769470215, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8659906387329102, + "num_tokens": 599929954.0, + "step": 15726 + }, + { + "epoch": 2.0006360513929526, + "ewc_loss": 0.0711778998374939, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036021650885231793, + "grad_norm": 8.423311233520508, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8634762763977051, + "num_tokens": 599967763.0, + "step": 15727 + }, + { + "epoch": 2.000763261671543, + "ewc_loss": 0.07118299603462219, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.0003602674405556172, + "grad_norm": 8.433349609375, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8734647035598755, + "num_tokens": 600008938.0, + "step": 15728 + }, + { + "epoch": 2.0008904719501337, + "ewc_loss": 0.07114437967538834, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00035988129093311727, + "grad_norm": 8.362319946289062, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8602086305618286, + "num_tokens": 600051849.0, + "step": 15729 + }, + { + "epoch": 2.0010176822287242, + "ewc_loss": 0.07139302790164948, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036236774758435786, + "grad_norm": 8.532638549804688, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8711130023002625, + "num_tokens": 600088568.0, + "step": 15730 + }, + { + "epoch": 2.0011448925073148, + "ewc_loss": 0.07120959460735321, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00035809201654046774, + "grad_norm": 8.373113632202148, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8687649965286255, + "num_tokens": 600131553.0, + "step": 15731 + }, + { + "epoch": 2.0012721027859053, + "ewc_loss": 0.07169244438409805, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036292054574005306, + "grad_norm": 8.499401092529297, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8651995658874512, + "num_tokens": 600173198.0, + "step": 15732 + }, + { + "epoch": 2.001399313064496, + "ewc_loss": 0.07118301093578339, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003578262112569064, + "grad_norm": 8.372457504272461, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8686642050743103, + "num_tokens": 600211347.0, + "step": 15733 + }, + { + "epoch": 2.0015265233430863, + "ewc_loss": 0.07169526070356369, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003629486891441047, + "grad_norm": 8.44154167175293, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8589168190956116, + "num_tokens": 600249953.0, + "step": 15734 + }, + { + "epoch": 2.0016537336216764, + "ewc_loss": 0.0713181346654892, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003591774730011821, + "grad_norm": 8.416622161865234, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8559617400169373, + "num_tokens": 600291211.0, + "step": 15735 + }, + { + "epoch": 2.001780943900267, + "ewc_loss": 0.07162201404571533, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036221620393916965, + "grad_norm": 8.419371604919434, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8692581653594971, + "num_tokens": 600332906.0, + "step": 15736 + }, + { + "epoch": 2.0019081541788575, + "ewc_loss": 0.07123503088951111, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036078778794035316, + "grad_norm": 8.417396545410156, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8660959005355835, + "num_tokens": 600370570.0, + "step": 15737 + }, + { + "epoch": 2.002035364457448, + "ewc_loss": 0.07150321453809738, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003610282437875867, + "grad_norm": 8.436746597290039, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8553771376609802, + "num_tokens": 600409368.0, + "step": 15738 + }, + { + "epoch": 2.0021625747360385, + "ewc_loss": 0.07124774158000946, + "ewc_loss_diag": 3.504753112792969e-05, + "ewc_loss_parallel": 0.00036091491347178817, + "grad_norm": 8.378091812133789, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8694192171096802, + "num_tokens": 600442019.0, + "step": 15739 + }, + { + "epoch": 2.002289785014629, + "ewc_loss": 0.07165724039077759, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036256853491067886, + "grad_norm": 8.519974708557129, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8721479177474976, + "num_tokens": 600471665.0, + "step": 15740 + }, + { + "epoch": 2.0024169952932196, + "ewc_loss": 0.071198470890522, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00035798081080429256, + "grad_norm": 8.323503494262695, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8620179295539856, + "num_tokens": 600508804.0, + "step": 15741 + }, + { + "epoch": 2.00254420557181, + "ewc_loss": 0.07211647927761078, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003671608865261078, + "grad_norm": 8.592262268066406, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.852226734161377, + "num_tokens": 600543895.0, + "step": 15742 + }, + { + "epoch": 2.0026714158504006, + "ewc_loss": 0.07102389633655548, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00035623510484583676, + "grad_norm": 8.224120140075684, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8660348057746887, + "num_tokens": 600585818.0, + "step": 15743 + }, + { + "epoch": 2.002798626128991, + "ewc_loss": 0.07238341867923737, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003698302898555994, + "grad_norm": 8.715249061584473, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8677463531494141, + "num_tokens": 600623106.0, + "step": 15744 + }, + { + "epoch": 2.0029258364075817, + "ewc_loss": 0.07103389501571655, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003563350473996252, + "grad_norm": 8.231551170349121, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.887053370475769, + "num_tokens": 600662662.0, + "step": 15745 + }, + { + "epoch": 2.0030530466861722, + "ewc_loss": 0.07247467339038849, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00037074286956340075, + "grad_norm": 8.60462474822998, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8518307209014893, + "num_tokens": 600699416.0, + "step": 15746 + }, + { + "epoch": 2.0031802569647628, + "ewc_loss": 0.07104435563087463, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003564396174624562, + "grad_norm": 8.24416446685791, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.858063817024231, + "num_tokens": 600739351.0, + "step": 15747 + }, + { + "epoch": 2.0033074672433533, + "ewc_loss": 0.07243840396404266, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003703801485244185, + "grad_norm": 8.547860145568848, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8667482137680054, + "num_tokens": 600781198.0, + "step": 15748 + }, + { + "epoch": 2.003434677521944, + "ewc_loss": 0.07126769423484802, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00035867298720404506, + "grad_norm": 8.26190185546875, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.861509382724762, + "num_tokens": 600821743.0, + "step": 15749 + }, + { + "epoch": 2.0035618878005343, + "ewc_loss": 0.07237955927848816, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003697916981764138, + "grad_norm": 8.549385070800781, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8799856901168823, + "num_tokens": 600862463.0, + "step": 15750 + }, + { + "epoch": 2.003689098079125, + "ewc_loss": 0.07138822972774506, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003598784387577325, + "grad_norm": 8.373295783996582, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8552157878875732, + "num_tokens": 600902629.0, + "step": 15751 + }, + { + "epoch": 2.0038163083577154, + "ewc_loss": 0.07203365117311478, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036633259151130915, + "grad_norm": 8.49010181427002, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8625959157943726, + "num_tokens": 600941778.0, + "step": 15752 + }, + { + "epoch": 2.003943518636306, + "ewc_loss": 0.07157914340496063, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003617874754127115, + "grad_norm": 8.348653793334961, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8733963966369629, + "num_tokens": 600979425.0, + "step": 15753 + }, + { + "epoch": 2.0040707289148965, + "ewc_loss": 0.07209034264087677, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003668995632324368, + "grad_norm": 8.49817943572998, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8593961596488953, + "num_tokens": 601017882.0, + "step": 15754 + }, + { + "epoch": 2.004197939193487, + "ewc_loss": 0.07163796573877335, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036237575113773346, + "grad_norm": 8.374120712280273, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8593084812164307, + "num_tokens": 601053394.0, + "step": 15755 + }, + { + "epoch": 2.0043251494720775, + "ewc_loss": 0.07219774276018143, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036553211975842714, + "grad_norm": 8.424108505249023, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8678524494171143, + "num_tokens": 601094940.0, + "step": 15756 + }, + { + "epoch": 2.004452359750668, + "ewc_loss": 0.07204653322696686, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036401997203938663, + "grad_norm": 8.436034202575684, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8652721643447876, + "num_tokens": 601132963.0, + "step": 15757 + }, + { + "epoch": 2.0045795700292586, + "ewc_loss": 0.07164441049098969, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003624401579145342, + "grad_norm": 8.41999340057373, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8631337285041809, + "num_tokens": 601168511.0, + "step": 15758 + }, + { + "epoch": 2.0047067803078487, + "ewc_loss": 0.07175546139478683, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036355070187710226, + "grad_norm": 8.357522964477539, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.866323709487915, + "num_tokens": 601208922.0, + "step": 15759 + }, + { + "epoch": 2.004833990586439, + "ewc_loss": 0.0720093622803688, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036364831612445414, + "grad_norm": 8.434876441955566, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8595268130302429, + "num_tokens": 601253814.0, + "step": 15760 + }, + { + "epoch": 2.0049612008650297, + "ewc_loss": 0.07162831723690033, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003622792137321085, + "grad_norm": 8.295368194580078, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.865562915802002, + "num_tokens": 601295584.0, + "step": 15761 + }, + { + "epoch": 2.0050884111436202, + "ewc_loss": 0.07198847830295563, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003658809291664511, + "grad_norm": 8.515210151672363, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8564465045928955, + "num_tokens": 601329134.0, + "step": 15762 + }, + { + "epoch": 2.0052156214222108, + "ewc_loss": 0.07161732017993927, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036216925946064293, + "grad_norm": 8.349390029907227, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8737808465957642, + "num_tokens": 601369100.0, + "step": 15763 + }, + { + "epoch": 2.0053428317008013, + "ewc_loss": 0.07216621190309525, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036765821278095245, + "grad_norm": 8.478242874145508, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8780543208122253, + "num_tokens": 601407876.0, + "step": 15764 + }, + { + "epoch": 2.005470041979392, + "ewc_loss": 0.0717284083366394, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036328021087683737, + "grad_norm": 8.382607460021973, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8787400722503662, + "num_tokens": 601446996.0, + "step": 15765 + }, + { + "epoch": 2.0055972522579824, + "ewc_loss": 0.07191735506057739, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036516960244625807, + "grad_norm": 8.478592872619629, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8561756610870361, + "num_tokens": 601482158.0, + "step": 15766 + }, + { + "epoch": 2.005724462536573, + "ewc_loss": 0.0716768130660057, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036276422906666994, + "grad_norm": 8.423419952392578, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8719215989112854, + "num_tokens": 601517940.0, + "step": 15767 + }, + { + "epoch": 2.0058516728151634, + "ewc_loss": 0.07198390364646912, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036583509063348174, + "grad_norm": 8.467493057250977, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8863569498062134, + "num_tokens": 601553823.0, + "step": 15768 + }, + { + "epoch": 2.005978883093754, + "ewc_loss": 0.07179374992847443, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003614921879488975, + "grad_norm": 8.339917182922363, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8535327911376953, + "num_tokens": 601595482.0, + "step": 15769 + }, + { + "epoch": 2.0061060933723445, + "ewc_loss": 0.0720302015542984, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036629807436838746, + "grad_norm": 8.468077659606934, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8724333047866821, + "num_tokens": 601632451.0, + "step": 15770 + }, + { + "epoch": 2.006233303650935, + "ewc_loss": 0.07151438295841217, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036113994428887963, + "grad_norm": 8.361349105834961, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8630037307739258, + "num_tokens": 601667573.0, + "step": 15771 + }, + { + "epoch": 2.0063605139295255, + "ewc_loss": 0.07182960212230682, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.000364292151061818, + "grad_norm": 8.403756141662598, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8728238344192505, + "num_tokens": 601704494.0, + "step": 15772 + }, + { + "epoch": 2.006487724208116, + "ewc_loss": 0.07164651900529861, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003624612872954458, + "grad_norm": 8.353595733642578, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.854411244392395, + "num_tokens": 601745307.0, + "step": 15773 + }, + { + "epoch": 2.0066149344867066, + "ewc_loss": 0.07188066840171814, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036480274866335094, + "grad_norm": 8.444231986999512, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8798955082893372, + "num_tokens": 601782551.0, + "step": 15774 + }, + { + "epoch": 2.006742144765297, + "ewc_loss": 0.07161427289247513, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003621388168539852, + "grad_norm": 8.377437591552734, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8674125671386719, + "num_tokens": 601823378.0, + "step": 15775 + }, + { + "epoch": 2.0068693550438876, + "ewc_loss": 0.07185949385166168, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003645910182967782, + "grad_norm": 8.341052055358887, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8652266263961792, + "num_tokens": 601860881.0, + "step": 15776 + }, + { + "epoch": 2.006996565322478, + "ewc_loss": 0.07212928682565689, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036484753945842385, + "grad_norm": 8.484787940979004, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8626872301101685, + "num_tokens": 601903502.0, + "step": 15777 + }, + { + "epoch": 2.0071237756010687, + "ewc_loss": 0.07179099321365356, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036146456841379404, + "grad_norm": 8.368122100830078, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8837752938270569, + "num_tokens": 601937097.0, + "step": 15778 + }, + { + "epoch": 2.007250985879659, + "ewc_loss": 0.07214836776256561, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036503837327472866, + "grad_norm": 8.350054740905762, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8692699670791626, + "num_tokens": 601978050.0, + "step": 15779 + }, + { + "epoch": 2.0073781961582498, + "ewc_loss": 0.07186727225780487, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036222737981006503, + "grad_norm": 8.393890380859375, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8640173673629761, + "num_tokens": 602018167.0, + "step": 15780 + }, + { + "epoch": 2.0075054064368403, + "ewc_loss": 0.07211913913488388, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003647460835054517, + "grad_norm": 8.432257652282715, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8519818782806396, + "num_tokens": 602054485.0, + "step": 15781 + }, + { + "epoch": 2.007632616715431, + "ewc_loss": 0.07168029993772507, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003627990954555571, + "grad_norm": 8.34915828704834, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8704557418823242, + "num_tokens": 602092881.0, + "step": 15782 + }, + { + "epoch": 2.0077598269940213, + "ewc_loss": 0.07188975811004639, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003648936690296978, + "grad_norm": 8.430091857910156, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8668116331100464, + "num_tokens": 602127424.0, + "step": 15783 + }, + { + "epoch": 2.0078870372726114, + "ewc_loss": 0.07159622758626938, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036195837310515344, + "grad_norm": 8.317858695983887, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8716527223587036, + "num_tokens": 602168340.0, + "step": 15784 + }, + { + "epoch": 2.008014247551202, + "ewc_loss": 0.07205317914485931, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036652787821367383, + "grad_norm": 8.495287895202637, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8560599088668823, + "num_tokens": 602207894.0, + "step": 15785 + }, + { + "epoch": 2.0081414578297925, + "ewc_loss": 0.07150308787822723, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003610269632190466, + "grad_norm": 8.306092262268066, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8675939440727234, + "num_tokens": 602245924.0, + "step": 15786 + }, + { + "epoch": 2.008268668108383, + "ewc_loss": 0.07211233675479889, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003671195008791983, + "grad_norm": 8.478555679321289, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8661900758743286, + "num_tokens": 602278417.0, + "step": 15787 + }, + { + "epoch": 2.0083958783869735, + "ewc_loss": 0.07143335044384003, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.00036032963544130325, + "grad_norm": 8.252510070800781, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8686643242835999, + "num_tokens": 602318431.0, + "step": 15788 + }, + { + "epoch": 2.008523088665564, + "ewc_loss": 0.07238340377807617, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036738874041475356, + "grad_norm": 8.471979141235352, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8558969497680664, + "num_tokens": 602356315.0, + "step": 15789 + }, + { + "epoch": 2.0086502989441546, + "ewc_loss": 0.07147742807865143, + "ewc_loss_diag": 3.528594970703125e-05, + "ewc_loss_parallel": 0.0003607704129535705, + "grad_norm": 8.268095016479492, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8801554441452026, + "num_tokens": 602398865.0, + "step": 15790 + }, + { + "epoch": 2.008777509222745, + "ewc_loss": 0.07242968678474426, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036785154952667654, + "grad_norm": 8.529685974121094, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8763283491134644, + "num_tokens": 602435835.0, + "step": 15791 + }, + { + "epoch": 2.0089047195013356, + "ewc_loss": 0.07170220464468002, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003605767269618809, + "grad_norm": 8.301972389221191, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8729200959205627, + "num_tokens": 602470359.0, + "step": 15792 + }, + { + "epoch": 2.009031929779926, + "ewc_loss": 0.07241049408912659, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003676596097648144, + "grad_norm": 8.500091552734375, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8689588308334351, + "num_tokens": 602512132.0, + "step": 15793 + }, + { + "epoch": 2.0091591400585167, + "ewc_loss": 0.07168081402778625, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003603628429118544, + "grad_norm": 8.321579933166504, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8624460101127625, + "num_tokens": 602552700.0, + "step": 15794 + }, + { + "epoch": 2.0092863503371072, + "ewc_loss": 0.07244619727134705, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036801668466068804, + "grad_norm": 8.531439781188965, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8649280071258545, + "num_tokens": 602587779.0, + "step": 15795 + }, + { + "epoch": 2.0094135606156978, + "ewc_loss": 0.07157064229249954, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00035926111740991473, + "grad_norm": 8.330401420593262, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8564865589141846, + "num_tokens": 602623834.0, + "step": 15796 + }, + { + "epoch": 2.0095407708942883, + "ewc_loss": 0.07237288355827332, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.000367283500963822, + "grad_norm": 8.560465812683105, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8782241940498352, + "num_tokens": 602660141.0, + "step": 15797 + }, + { + "epoch": 2.009667981172879, + "ewc_loss": 0.07167742401361465, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003603289369493723, + "grad_norm": 8.31812572479248, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8708232045173645, + "num_tokens": 602698990.0, + "step": 15798 + }, + { + "epoch": 2.0097951914514693, + "ewc_loss": 0.07233551889657974, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003669098950922489, + "grad_norm": 8.522576332092285, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8461304903030396, + "num_tokens": 602732053.0, + "step": 15799 + }, + { + "epoch": 2.00992240173006, + "ewc_loss": 0.07158114016056061, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00035936612403020263, + "grad_norm": 8.28845500946045, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8660309314727783, + "num_tokens": 602770975.0, + "step": 15800 + }, + { + "epoch": 2.0100496120086504, + "ewc_loss": 0.07236054539680481, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003671601880341768, + "grad_norm": 8.515822410583496, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8574362993240356, + "num_tokens": 602813207.0, + "step": 15801 + }, + { + "epoch": 2.010176822287241, + "ewc_loss": 0.071571484208107, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00035926952841691673, + "grad_norm": 8.30233383178711, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.870972752571106, + "num_tokens": 602849951.0, + "step": 15802 + }, + { + "epoch": 2.0103040325658315, + "ewc_loss": 0.07231725752353668, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003667272685561329, + "grad_norm": 8.497920036315918, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8569728136062622, + "num_tokens": 602891418.0, + "step": 15803 + }, + { + "epoch": 2.010431242844422, + "ewc_loss": 0.07158337533473969, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00035938838846050203, + "grad_norm": 8.270971298217773, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8729256391525269, + "num_tokens": 602933039.0, + "step": 15804 + }, + { + "epoch": 2.0105584531230125, + "ewc_loss": 0.0722595751285553, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.00036615040153265, + "grad_norm": 8.587587356567383, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8814364075660706, + "num_tokens": 602965546.0, + "step": 15805 + }, + { + "epoch": 2.010685663401603, + "ewc_loss": 0.0714804157614708, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003583588404580951, + "grad_norm": 8.221460342407227, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.865405797958374, + "num_tokens": 603004473.0, + "step": 15806 + }, + { + "epoch": 2.0108128736801936, + "ewc_loss": 0.07246133685112, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003681680536828935, + "grad_norm": 8.510133743286133, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8576329946517944, + "num_tokens": 603041644.0, + "step": 15807 + }, + { + "epoch": 2.0109400839587837, + "ewc_loss": 0.07138752937316895, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003574299335014075, + "grad_norm": 8.252680778503418, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8529295921325684, + "num_tokens": 603084698.0, + "step": 15808 + }, + { + "epoch": 2.011067294237374, + "ewc_loss": 0.07252815365791321, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003688362776301801, + "grad_norm": 8.539664268493652, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8487852811813354, + "num_tokens": 603121059.0, + "step": 15809 + }, + { + "epoch": 2.0111945045159647, + "ewc_loss": 0.07147367298603058, + "ewc_loss_diag": 3.552436828613281e-05, + "ewc_loss_parallel": 0.0003582914068829268, + "grad_norm": 8.300437927246094, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8633648157119751, + "num_tokens": 603156604.0, + "step": 15810 + }, + { + "epoch": 2.0113217147945552, + "ewc_loss": 0.07228818535804749, + "ewc_loss_diag": 3.5762786865234375e-05, + "ewc_loss_parallel": 0.00036643652128987014, + "grad_norm": 8.526576042175293, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8671097159385681, + "num_tokens": 603183743.0, + "step": 15811 + }, + { + "epoch": 2.0114489250731458, + "ewc_loss": 0.07159698009490967, + "ewc_loss_diag": 3.5762786865234375e-05, + "ewc_loss_parallel": 0.00035952453617937863, + "grad_norm": 8.27841567993164, + "learning_rate": 1e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8433240652084351, + "num_tokens": 603222099.0, + "step": 15812 + }, + { + "epoch": 2.0115761353517363, + "ewc_loss": 0.07251010835170746, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003662143717519939, + "grad_norm": 8.425322532653809, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8453216552734375, + "num_tokens": 603263467.0, + "step": 15813 + }, + { + "epoch": 2.011703345630327, + "ewc_loss": 0.07192988693714142, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003604121448006481, + "grad_norm": 8.316671371459961, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8461057543754578, + "num_tokens": 603296726.0, + "step": 15814 + }, + { + "epoch": 2.0118305559089174, + "ewc_loss": 0.0723130851984024, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036424410063773394, + "grad_norm": 8.457651138305664, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8712327480316162, + "num_tokens": 603329691.0, + "step": 15815 + }, + { + "epoch": 2.011957766187508, + "ewc_loss": 0.07197803258895874, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003608936385717243, + "grad_norm": 8.249849319458008, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.873284101486206, + "num_tokens": 603371619.0, + "step": 15816 + }, + { + "epoch": 2.0120849764660984, + "ewc_loss": 0.07244899868965149, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036560322041623294, + "grad_norm": 8.498169898986816, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8674591779708862, + "num_tokens": 603400018.0, + "step": 15817 + }, + { + "epoch": 2.012212186744689, + "ewc_loss": 0.07192978262901306, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003604110679589212, + "grad_norm": 8.257777214050293, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.868281364440918, + "num_tokens": 603436971.0, + "step": 15818 + }, + { + "epoch": 2.0123393970232795, + "ewc_loss": 0.0726967602968216, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003680808877106756, + "grad_norm": 8.51990795135498, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8650810718536377, + "num_tokens": 603472802.0, + "step": 15819 + }, + { + "epoch": 2.01246660730187, + "ewc_loss": 0.07187147438526154, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00035982797271572053, + "grad_norm": 8.290760040283203, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8701328039169312, + "num_tokens": 603506547.0, + "step": 15820 + }, + { + "epoch": 2.0125938175804605, + "ewc_loss": 0.07262751460075378, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036738842027261853, + "grad_norm": 8.489083290100098, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8485338687896729, + "num_tokens": 603544967.0, + "step": 15821 + }, + { + "epoch": 2.012721027859051, + "ewc_loss": 0.0719432607293129, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036054590600542724, + "grad_norm": 8.316819190979004, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8613516092300415, + "num_tokens": 603582522.0, + "step": 15822 + }, + { + "epoch": 2.0128482381376416, + "ewc_loss": 0.0726761594414711, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036543348687700927, + "grad_norm": 8.462334632873535, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8760102987289429, + "num_tokens": 603619584.0, + "step": 15823 + }, + { + "epoch": 2.012975448416232, + "ewc_loss": 0.07186420261859894, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.000359755358658731, + "grad_norm": 8.329784393310547, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8530012965202332, + "num_tokens": 603660542.0, + "step": 15824 + }, + { + "epoch": 2.0131026586948226, + "ewc_loss": 0.07236512005329132, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036476447712630033, + "grad_norm": 8.462389945983887, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8673954010009766, + "num_tokens": 603698900.0, + "step": 15825 + }, + { + "epoch": 2.013229868973413, + "ewc_loss": 0.07213447988033295, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036001665284857154, + "grad_norm": 8.38634204864502, + "learning_rate": 1e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8460942506790161, + "num_tokens": 603733019.0, + "step": 15826 + }, + { + "epoch": 2.0133570792520037, + "ewc_loss": 0.07239116728305817, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003625834942795336, + "grad_norm": 8.376431465148926, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8727463483810425, + "num_tokens": 603772071.0, + "step": 15827 + }, + { + "epoch": 2.013484289530594, + "ewc_loss": 0.07235422730445862, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036221410846337676, + "grad_norm": 8.371538162231445, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8580183982849121, + "num_tokens": 603811008.0, + "step": 15828 + }, + { + "epoch": 2.0136114998091847, + "ewc_loss": 0.0723230391740799, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036190231912769377, + "grad_norm": 8.382074356079102, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8541519641876221, + "num_tokens": 603852647.0, + "step": 15829 + }, + { + "epoch": 2.0137387100877753, + "ewc_loss": 0.07230989634990692, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003617707989178598, + "grad_norm": 8.359955787658691, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8651963472366333, + "num_tokens": 603894198.0, + "step": 15830 + }, + { + "epoch": 2.013865920366366, + "ewc_loss": 0.07235102355480194, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003621821233537048, + "grad_norm": 8.364283561706543, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8596856594085693, + "num_tokens": 603930168.0, + "step": 15831 + }, + { + "epoch": 2.0139931306449563, + "ewc_loss": 0.07214293628931046, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036254263250157237, + "grad_norm": 8.372780799865723, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8748750686645508, + "num_tokens": 603966189.0, + "step": 15832 + }, + { + "epoch": 2.0141203409235464, + "ewc_loss": 0.0722314864397049, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003634281747508794, + "grad_norm": 8.331099510192871, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8622185587882996, + "num_tokens": 604011110.0, + "step": 15833 + }, + { + "epoch": 2.014247551202137, + "ewc_loss": 0.07217033207416534, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036281661596149206, + "grad_norm": 8.45911979675293, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8483453989028931, + "num_tokens": 604044256.0, + "step": 15834 + }, + { + "epoch": 2.0143747614807275, + "ewc_loss": 0.07212694734334946, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00035994136123917997, + "grad_norm": 8.303820610046387, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8734077215194702, + "num_tokens": 604078652.0, + "step": 15835 + }, + { + "epoch": 2.014501971759318, + "ewc_loss": 0.0727355033159256, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036602685577236116, + "grad_norm": 8.505952835083008, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8741053342819214, + "num_tokens": 604112302.0, + "step": 15836 + }, + { + "epoch": 2.0146291820379085, + "ewc_loss": 0.07184091210365295, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00035952244070358574, + "grad_norm": 8.259538650512695, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8814781904220581, + "num_tokens": 604150011.0, + "step": 15837 + }, + { + "epoch": 2.014756392316499, + "ewc_loss": 0.0724373459815979, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003654867468867451, + "grad_norm": 8.457489013671875, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8656704425811768, + "num_tokens": 604188859.0, + "step": 15838 + }, + { + "epoch": 2.0148836025950896, + "ewc_loss": 0.07175377011299133, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003586509556043893, + "grad_norm": 8.251641273498535, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8605757355690002, + "num_tokens": 604232273.0, + "step": 15839 + }, + { + "epoch": 2.01501081287368, + "ewc_loss": 0.07260920107364655, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036720524076372385, + "grad_norm": 8.46883773803711, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8730150461196899, + "num_tokens": 604265236.0, + "step": 15840 + }, + { + "epoch": 2.0151380231522706, + "ewc_loss": 0.07184477150440216, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003595610032789409, + "grad_norm": 8.277751922607422, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8799982070922852, + "num_tokens": 604304784.0, + "step": 15841 + }, + { + "epoch": 2.015265233430861, + "ewc_loss": 0.07250659167766571, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036617921432480216, + "grad_norm": 8.472514152526855, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8903458714485168, + "num_tokens": 604341461.0, + "step": 15842 + }, + { + "epoch": 2.0153924437094517, + "ewc_loss": 0.07238432765007019, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036007375456392765, + "grad_norm": 8.365558624267578, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8719290494918823, + "num_tokens": 604378628.0, + "step": 15843 + }, + { + "epoch": 2.0155196539880422, + "ewc_loss": 0.07242729514837265, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000362944818334654, + "grad_norm": 8.363163948059082, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.874866783618927, + "num_tokens": 604418070.0, + "step": 15844 + }, + { + "epoch": 2.0156468642666328, + "ewc_loss": 0.07224865257740021, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003611584543250501, + "grad_norm": 8.333776473999023, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8772165775299072, + "num_tokens": 604454445.0, + "step": 15845 + }, + { + "epoch": 2.0157740745452233, + "ewc_loss": 0.07225000858306885, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036361339152790606, + "grad_norm": 8.425116539001465, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8691307902336121, + "num_tokens": 604488559.0, + "step": 15846 + }, + { + "epoch": 2.015901284823814, + "ewc_loss": 0.07221336662769318, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003608055121731013, + "grad_norm": 8.391436576843262, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8574777841567993, + "num_tokens": 604522538.0, + "step": 15847 + }, + { + "epoch": 2.0160284951024043, + "ewc_loss": 0.07249464094638824, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036361825186759233, + "grad_norm": 8.349181175231934, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8843260407447815, + "num_tokens": 604561750.0, + "step": 15848 + }, + { + "epoch": 2.016155705380995, + "ewc_loss": 0.07249307632446289, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000363602681318298, + "grad_norm": 8.453685760498047, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8787076473236084, + "num_tokens": 604594961.0, + "step": 15849 + }, + { + "epoch": 2.0162829156595854, + "ewc_loss": 0.07223623245954514, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036103418096899986, + "grad_norm": 8.39672565460205, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8556783199310303, + "num_tokens": 604631301.0, + "step": 15850 + }, + { + "epoch": 2.016410125938176, + "ewc_loss": 0.07214058935642242, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003625192039180547, + "grad_norm": 8.432038307189941, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8456065654754639, + "num_tokens": 604666049.0, + "step": 15851 + }, + { + "epoch": 2.0165373362167665, + "ewc_loss": 0.07207208126783371, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003618340997491032, + "grad_norm": 8.392912864685059, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8865524530410767, + "num_tokens": 604701984.0, + "step": 15852 + }, + { + "epoch": 2.016664546495357, + "ewc_loss": 0.07230235636234283, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003616953908931464, + "grad_norm": 8.424871444702148, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8770904541015625, + "num_tokens": 604738414.0, + "step": 15853 + }, + { + "epoch": 2.0167917567739475, + "ewc_loss": 0.07220678776502609, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003607397375162691, + "grad_norm": 8.306142807006836, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8683676719665527, + "num_tokens": 604779961.0, + "step": 15854 + }, + { + "epoch": 2.016918967052538, + "ewc_loss": 0.07223540544509888, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003634673194028437, + "grad_norm": 8.460424423217773, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8591547608375549, + "num_tokens": 604819729.0, + "step": 15855 + }, + { + "epoch": 2.0170461773311286, + "ewc_loss": 0.07190752774477005, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036018856917507946, + "grad_norm": 8.34385871887207, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8769091963768005, + "num_tokens": 604864108.0, + "step": 15856 + }, + { + "epoch": 2.0171733876097186, + "ewc_loss": 0.07255344092845917, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036420629476197064, + "grad_norm": 8.462390899658203, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8595532178878784, + "num_tokens": 604905582.0, + "step": 15857 + }, + { + "epoch": 2.017300597888309, + "ewc_loss": 0.07206252962350845, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003592971770558506, + "grad_norm": 8.34532642364502, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8745414018630981, + "num_tokens": 604943931.0, + "step": 15858 + }, + { + "epoch": 2.0174278081668997, + "ewc_loss": 0.07246554642915726, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036576876300387084, + "grad_norm": 8.479632377624512, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8622042536735535, + "num_tokens": 604982100.0, + "step": 15859 + }, + { + "epoch": 2.0175550184454902, + "ewc_loss": 0.07187007367610931, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003598140610847622, + "grad_norm": 8.34350299835205, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8554456233978271, + "num_tokens": 605026289.0, + "step": 15860 + }, + { + "epoch": 2.0176822287240808, + "ewc_loss": 0.07243609428405762, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036547420313581824, + "grad_norm": 8.486322402954102, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8709847927093506, + "num_tokens": 605070214.0, + "step": 15861 + }, + { + "epoch": 2.0178094390026713, + "ewc_loss": 0.07212401926517487, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003599120827857405, + "grad_norm": 8.278711318969727, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8623276948928833, + "num_tokens": 605113495.0, + "step": 15862 + }, + { + "epoch": 2.017936649281262, + "ewc_loss": 0.07237128913402557, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003648261190392077, + "grad_norm": 8.534082412719727, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8693225979804993, + "num_tokens": 605145717.0, + "step": 15863 + }, + { + "epoch": 2.0180638595598523, + "ewc_loss": 0.07202069461345673, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003588787803892046, + "grad_norm": 8.362088203430176, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.857952892780304, + "num_tokens": 605180930.0, + "step": 15864 + }, + { + "epoch": 2.018191069838443, + "ewc_loss": 0.07263106852769852, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036498255212791264, + "grad_norm": 8.559353828430176, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.873079776763916, + "num_tokens": 605214004.0, + "step": 15865 + }, + { + "epoch": 2.0183182801170334, + "ewc_loss": 0.07168642431497574, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00035797752207145095, + "grad_norm": 8.2743558883667, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8675410747528076, + "num_tokens": 605253244.0, + "step": 15866 + }, + { + "epoch": 2.018445490395624, + "ewc_loss": 0.07269565761089325, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003656284243334085, + "grad_norm": 8.45270824432373, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8620011210441589, + "num_tokens": 605293832.0, + "step": 15867 + }, + { + "epoch": 2.0185727006742145, + "ewc_loss": 0.07195974141359329, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003582692879717797, + "grad_norm": 8.320355415344238, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8756039142608643, + "num_tokens": 605333811.0, + "step": 15868 + }, + { + "epoch": 2.018699910952805, + "ewc_loss": 0.07256551086902618, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036432701745070517, + "grad_norm": 8.499252319335938, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8661741018295288, + "num_tokens": 605371560.0, + "step": 15869 + }, + { + "epoch": 2.0188271212313955, + "ewc_loss": 0.07213470339775085, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036001886473968625, + "grad_norm": 8.327736854553223, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8650301098823547, + "num_tokens": 605410555.0, + "step": 15870 + }, + { + "epoch": 2.018954331509986, + "ewc_loss": 0.07275630533695221, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036623497726395726, + "grad_norm": 8.492203712463379, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8607860803604126, + "num_tokens": 605451851.0, + "step": 15871 + }, + { + "epoch": 2.0190815417885766, + "ewc_loss": 0.07205632328987122, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003592350985854864, + "grad_norm": 8.32603931427002, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8560226559638977, + "num_tokens": 605490442.0, + "step": 15872 + }, + { + "epoch": 2.019208752067167, + "ewc_loss": 0.0726316049695015, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003649879072327167, + "grad_norm": 8.465113639831543, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8640624284744263, + "num_tokens": 605525237.0, + "step": 15873 + }, + { + "epoch": 2.0193359623457576, + "ewc_loss": 0.0722646564245224, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003613184380810708, + "grad_norm": 8.383353233337402, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8709799647331238, + "num_tokens": 605563624.0, + "step": 15874 + }, + { + "epoch": 2.019463172624348, + "ewc_loss": 0.07251878082752228, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036385972634889185, + "grad_norm": 8.5078706741333, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8753246665000916, + "num_tokens": 605597419.0, + "step": 15875 + }, + { + "epoch": 2.0195903829029387, + "ewc_loss": 0.0724165290594101, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003603957302402705, + "grad_norm": 8.329473495483398, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8573358058929443, + "num_tokens": 605639954.0, + "step": 15876 + }, + { + "epoch": 2.019717593181529, + "ewc_loss": 0.07262982428073883, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003649701247923076, + "grad_norm": 8.483057022094727, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8660653829574585, + "num_tokens": 605678226.0, + "step": 15877 + }, + { + "epoch": 2.0198448034601197, + "ewc_loss": 0.0722239762544632, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000360911653842777, + "grad_norm": 8.379043579101562, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8748562932014465, + "num_tokens": 605719925.0, + "step": 15878 + }, + { + "epoch": 2.0199720137387103, + "ewc_loss": 0.0725676640868187, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003643485251814127, + "grad_norm": 8.41401195526123, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.848097562789917, + "num_tokens": 605761282.0, + "step": 15879 + }, + { + "epoch": 2.020099224017301, + "ewc_loss": 0.07233414053916931, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003620132338255644, + "grad_norm": 8.380451202392578, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8763866424560547, + "num_tokens": 605800681.0, + "step": 15880 + }, + { + "epoch": 2.0202264342958913, + "ewc_loss": 0.07244722545146942, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003631441213656217, + "grad_norm": 8.41020679473877, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8484815359115601, + "num_tokens": 605836282.0, + "step": 15881 + }, + { + "epoch": 2.0203536445744814, + "ewc_loss": 0.07231651246547699, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003618369810283184, + "grad_norm": 8.375707626342773, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8675076365470886, + "num_tokens": 605880360.0, + "step": 15882 + }, + { + "epoch": 2.020480854853072, + "ewc_loss": 0.07258307933807373, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036206122604198754, + "grad_norm": 8.399764060974121, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8759802579879761, + "num_tokens": 605918363.0, + "step": 15883 + }, + { + "epoch": 2.0206080651316625, + "ewc_loss": 0.07238219678401947, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036249388358555734, + "grad_norm": 8.384025573730469, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8784661293029785, + "num_tokens": 605953032.0, + "step": 15884 + }, + { + "epoch": 2.020735275410253, + "ewc_loss": 0.07239852845668793, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036265721428208053, + "grad_norm": 8.47487735748291, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.872901439666748, + "num_tokens": 605992633.0, + "step": 15885 + }, + { + "epoch": 2.0208624856888435, + "ewc_loss": 0.07216188311576843, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003602907236199826, + "grad_norm": 8.330665588378906, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8676526546478271, + "num_tokens": 606031776.0, + "step": 15886 + }, + { + "epoch": 2.020989695967434, + "ewc_loss": 0.07251526415348053, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003638245107140392, + "grad_norm": 8.44027328491211, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.858691930770874, + "num_tokens": 606070614.0, + "step": 15887 + }, + { + "epoch": 2.0211169062460246, + "ewc_loss": 0.07211872935295105, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00035985911381430924, + "grad_norm": 8.30706787109375, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8664460182189941, + "num_tokens": 606115898.0, + "step": 15888 + }, + { + "epoch": 2.021244116524615, + "ewc_loss": 0.07264970242977142, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036516887485049665, + "grad_norm": 8.448807716369629, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8692715167999268, + "num_tokens": 606150231.0, + "step": 15889 + }, + { + "epoch": 2.0213713268032056, + "ewc_loss": 0.07217337936162949, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036040565464645624, + "grad_norm": 8.323068618774414, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8585543036460876, + "num_tokens": 606191527.0, + "step": 15890 + }, + { + "epoch": 2.021498537081796, + "ewc_loss": 0.07261916249990463, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036486348835751414, + "grad_norm": 8.465447425842285, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8589420914649963, + "num_tokens": 606229070.0, + "step": 15891 + }, + { + "epoch": 2.0216257473603867, + "ewc_loss": 0.07233908772468567, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003620627976488322, + "grad_norm": 8.323877334594727, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8658881783485413, + "num_tokens": 606269555.0, + "step": 15892 + }, + { + "epoch": 2.021752957638977, + "ewc_loss": 0.0725955218076706, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003646270779427141, + "grad_norm": 8.497912406921387, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8606150150299072, + "num_tokens": 606302040.0, + "step": 15893 + }, + { + "epoch": 2.0218801679175677, + "ewc_loss": 0.07224555313587189, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036112742964178324, + "grad_norm": 8.365097045898438, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8745739459991455, + "num_tokens": 606341769.0, + "step": 15894 + }, + { + "epoch": 2.0220073781961583, + "ewc_loss": 0.07273197919130325, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036599166924133897, + "grad_norm": 8.47673511505127, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8523736000061035, + "num_tokens": 606379989.0, + "step": 15895 + }, + { + "epoch": 2.022134588474749, + "ewc_loss": 0.07220005244016647, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003606723912525922, + "grad_norm": 8.376202583312988, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8600858449935913, + "num_tokens": 606414396.0, + "step": 15896 + }, + { + "epoch": 2.0222617987533393, + "ewc_loss": 0.07272607088088989, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036593255936168134, + "grad_norm": 8.480775833129883, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8697995543479919, + "num_tokens": 606445949.0, + "step": 15897 + }, + { + "epoch": 2.02238900903193, + "ewc_loss": 0.07213477045297623, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036246099625714123, + "grad_norm": 8.367646217346191, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.868861198425293, + "num_tokens": 606484751.0, + "step": 15898 + }, + { + "epoch": 2.0225162193105204, + "ewc_loss": 0.0725821778178215, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036449366598390043, + "grad_norm": 8.44163990020752, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8697040677070618, + "num_tokens": 606525390.0, + "step": 15899 + }, + { + "epoch": 2.022643429589111, + "ewc_loss": 0.07245656102895737, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003632374864537269, + "grad_norm": 8.411153793334961, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8633815050125122, + "num_tokens": 606563364.0, + "step": 15900 + }, + { + "epoch": 2.0227706398677014, + "ewc_loss": 0.07242171466350555, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003628890262916684, + "grad_norm": 8.459656715393066, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8691784143447876, + "num_tokens": 606598140.0, + "step": 15901 + }, + { + "epoch": 2.022897850146292, + "ewc_loss": 0.07216046005487442, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003627178957685828, + "grad_norm": 8.390562057495117, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8589280247688293, + "num_tokens": 606636480.0, + "step": 15902 + }, + { + "epoch": 2.0230250604248825, + "ewc_loss": 0.07236012071371078, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003622730728238821, + "grad_norm": 8.391496658325195, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.869101345539093, + "num_tokens": 606676044.0, + "step": 15903 + }, + { + "epoch": 2.023152270703473, + "ewc_loss": 0.07238604873418808, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036253235884942114, + "grad_norm": 8.363787651062012, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8588333129882812, + "num_tokens": 606713262.0, + "step": 15904 + }, + { + "epoch": 2.0232794809820636, + "ewc_loss": 0.07255695015192032, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000364241364877671, + "grad_norm": 8.416583061218262, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8616650700569153, + "num_tokens": 606754844.0, + "step": 15905 + }, + { + "epoch": 2.0234066912606536, + "ewc_loss": 0.07234258949756622, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003620977222453803, + "grad_norm": 8.358859062194824, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8677202463150024, + "num_tokens": 606800910.0, + "step": 15906 + }, + { + "epoch": 2.023533901539244, + "ewc_loss": 0.07235479354858398, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036466121673583984, + "grad_norm": 8.431201934814453, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8604476451873779, + "num_tokens": 606841855.0, + "step": 15907 + }, + { + "epoch": 2.0236611118178347, + "ewc_loss": 0.07206649333238602, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036177822039462626, + "grad_norm": 8.3541841506958, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8540629744529724, + "num_tokens": 606888261.0, + "step": 15908 + }, + { + "epoch": 2.0237883220964252, + "ewc_loss": 0.07234558463096619, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003645691613201052, + "grad_norm": 8.399311065673828, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8584246635437012, + "num_tokens": 606926235.0, + "step": 15909 + }, + { + "epoch": 2.0239155323750158, + "ewc_loss": 0.07231789827346802, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036185086355544627, + "grad_norm": 8.388898849487305, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8617738485336304, + "num_tokens": 606962527.0, + "step": 15910 + }, + { + "epoch": 2.0240427426536063, + "ewc_loss": 0.0725618302822113, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003642901428975165, + "grad_norm": 8.361028671264648, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8711548447608948, + "num_tokens": 607008924.0, + "step": 15911 + }, + { + "epoch": 2.024169952932197, + "ewc_loss": 0.07266248762607574, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003652966988738626, + "grad_norm": 8.408773422241211, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8619991540908813, + "num_tokens": 607049788.0, + "step": 15912 + }, + { + "epoch": 2.0242971632107873, + "ewc_loss": 0.07241547107696533, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003628265985753387, + "grad_norm": 8.390120506286621, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8587342500686646, + "num_tokens": 607087592.0, + "step": 15913 + }, + { + "epoch": 2.024424373489378, + "ewc_loss": 0.07261931896209717, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003648650599643588, + "grad_norm": 8.450997352600098, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8814117908477783, + "num_tokens": 607119954.0, + "step": 15914 + }, + { + "epoch": 2.0245515837679684, + "ewc_loss": 0.07233119010925293, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036198378074914217, + "grad_norm": 8.381660461425781, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8569178581237793, + "num_tokens": 607156511.0, + "step": 15915 + }, + { + "epoch": 2.024678794046559, + "ewc_loss": 0.07261595875024796, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003648314450401813, + "grad_norm": 8.454385757446289, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8739283084869385, + "num_tokens": 607198292.0, + "step": 15916 + }, + { + "epoch": 2.0248060043251495, + "ewc_loss": 0.07260581851005554, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036228864337317646, + "grad_norm": 8.344586372375488, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8620061874389648, + "num_tokens": 607237572.0, + "step": 15917 + }, + { + "epoch": 2.02493321460374, + "ewc_loss": 0.0726289451122284, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003649613354355097, + "grad_norm": 8.427971839904785, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8574255108833313, + "num_tokens": 607282321.0, + "step": 15918 + }, + { + "epoch": 2.0250604248823305, + "ewc_loss": 0.07239362597465515, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036260817432776093, + "grad_norm": 8.41332721710205, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8628730773925781, + "num_tokens": 607313355.0, + "step": 15919 + }, + { + "epoch": 2.025187635160921, + "ewc_loss": 0.0724952444434166, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003636243345681578, + "grad_norm": 8.314188957214355, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8628413081169128, + "num_tokens": 607351961.0, + "step": 15920 + }, + { + "epoch": 2.0253148454395116, + "ewc_loss": 0.07263174653053284, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003649893042165786, + "grad_norm": 8.38911247253418, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8689165711402893, + "num_tokens": 607391368.0, + "step": 15921 + }, + { + "epoch": 2.025442055718102, + "ewc_loss": 0.07249587029218674, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000363630591891706, + "grad_norm": 8.345192909240723, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8840577006340027, + "num_tokens": 607427328.0, + "step": 15922 + }, + { + "epoch": 2.0255692659966926, + "ewc_loss": 0.07247251272201538, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036583837936632335, + "grad_norm": 8.412782669067383, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8604794144630432, + "num_tokens": 607468243.0, + "step": 15923 + }, + { + "epoch": 2.025696476275283, + "ewc_loss": 0.07260696589946747, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036474151420406997, + "grad_norm": 8.398163795471191, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8495635986328125, + "num_tokens": 607502466.0, + "step": 15924 + }, + { + "epoch": 2.0258236865538737, + "ewc_loss": 0.07268348336219788, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003655067121144384, + "grad_norm": 8.386836051940918, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8560701608657837, + "num_tokens": 607544093.0, + "step": 15925 + }, + { + "epoch": 2.025950896832464, + "ewc_loss": 0.07251010090112686, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036377288051880896, + "grad_norm": 8.39355182647705, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8711696863174438, + "num_tokens": 607576054.0, + "step": 15926 + }, + { + "epoch": 2.0260781071110547, + "ewc_loss": 0.07249388843774796, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036605214700102806, + "grad_norm": 8.42466926574707, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8778244853019714, + "num_tokens": 607608462.0, + "step": 15927 + }, + { + "epoch": 2.0262053173896453, + "ewc_loss": 0.07246915251016617, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003633634187281132, + "grad_norm": 8.348127365112305, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8584634065628052, + "num_tokens": 607645272.0, + "step": 15928 + }, + { + "epoch": 2.026332527668236, + "ewc_loss": 0.07308877259492874, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036711819120682776, + "grad_norm": 8.463008880615234, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8679846525192261, + "num_tokens": 607682585.0, + "step": 15929 + }, + { + "epoch": 2.0264597379468263, + "ewc_loss": 0.07248471677303314, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003610776038840413, + "grad_norm": 8.281754493713379, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8828619122505188, + "num_tokens": 607721302.0, + "step": 15930 + }, + { + "epoch": 2.0265869482254164, + "ewc_loss": 0.0731760710477829, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003679911606013775, + "grad_norm": 8.456766128540039, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8614537715911865, + "num_tokens": 607758357.0, + "step": 15931 + }, + { + "epoch": 2.026714158504007, + "ewc_loss": 0.07260125875473022, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003622430085670203, + "grad_norm": 8.311559677124023, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8657710552215576, + "num_tokens": 607795467.0, + "step": 15932 + }, + { + "epoch": 2.0268413687825975, + "ewc_loss": 0.07303977757692337, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036662822822108865, + "grad_norm": 8.39629077911377, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8605208396911621, + "num_tokens": 607833192.0, + "step": 15933 + }, + { + "epoch": 2.026968579061188, + "ewc_loss": 0.07237161695957184, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036238806205801666, + "grad_norm": 8.3295259475708, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8751023411750793, + "num_tokens": 607866672.0, + "step": 15934 + }, + { + "epoch": 2.0270957893397785, + "ewc_loss": 0.0727647989988327, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036631981492973864, + "grad_norm": 8.447477340698242, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8660107851028442, + "num_tokens": 607907976.0, + "step": 15935 + }, + { + "epoch": 2.027222999618369, + "ewc_loss": 0.07269066572189331, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003631371655501425, + "grad_norm": 8.350287437438965, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8694735169410706, + "num_tokens": 607946009.0, + "step": 15936 + }, + { + "epoch": 2.0273502098969596, + "ewc_loss": 0.07278705388307571, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036410102620720863, + "grad_norm": 8.375301361083984, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8702566027641296, + "num_tokens": 607986800.0, + "step": 15937 + }, + { + "epoch": 2.02747742017555, + "ewc_loss": 0.07271164655685425, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036334688775241375, + "grad_norm": 8.401251792907715, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8596055507659912, + "num_tokens": 608021807.0, + "step": 15938 + }, + { + "epoch": 2.0276046304541406, + "ewc_loss": 0.07273674011230469, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036359785008244216, + "grad_norm": 8.34372615814209, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8653687238693237, + "num_tokens": 608060294.0, + "step": 15939 + }, + { + "epoch": 2.027731840732731, + "ewc_loss": 0.07283549755811691, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003645854303613305, + "grad_norm": 8.400247573852539, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8643175363540649, + "num_tokens": 608100790.0, + "step": 15940 + }, + { + "epoch": 2.0278590510113217, + "ewc_loss": 0.07240884006023407, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003627602709457278, + "grad_norm": 8.337964057922363, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8607248067855835, + "num_tokens": 608142164.0, + "step": 15941 + }, + { + "epoch": 2.027986261289912, + "ewc_loss": 0.07296836376190186, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003659141075331718, + "grad_norm": 8.40633773803711, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8656840920448303, + "num_tokens": 608184175.0, + "step": 15942 + }, + { + "epoch": 2.0281134715685027, + "ewc_loss": 0.0726541206240654, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036277167964726686, + "grad_norm": 8.362554550170898, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8597239255905151, + "num_tokens": 608226801.0, + "step": 15943 + }, + { + "epoch": 2.0282406818470933, + "ewc_loss": 0.07275541126728058, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003662260132841766, + "grad_norm": 8.391712188720703, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.872647762298584, + "num_tokens": 608265155.0, + "step": 15944 + }, + { + "epoch": 2.028367892125684, + "ewc_loss": 0.07256323844194412, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000364304258255288, + "grad_norm": 8.413470268249512, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8759206533432007, + "num_tokens": 608303817.0, + "step": 15945 + }, + { + "epoch": 2.0284951024042743, + "ewc_loss": 0.07253986597061157, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003640704962890595, + "grad_norm": 8.396397590637207, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8719350695610046, + "num_tokens": 608338845.0, + "step": 15946 + }, + { + "epoch": 2.028622312682865, + "ewc_loss": 0.07260584831237793, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003647303383331746, + "grad_norm": 8.408721923828125, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.865991473197937, + "num_tokens": 608377661.0, + "step": 15947 + }, + { + "epoch": 2.0287495229614554, + "ewc_loss": 0.07251451909542084, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036381708923727274, + "grad_norm": 8.411197662353516, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8655229806900024, + "num_tokens": 608412897.0, + "step": 15948 + }, + { + "epoch": 2.028876733240046, + "ewc_loss": 0.07289192080497742, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003651496663223952, + "grad_norm": 8.76103687286377, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8580265641212463, + "num_tokens": 608453584.0, + "step": 15949 + }, + { + "epoch": 2.0290039435186364, + "ewc_loss": 0.07202917337417603, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000358963618054986, + "grad_norm": 8.299015045166016, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8616193532943726, + "num_tokens": 608492211.0, + "step": 15950 + }, + { + "epoch": 2.029131153797227, + "ewc_loss": 0.07307812571525574, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003694531333167106, + "grad_norm": 8.515963554382324, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8615312576293945, + "num_tokens": 608534317.0, + "step": 15951 + }, + { + "epoch": 2.0292583640758175, + "ewc_loss": 0.07223547250032425, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003585851809475571, + "grad_norm": 8.23314380645752, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8757686614990234, + "num_tokens": 608569763.0, + "step": 15952 + }, + { + "epoch": 2.029385574354408, + "ewc_loss": 0.07333763688802719, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003696068306453526, + "grad_norm": 8.508369445800781, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8689095377922058, + "num_tokens": 608612145.0, + "step": 15953 + }, + { + "epoch": 2.0295127846329986, + "ewc_loss": 0.07227195799350739, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036139143048785627, + "grad_norm": 8.308944702148438, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8659200668334961, + "num_tokens": 608649528.0, + "step": 15954 + }, + { + "epoch": 2.0296399949115886, + "ewc_loss": 0.0734555572271347, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003707860014401376, + "grad_norm": 8.546554565429688, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.860173225402832, + "num_tokens": 608686973.0, + "step": 15955 + }, + { + "epoch": 2.029767205190179, + "ewc_loss": 0.07234956324100494, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000362167542334646, + "grad_norm": 8.324146270751953, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8676986694335938, + "num_tokens": 608729169.0, + "step": 15956 + }, + { + "epoch": 2.0298944154687697, + "ewc_loss": 0.07334264367818832, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036965691833756864, + "grad_norm": 8.551036834716797, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8633670210838318, + "num_tokens": 608764502.0, + "step": 15957 + }, + { + "epoch": 2.0300216257473602, + "ewc_loss": 0.07265304028987885, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036276085302233696, + "grad_norm": 8.393899917602539, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8666383028030396, + "num_tokens": 608805190.0, + "step": 15958 + }, + { + "epoch": 2.0301488360259508, + "ewc_loss": 0.0731828510761261, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003680589434225112, + "grad_norm": 8.522197723388672, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8709962368011475, + "num_tokens": 608840006.0, + "step": 15959 + }, + { + "epoch": 2.0302760463045413, + "ewc_loss": 0.07266920804977417, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036292249569669366, + "grad_norm": 8.471115112304688, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8731586337089539, + "num_tokens": 608871551.0, + "step": 15960 + }, + { + "epoch": 2.030403256583132, + "ewc_loss": 0.07294648885726929, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003656953922472894, + "grad_norm": 8.566001892089844, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8441067934036255, + "num_tokens": 608909610.0, + "step": 15961 + }, + { + "epoch": 2.0305304668617223, + "ewc_loss": 0.07267642766237259, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036299476050771773, + "grad_norm": 8.507511138916016, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8636314868927002, + "num_tokens": 608950758.0, + "step": 15962 + }, + { + "epoch": 2.030657677140313, + "ewc_loss": 0.07261177897453308, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036234830622561276, + "grad_norm": 8.393614768981934, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8530657291412354, + "num_tokens": 608990695.0, + "step": 15963 + }, + { + "epoch": 2.0307848874189034, + "ewc_loss": 0.07257817685604095, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003644535900093615, + "grad_norm": 8.406716346740723, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8795172572135925, + "num_tokens": 609030978.0, + "step": 15964 + }, + { + "epoch": 2.030912097697494, + "ewc_loss": 0.07263311743736267, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003625616373028606, + "grad_norm": 8.414132118225098, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8752449750900269, + "num_tokens": 609071974.0, + "step": 15965 + }, + { + "epoch": 2.0310393079760845, + "ewc_loss": 0.07254768908023834, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003641488146968186, + "grad_norm": 8.423097610473633, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8682950735092163, + "num_tokens": 609111268.0, + "step": 15966 + }, + { + "epoch": 2.031166518254675, + "ewc_loss": 0.07246493548154831, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003633212181739509, + "grad_norm": 8.396523475646973, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8623946905136108, + "num_tokens": 609149681.0, + "step": 15967 + }, + { + "epoch": 2.0312937285332655, + "ewc_loss": 0.0726541131734848, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003652130253612995, + "grad_norm": 8.448951721191406, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8712303638458252, + "num_tokens": 609189121.0, + "step": 15968 + }, + { + "epoch": 2.031420938811856, + "ewc_loss": 0.07269121706485748, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036314260796643794, + "grad_norm": 8.358948707580566, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.874265730381012, + "num_tokens": 609228070.0, + "step": 15969 + }, + { + "epoch": 2.0315481490904466, + "ewc_loss": 0.07304976135492325, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003667280834633857, + "grad_norm": 8.491250038146973, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8681881427764893, + "num_tokens": 609269718.0, + "step": 15970 + }, + { + "epoch": 2.031675359369037, + "ewc_loss": 0.07258328050374985, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036206329241394997, + "grad_norm": 8.353565216064453, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8687839508056641, + "num_tokens": 609307236.0, + "step": 15971 + }, + { + "epoch": 2.0318025696476276, + "ewc_loss": 0.07302631437778473, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003664935939013958, + "grad_norm": 8.628037452697754, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8473701477050781, + "num_tokens": 609342993.0, + "step": 15972 + }, + { + "epoch": 2.031929779926218, + "ewc_loss": 0.07169991731643677, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003581124183256179, + "grad_norm": 8.274259567260742, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8724302053451538, + "num_tokens": 609376817.0, + "step": 15973 + }, + { + "epoch": 2.0320569902048087, + "ewc_loss": 0.07326038181781769, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003712757315952331, + "grad_norm": 8.904864311218262, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8630512952804565, + "num_tokens": 609411209.0, + "step": 15974 + }, + { + "epoch": 2.032184200483399, + "ewc_loss": 0.07145516574382782, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.0003556649317033589, + "grad_norm": 8.193023681640625, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.88315749168396, + "num_tokens": 609449055.0, + "step": 15975 + }, + { + "epoch": 2.0323114107619897, + "ewc_loss": 0.07387617230415344, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003774335782509297, + "grad_norm": 9.238818168640137, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.864487886428833, + "num_tokens": 609486332.0, + "step": 15976 + }, + { + "epoch": 2.0324386210405803, + "ewc_loss": 0.07160935550928116, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003547654487192631, + "grad_norm": 8.14270305633545, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8523768186569214, + "num_tokens": 609526924.0, + "step": 15977 + }, + { + "epoch": 2.032565831319171, + "ewc_loss": 0.0747341439127922, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00038601330015808344, + "grad_norm": 9.293264389038086, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8792179822921753, + "num_tokens": 609559542.0, + "step": 15978 + }, + { + "epoch": 2.032693041597761, + "ewc_loss": 0.07196186482906342, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003582905337680131, + "grad_norm": 8.250396728515625, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.859232485294342, + "num_tokens": 609598989.0, + "step": 15979 + }, + { + "epoch": 2.0328202518763514, + "ewc_loss": 0.07472661137580872, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003859379794448614, + "grad_norm": 8.939757347106934, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8537145853042603, + "num_tokens": 609635790.0, + "step": 15980 + }, + { + "epoch": 2.032947462154942, + "ewc_loss": 0.07258698344230652, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003645417164079845, + "grad_norm": 8.653092384338379, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.858690619468689, + "num_tokens": 609670120.0, + "step": 15981 + }, + { + "epoch": 2.0330746724335325, + "ewc_loss": 0.07318199425935745, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00037049181992188096, + "grad_norm": 8.533921241760254, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.864629328250885, + "num_tokens": 609713260.0, + "step": 15982 + }, + { + "epoch": 2.033201882712123, + "ewc_loss": 0.07287271320819855, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003673989849630743, + "grad_norm": 8.452032089233398, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8718221187591553, + "num_tokens": 609753236.0, + "step": 15983 + }, + { + "epoch": 2.0333290929907135, + "ewc_loss": 0.07292144000530243, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036788624129258096, + "grad_norm": 8.657812118530273, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8684955835342407, + "num_tokens": 609791629.0, + "step": 15984 + }, + { + "epoch": 2.033456303269304, + "ewc_loss": 0.07247280329465866, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003633999149315059, + "grad_norm": 8.434674263000488, + "learning_rate": 1e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8499640822410583, + "num_tokens": 609833457.0, + "step": 15985 + }, + { + "epoch": 2.0335835135478946, + "ewc_loss": 0.07297730445861816, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003684449184220284, + "grad_norm": 8.585738182067871, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8575531840324402, + "num_tokens": 609866007.0, + "step": 15986 + }, + { + "epoch": 2.033710723826485, + "ewc_loss": 0.07238119840621948, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003624838718678802, + "grad_norm": 8.549443244934082, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8702857494354248, + "num_tokens": 609903346.0, + "step": 15987 + }, + { + "epoch": 2.0338379341050756, + "ewc_loss": 0.07237052917480469, + "ewc_loss_diag": 3.600120544433594e-05, + "ewc_loss_parallel": 0.00036481861025094986, + "grad_norm": 8.551725387573242, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8711260557174683, + "num_tokens": 609935885.0, + "step": 15988 + }, + { + "epoch": 2.033965144383666, + "ewc_loss": 0.07243214547634125, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003629933635238558, + "grad_norm": 8.487189292907715, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8587844371795654, + "num_tokens": 609971127.0, + "step": 15989 + }, + { + "epoch": 2.0340923546622567, + "ewc_loss": 0.07232651114463806, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036193698178976774, + "grad_norm": 8.383853912353516, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8570599555969238, + "num_tokens": 610016719.0, + "step": 15990 + }, + { + "epoch": 2.034219564940847, + "ewc_loss": 0.0726315975189209, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036498784902505577, + "grad_norm": 8.55832290649414, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8549584746360779, + "num_tokens": 610057002.0, + "step": 15991 + }, + { + "epoch": 2.0343467752194377, + "ewc_loss": 0.07213293015956879, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000360001198714599, + "grad_norm": 8.38376235961914, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8705921173095703, + "num_tokens": 610092779.0, + "step": 15992 + }, + { + "epoch": 2.0344739854980283, + "ewc_loss": 0.07266679406166077, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036533985985442996, + "grad_norm": 8.570942878723145, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.855004072189331, + "num_tokens": 610137092.0, + "step": 15993 + }, + { + "epoch": 2.034601195776619, + "ewc_loss": 0.07203031331300735, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00035897502675652504, + "grad_norm": 8.293819427490234, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8748495578765869, + "num_tokens": 610178632.0, + "step": 15994 + }, + { + "epoch": 2.0347284060552093, + "ewc_loss": 0.07333841919898987, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003696146886795759, + "grad_norm": 8.679092407226562, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8632224798202515, + "num_tokens": 610219789.0, + "step": 15995 + }, + { + "epoch": 2.0348556163338, + "ewc_loss": 0.0721665769815445, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003578962932806462, + "grad_norm": 8.335744857788086, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8514084815979004, + "num_tokens": 610259104.0, + "step": 15996 + }, + { + "epoch": 2.0349828266123904, + "ewc_loss": 0.07344450056552887, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037067546509206295, + "grad_norm": 8.639520645141602, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8592582941055298, + "num_tokens": 610296317.0, + "step": 15997 + }, + { + "epoch": 2.035110036890981, + "ewc_loss": 0.07227322459220886, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003589626867324114, + "grad_norm": 8.336243629455566, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8642005920410156, + "num_tokens": 610339300.0, + "step": 15998 + }, + { + "epoch": 2.0352372471695714, + "ewc_loss": 0.0732795000076294, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036902542342431843, + "grad_norm": 8.579581260681152, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8598695397377014, + "num_tokens": 610373917.0, + "step": 15999 + }, + { + "epoch": 2.035364457448162, + "ewc_loss": 0.0722409039735794, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003610808926168829, + "grad_norm": 8.356742858886719, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.864220142364502, + "num_tokens": 610419571.0, + "step": 16000 + }, + { + "epoch": 2.0354916677267525, + "ewc_loss": 0.07299196720123291, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036859160172753036, + "grad_norm": 8.508072853088379, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8570734262466431, + "num_tokens": 610458073.0, + "step": 16001 + }, + { + "epoch": 2.035618878005343, + "ewc_loss": 0.07265998423099518, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036283035296946764, + "grad_norm": 8.454353332519531, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8545233011245728, + "num_tokens": 610497223.0, + "step": 16002 + }, + { + "epoch": 2.0357460882839336, + "ewc_loss": 0.07271483540534973, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036582021857611835, + "grad_norm": 8.492837905883789, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8466082811355591, + "num_tokens": 610532563.0, + "step": 16003 + }, + { + "epoch": 2.0358732985625236, + "ewc_loss": 0.07271961867809296, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003634266904555261, + "grad_norm": 8.414090156555176, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8780163526535034, + "num_tokens": 610566644.0, + "step": 16004 + }, + { + "epoch": 2.036000508841114, + "ewc_loss": 0.07263386994600296, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003650105791166425, + "grad_norm": 8.435381889343262, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8538519144058228, + "num_tokens": 610604755.0, + "step": 16005 + }, + { + "epoch": 2.0361277191197047, + "ewc_loss": 0.07293738424777985, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003656043263617903, + "grad_norm": 8.406699180603027, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8687056303024292, + "num_tokens": 610645905.0, + "step": 16006 + }, + { + "epoch": 2.036254929398295, + "ewc_loss": 0.07276327908039093, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003663046227302402, + "grad_norm": 8.432385444641113, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.882720410823822, + "num_tokens": 610686800.0, + "step": 16007 + }, + { + "epoch": 2.0363821396768857, + "ewc_loss": 0.07296081632375717, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036583864130079746, + "grad_norm": 8.431878089904785, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8647865056991577, + "num_tokens": 610728291.0, + "step": 16008 + }, + { + "epoch": 2.0365093499554763, + "ewc_loss": 0.07301868498325348, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036641728365793824, + "grad_norm": 8.534173965454102, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8621878027915955, + "num_tokens": 610759818.0, + "step": 16009 + }, + { + "epoch": 2.036636560234067, + "ewc_loss": 0.07248532027006149, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003635250614024699, + "grad_norm": 8.346616744995117, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8514311909675598, + "num_tokens": 610800125.0, + "step": 16010 + }, + { + "epoch": 2.0367637705126573, + "ewc_loss": 0.07300732284784317, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003687450953293592, + "grad_norm": 8.472686767578125, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.856738805770874, + "num_tokens": 610839261.0, + "step": 16011 + }, + { + "epoch": 2.036890980791248, + "ewc_loss": 0.07276052236557007, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036383565748110414, + "grad_norm": 8.395604133605957, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8701095581054688, + "num_tokens": 610879092.0, + "step": 16012 + }, + { + "epoch": 2.0370181910698384, + "ewc_loss": 0.07309014350175858, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003671318991109729, + "grad_norm": 8.442373275756836, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8651243448257446, + "num_tokens": 610914494.0, + "step": 16013 + }, + { + "epoch": 2.037145401348429, + "ewc_loss": 0.07266709953546524, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000365342857548967, + "grad_norm": 8.39224624633789, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8744062781333923, + "num_tokens": 610950784.0, + "step": 16014 + }, + { + "epoch": 2.0372726116270194, + "ewc_loss": 0.0729408711194992, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003680805675685406, + "grad_norm": 8.517236709594727, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.853635311126709, + "num_tokens": 610989254.0, + "step": 16015 + }, + { + "epoch": 2.03739982190561, + "ewc_loss": 0.07260528206825256, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036472472129389644, + "grad_norm": 8.43028736114502, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8585014343261719, + "num_tokens": 611022668.0, + "step": 16016 + }, + { + "epoch": 2.0375270321842005, + "ewc_loss": 0.07309761643409729, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036720657953992486, + "grad_norm": 8.442569732666016, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8655002117156982, + "num_tokens": 611062110.0, + "step": 16017 + }, + { + "epoch": 2.037654242462791, + "ewc_loss": 0.07293657213449478, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036559620639309287, + "grad_norm": 8.44474983215332, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8706059455871582, + "num_tokens": 611098663.0, + "step": 16018 + }, + { + "epoch": 2.0377814527413816, + "ewc_loss": 0.07296395301818848, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003658700152300298, + "grad_norm": 8.416101455688477, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8622564077377319, + "num_tokens": 611134058.0, + "step": 16019 + }, + { + "epoch": 2.037908663019972, + "ewc_loss": 0.07304753363132477, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036670578992925584, + "grad_norm": 8.525643348693848, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8653866052627563, + "num_tokens": 611174748.0, + "step": 16020 + }, + { + "epoch": 2.0380358732985626, + "ewc_loss": 0.0728408470749855, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036463892320171, + "grad_norm": 8.35166072845459, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8804351687431335, + "num_tokens": 611216792.0, + "step": 16021 + }, + { + "epoch": 2.038163083577153, + "ewc_loss": 0.07332813739776611, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036951180663891137, + "grad_norm": 8.493773460388184, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8745040893554688, + "num_tokens": 611253996.0, + "step": 16022 + }, + { + "epoch": 2.0382902938557437, + "ewc_loss": 0.07284151017665863, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003646455879788846, + "grad_norm": 8.41895580291748, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8745297789573669, + "num_tokens": 611295351.0, + "step": 16023 + }, + { + "epoch": 2.038417504134334, + "ewc_loss": 0.07323256880044937, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036855615326203406, + "grad_norm": 8.531905174255371, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8642764091491699, + "num_tokens": 611330513.0, + "step": 16024 + }, + { + "epoch": 2.0385447144129247, + "ewc_loss": 0.0727473646402359, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003637041081674397, + "grad_norm": 8.429696083068848, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.873329222202301, + "num_tokens": 611369679.0, + "step": 16025 + }, + { + "epoch": 2.0386719246915153, + "ewc_loss": 0.07312043011188507, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003674347826745361, + "grad_norm": 8.434285163879395, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8724550008773804, + "num_tokens": 611410398.0, + "step": 16026 + }, + { + "epoch": 2.038799134970106, + "ewc_loss": 0.07286175340414047, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036484800511971116, + "grad_norm": 8.37585163116455, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8567277789115906, + "num_tokens": 611447151.0, + "step": 16027 + }, + { + "epoch": 2.0389263452486963, + "ewc_loss": 0.07303811609745026, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036661160993389785, + "grad_norm": 8.448400497436523, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8563119173049927, + "num_tokens": 611486923.0, + "step": 16028 + }, + { + "epoch": 2.0390535555272864, + "ewc_loss": 0.07308123260736465, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036704278318211436, + "grad_norm": 8.472770690917969, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8699377775192261, + "num_tokens": 611519502.0, + "step": 16029 + }, + { + "epoch": 2.039180765805877, + "ewc_loss": 0.07290872186422348, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036531768273562193, + "grad_norm": 8.387504577636719, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8516685962677002, + "num_tokens": 611557647.0, + "step": 16030 + }, + { + "epoch": 2.0393079760844675, + "ewc_loss": 0.07313632220029831, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003675936895888299, + "grad_norm": 8.45176887512207, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8637479543685913, + "num_tokens": 611597405.0, + "step": 16031 + }, + { + "epoch": 2.039435186363058, + "ewc_loss": 0.07305687665939331, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036679927143268287, + "grad_norm": 8.472535133361816, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8585306406021118, + "num_tokens": 611635996.0, + "step": 16032 + }, + { + "epoch": 2.0395623966416485, + "ewc_loss": 0.07286733388900757, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003649037971626967, + "grad_norm": 8.415505409240723, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8734934329986572, + "num_tokens": 611677900.0, + "step": 16033 + }, + { + "epoch": 2.039689606920239, + "ewc_loss": 0.07306912541389465, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003669216821435839, + "grad_norm": 8.456184387207031, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8688087463378906, + "num_tokens": 611719184.0, + "step": 16034 + }, + { + "epoch": 2.0398168171988296, + "ewc_loss": 0.07286950200796127, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000364925479516387, + "grad_norm": 8.365239143371582, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.863932192325592, + "num_tokens": 611761270.0, + "step": 16035 + }, + { + "epoch": 2.03994402747742, + "ewc_loss": 0.07299366593360901, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003661670780275017, + "grad_norm": 8.479605674743652, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8637561798095703, + "num_tokens": 611794939.0, + "step": 16036 + }, + { + "epoch": 2.0400712377560106, + "ewc_loss": 0.07283182442188263, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003645487013272941, + "grad_norm": 8.482544898986816, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.877232015132904, + "num_tokens": 611834093.0, + "step": 16037 + }, + { + "epoch": 2.040198448034601, + "ewc_loss": 0.0730009526014328, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036623995401896536, + "grad_norm": 8.442139625549316, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.860538125038147, + "num_tokens": 611871327.0, + "step": 16038 + }, + { + "epoch": 2.0403256583131917, + "ewc_loss": 0.07295140624046326, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003657445777207613, + "grad_norm": 8.493391990661621, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8580625653266907, + "num_tokens": 611907837.0, + "step": 16039 + }, + { + "epoch": 2.040452868591782, + "ewc_loss": 0.07281337678432465, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036436424124985933, + "grad_norm": 8.449142456054688, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8469107151031494, + "num_tokens": 611950964.0, + "step": 16040 + }, + { + "epoch": 2.0405800788703727, + "ewc_loss": 0.07284620404243469, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003646924742497504, + "grad_norm": 10.554738998413086, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8602075576782227, + "num_tokens": 611985867.0, + "step": 16041 + }, + { + "epoch": 2.0407072891489633, + "ewc_loss": 0.07281817495822906, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003644122334662825, + "grad_norm": 8.267080307006836, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8674012422561646, + "num_tokens": 612017819.0, + "step": 16042 + }, + { + "epoch": 2.040834499427554, + "ewc_loss": 0.07546897232532501, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00039092014776542783, + "grad_norm": 9.042426109313965, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8503594398498535, + "num_tokens": 612060669.0, + "step": 16043 + }, + { + "epoch": 2.0409617097061443, + "ewc_loss": 0.07205419987440109, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00035677244886755943, + "grad_norm": 10.512333869934082, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8503661155700684, + "num_tokens": 612099801.0, + "step": 16044 + }, + { + "epoch": 2.041088919984735, + "ewc_loss": 0.07282240688800812, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036445457953959703, + "grad_norm": 8.283469200134277, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8686008453369141, + "num_tokens": 612139454.0, + "step": 16045 + }, + { + "epoch": 2.0412161302633254, + "ewc_loss": 0.07433708757162094, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037960134795866907, + "grad_norm": 8.859472274780273, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8459876179695129, + "num_tokens": 612181695.0, + "step": 16046 + }, + { + "epoch": 2.041343340541916, + "ewc_loss": 0.07170446217060089, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00035327504156157374, + "grad_norm": 8.2178316116333, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8810024261474609, + "num_tokens": 612223200.0, + "step": 16047 + }, + { + "epoch": 2.0414705508205064, + "ewc_loss": 0.07480986416339874, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00038432914880104363, + "grad_norm": 8.869963645935059, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8663316369056702, + "num_tokens": 612265946.0, + "step": 16048 + }, + { + "epoch": 2.041597761099097, + "ewc_loss": 0.07211598753929138, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00035739029408432543, + "grad_norm": 8.285500526428223, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8818231821060181, + "num_tokens": 612301246.0, + "step": 16049 + }, + { + "epoch": 2.0417249713776875, + "ewc_loss": 0.07435424625873566, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037977294414304197, + "grad_norm": 8.76486873626709, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8500549793243408, + "num_tokens": 612337789.0, + "step": 16050 + }, + { + "epoch": 2.041852181656278, + "ewc_loss": 0.07224777340888977, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003611495776567608, + "grad_norm": 8.365962982177734, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8776062726974487, + "num_tokens": 612372391.0, + "step": 16051 + }, + { + "epoch": 2.0419793919348685, + "ewc_loss": 0.07395784556865692, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037580891512334347, + "grad_norm": 8.799240112304688, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8680942058563232, + "num_tokens": 612408806.0, + "step": 16052 + }, + { + "epoch": 2.0421066022134586, + "ewc_loss": 0.0724378228187561, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036060871207155287, + "grad_norm": 8.306687355041504, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8769499063491821, + "num_tokens": 612446349.0, + "step": 16053 + }, + { + "epoch": 2.042233812492049, + "ewc_loss": 0.07368114590644836, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00037548335967585444, + "grad_norm": 8.599312782287598, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8777966499328613, + "num_tokens": 612489899.0, + "step": 16054 + }, + { + "epoch": 2.0423610227706397, + "ewc_loss": 0.07255187630653381, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003641906660050154, + "grad_norm": 8.427130699157715, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8597743511199951, + "num_tokens": 612532314.0, + "step": 16055 + }, + { + "epoch": 2.04248823304923, + "ewc_loss": 0.07339523732662201, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003701828536577523, + "grad_norm": 8.616130828857422, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8726786375045776, + "num_tokens": 612568033.0, + "step": 16056 + }, + { + "epoch": 2.0426154433278207, + "ewc_loss": 0.07256510853767395, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036432291381061077, + "grad_norm": 8.374250411987305, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8708900213241577, + "num_tokens": 612605160.0, + "step": 16057 + }, + { + "epoch": 2.0427426536064113, + "ewc_loss": 0.07357442378997803, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000371974689187482, + "grad_norm": 8.605071067810059, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8532451391220093, + "num_tokens": 612647851.0, + "step": 16058 + }, + { + "epoch": 2.042869863885002, + "ewc_loss": 0.07245032489299774, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036317508784122765, + "grad_norm": 8.420963287353516, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8654001951217651, + "num_tokens": 612684770.0, + "step": 16059 + }, + { + "epoch": 2.0429970741635923, + "ewc_loss": 0.07306476682424545, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003693195467349142, + "grad_norm": 8.606175422668457, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8542488217353821, + "num_tokens": 612721689.0, + "step": 16060 + }, + { + "epoch": 2.043124284442183, + "ewc_loss": 0.07272379100322723, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003634683380369097, + "grad_norm": 8.39057445526123, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.863929271697998, + "num_tokens": 612757256.0, + "step": 16061 + }, + { + "epoch": 2.0432514947207734, + "ewc_loss": 0.07324334979057312, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036866392474621534, + "grad_norm": 8.556005477905273, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8716623783111572, + "num_tokens": 612797806.0, + "step": 16062 + }, + { + "epoch": 2.043378704999364, + "ewc_loss": 0.07284513115882874, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036468179314397275, + "grad_norm": 8.479802131652832, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.875573992729187, + "num_tokens": 612833507.0, + "step": 16063 + }, + { + "epoch": 2.0435059152779544, + "ewc_loss": 0.07311917841434479, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003674222098197788, + "grad_norm": 8.534863471984863, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8713823556900024, + "num_tokens": 612872920.0, + "step": 16064 + }, + { + "epoch": 2.043633125556545, + "ewc_loss": 0.07277239114046097, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036395437200553715, + "grad_norm": 8.430191040039062, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8730018138885498, + "num_tokens": 612909534.0, + "step": 16065 + }, + { + "epoch": 2.0437603358351355, + "ewc_loss": 0.07306347787380219, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003668652498163283, + "grad_norm": 8.56794548034668, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8618338108062744, + "num_tokens": 612950531.0, + "step": 16066 + }, + { + "epoch": 2.043887546113726, + "ewc_loss": 0.07266243547201157, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036285482929088175, + "grad_norm": 8.448856353759766, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8681588172912598, + "num_tokens": 612984638.0, + "step": 16067 + }, + { + "epoch": 2.0440147563923166, + "ewc_loss": 0.07297656685113907, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003659961512312293, + "grad_norm": 8.583317756652832, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8767896890640259, + "num_tokens": 613020632.0, + "step": 16068 + }, + { + "epoch": 2.044141966670907, + "ewc_loss": 0.07264364510774612, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003626669349614531, + "grad_norm": 8.452031135559082, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8697139024734497, + "num_tokens": 613057381.0, + "step": 16069 + }, + { + "epoch": 2.0442691769494976, + "ewc_loss": 0.07310116291046143, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003672420862130821, + "grad_norm": 8.527679443359375, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8624974489212036, + "num_tokens": 613094287.0, + "step": 16070 + }, + { + "epoch": 2.044396387228088, + "ewc_loss": 0.07267339527606964, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036296446342021227, + "grad_norm": 8.441344261169434, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8726367354393005, + "num_tokens": 613126315.0, + "step": 16071 + }, + { + "epoch": 2.0445235975066787, + "ewc_loss": 0.07292933762073517, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036552385427057743, + "grad_norm": 8.504656791687012, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8612486720085144, + "num_tokens": 613160970.0, + "step": 16072 + }, + { + "epoch": 2.044650807785269, + "ewc_loss": 0.07276473939418793, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036387788713909686, + "grad_norm": 8.37569808959961, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8687311410903931, + "num_tokens": 613204685.0, + "step": 16073 + }, + { + "epoch": 2.0447780180638597, + "ewc_loss": 0.07326304167509079, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003688608994707465, + "grad_norm": 8.550980567932129, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8752822279930115, + "num_tokens": 613242394.0, + "step": 16074 + }, + { + "epoch": 2.0449052283424503, + "ewc_loss": 0.072660431265831, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003628348058555275, + "grad_norm": 8.327927589416504, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8734697103500366, + "num_tokens": 613283517.0, + "step": 16075 + }, + { + "epoch": 2.045032438621041, + "ewc_loss": 0.07315844297409058, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00037025631172582507, + "grad_norm": 8.583113670349121, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8687595129013062, + "num_tokens": 613317237.0, + "step": 16076 + }, + { + "epoch": 2.045159648899631, + "ewc_loss": 0.07244439423084259, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036311583244241774, + "grad_norm": 8.347321510314941, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8693408966064453, + "num_tokens": 613355741.0, + "step": 16077 + }, + { + "epoch": 2.0452868591782214, + "ewc_loss": 0.07324311137199402, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00037110294215381145, + "grad_norm": 8.569689750671387, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.856150209903717, + "num_tokens": 613392131.0, + "step": 16078 + }, + { + "epoch": 2.045414069456812, + "ewc_loss": 0.07281449437141418, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036437538801692426, + "grad_norm": 8.52924633026123, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8568990230560303, + "num_tokens": 613433687.0, + "step": 16079 + }, + { + "epoch": 2.0455412797354025, + "ewc_loss": 0.07315592467784882, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003677896747831255, + "grad_norm": 8.425019264221191, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8697695732116699, + "num_tokens": 613472882.0, + "step": 16080 + }, + { + "epoch": 2.045668490013993, + "ewc_loss": 0.07292608916759491, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003679327783174813, + "grad_norm": 8.48284912109375, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.870391845703125, + "num_tokens": 613515869.0, + "step": 16081 + }, + { + "epoch": 2.0457957002925835, + "ewc_loss": 0.07268650829792023, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003655369800981134, + "grad_norm": 8.488763809204102, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8655116558074951, + "num_tokens": 613551835.0, + "step": 16082 + }, + { + "epoch": 2.045922910571174, + "ewc_loss": 0.07293804734945297, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.000368052365956828, + "grad_norm": 8.43896770477295, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.863926887512207, + "num_tokens": 613592277.0, + "step": 16083 + }, + { + "epoch": 2.0460501208497646, + "ewc_loss": 0.07301429659128189, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003663734532892704, + "grad_norm": 10.7190580368042, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8654749393463135, + "num_tokens": 613625594.0, + "step": 16084 + }, + { + "epoch": 2.046177331128355, + "ewc_loss": 0.07302205264568329, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036889236071147025, + "grad_norm": 8.299222946166992, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8692041635513306, + "num_tokens": 613659924.0, + "step": 16085 + }, + { + "epoch": 2.0463045414069456, + "ewc_loss": 0.07562468945980072, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00039491881034336984, + "grad_norm": 9.047806739807129, + "learning_rate": 1e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8478659391403198, + "num_tokens": 613697067.0, + "step": 16086 + }, + { + "epoch": 2.046431751685536, + "ewc_loss": 0.07219351828098297, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036060705315321684, + "grad_norm": 8.36563777923584, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8737802505493164, + "num_tokens": 613738211.0, + "step": 16087 + }, + { + "epoch": 2.0465589619641267, + "ewc_loss": 0.07541453093290329, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00039281719364225864, + "grad_norm": 9.005134582519531, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8715637922286987, + "num_tokens": 613782590.0, + "step": 16088 + }, + { + "epoch": 2.046686172242717, + "ewc_loss": 0.07264911383390427, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036516302498057485, + "grad_norm": 8.430118560791016, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8734424114227295, + "num_tokens": 613825770.0, + "step": 16089 + }, + { + "epoch": 2.0468133825213077, + "ewc_loss": 0.07458117604255676, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00038448366103693843, + "grad_norm": 8.876258850097656, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8713192939758301, + "num_tokens": 613864025.0, + "step": 16090 + }, + { + "epoch": 2.0469405927998983, + "ewc_loss": 0.07295437157154083, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003682155511341989, + "grad_norm": 8.497272491455078, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.868990421295166, + "num_tokens": 613907181.0, + "step": 16091 + }, + { + "epoch": 2.047067803078489, + "ewc_loss": 0.07403148710727692, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003789867914747447, + "grad_norm": 8.808680534362793, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8568671345710754, + "num_tokens": 613951258.0, + "step": 16092 + }, + { + "epoch": 2.0471950133570793, + "ewc_loss": 0.07289081066846848, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036757998168468475, + "grad_norm": 8.56798267364502, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8645445704460144, + "num_tokens": 613991221.0, + "step": 16093 + }, + { + "epoch": 2.04732222363567, + "ewc_loss": 0.07346475124359131, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003733194025699049, + "grad_norm": 8.666281700134277, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8766505718231201, + "num_tokens": 614026862.0, + "step": 16094 + }, + { + "epoch": 2.0474494339142604, + "ewc_loss": 0.07288612425327301, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036753309541381896, + "grad_norm": 8.510167121887207, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8567502498626709, + "num_tokens": 614067560.0, + "step": 16095 + }, + { + "epoch": 2.047576644192851, + "ewc_loss": 0.07317361980676651, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003704080590978265, + "grad_norm": 8.705048561096191, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8699580430984497, + "num_tokens": 614106974.0, + "step": 16096 + }, + { + "epoch": 2.0477038544714414, + "ewc_loss": 0.07274916768074036, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003661634982563555, + "grad_norm": 8.499722480773926, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8922213912010193, + "num_tokens": 614145641.0, + "step": 16097 + }, + { + "epoch": 2.047831064750032, + "ewc_loss": 0.07313884794712067, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003700603556353599, + "grad_norm": 8.583806991577148, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8710178732872009, + "num_tokens": 614181476.0, + "step": 16098 + }, + { + "epoch": 2.0479582750286225, + "ewc_loss": 0.07275518774986267, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036622374318540096, + "grad_norm": 8.534290313720703, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8604063987731934, + "num_tokens": 614219212.0, + "step": 16099 + }, + { + "epoch": 2.048085485307213, + "ewc_loss": 0.0728960782289505, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036763265961781144, + "grad_norm": 8.522028923034668, + "learning_rate": 1e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.8371037244796753, + "num_tokens": 614264017.0, + "step": 16100 + }, + { + "epoch": 2.0482126955858035, + "ewc_loss": 0.07289479672908783, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003676198539324105, + "grad_norm": 8.53630542755127, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8490888476371765, + "num_tokens": 614299031.0, + "step": 16101 + }, + { + "epoch": 2.0483399058643936, + "ewc_loss": 0.07286548614501953, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036732672015205026, + "grad_norm": 8.47828483581543, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.872612476348877, + "num_tokens": 614334387.0, + "step": 16102 + }, + { + "epoch": 2.048467116142984, + "ewc_loss": 0.07316277921199799, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00037029970553703606, + "grad_norm": 8.538771629333496, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8673528432846069, + "num_tokens": 614372438.0, + "step": 16103 + }, + { + "epoch": 2.0485943264215747, + "ewc_loss": 0.07273001968860626, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003659720823634416, + "grad_norm": 8.458796501159668, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8607338666915894, + "num_tokens": 614409976.0, + "step": 16104 + }, + { + "epoch": 2.048721536700165, + "ewc_loss": 0.0731905996799469, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003705778217408806, + "grad_norm": 8.581033706665039, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8735133409500122, + "num_tokens": 614448740.0, + "step": 16105 + }, + { + "epoch": 2.0488487469787557, + "ewc_loss": 0.07304270565509796, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003666575066745281, + "grad_norm": 8.480195999145508, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8701866865158081, + "num_tokens": 614484644.0, + "step": 16106 + }, + { + "epoch": 2.0489759572573463, + "ewc_loss": 0.07323739677667618, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00037104584043845534, + "grad_norm": 8.539563179016113, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8462918400764465, + "num_tokens": 614521210.0, + "step": 16107 + }, + { + "epoch": 2.049103167535937, + "ewc_loss": 0.07286632061004639, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036733507295139134, + "grad_norm": 8.454463005065918, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8600651025772095, + "num_tokens": 614561783.0, + "step": 16108 + }, + { + "epoch": 2.0492303778145273, + "ewc_loss": 0.07342676818370819, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003704981063492596, + "grad_norm": 8.682479858398438, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8669554591178894, + "num_tokens": 614598643.0, + "step": 16109 + }, + { + "epoch": 2.049357588093118, + "ewc_loss": 0.07280619442462921, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036429238389246166, + "grad_norm": 8.487482070922852, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8656967878341675, + "num_tokens": 614633520.0, + "step": 16110 + }, + { + "epoch": 2.0494847983717084, + "ewc_loss": 0.07352335751056671, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037146409158594906, + "grad_norm": 8.582562446594238, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8522876501083374, + "num_tokens": 614673266.0, + "step": 16111 + }, + { + "epoch": 2.049612008650299, + "ewc_loss": 0.07263746857643127, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036504652234725654, + "grad_norm": 8.407376289367676, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8566791415214539, + "num_tokens": 614707201.0, + "step": 16112 + }, + { + "epoch": 2.0497392189288894, + "ewc_loss": 0.07332152873277664, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003718871739692986, + "grad_norm": 8.549431800842285, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.870893657207489, + "num_tokens": 614746542.0, + "step": 16113 + }, + { + "epoch": 2.04986642920748, + "ewc_loss": 0.07277482748031616, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003664201940409839, + "grad_norm": 8.391422271728516, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8554482460021973, + "num_tokens": 614781804.0, + "step": 16114 + }, + { + "epoch": 2.0499936394860705, + "ewc_loss": 0.07327932119369507, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003714650811161846, + "grad_norm": 8.490342140197754, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8656637668609619, + "num_tokens": 614817281.0, + "step": 16115 + }, + { + "epoch": 2.050120849764661, + "ewc_loss": 0.07277479767799377, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036641984479501843, + "grad_norm": 8.391809463500977, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8561168909072876, + "num_tokens": 614853666.0, + "step": 16116 + }, + { + "epoch": 2.0502480600432516, + "ewc_loss": 0.07318183034658432, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003704901610035449, + "grad_norm": 8.474960327148438, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8674265742301941, + "num_tokens": 614891221.0, + "step": 16117 + }, + { + "epoch": 2.050375270321842, + "ewc_loss": 0.07287871837615967, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036745902616530657, + "grad_norm": 8.433201789855957, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8735156059265137, + "num_tokens": 614925956.0, + "step": 16118 + }, + { + "epoch": 2.0505024806004326, + "ewc_loss": 0.07338786125183105, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037010913365520537, + "grad_norm": 8.47616195678711, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8723312020301819, + "num_tokens": 614959934.0, + "step": 16119 + }, + { + "epoch": 2.050629690879023, + "ewc_loss": 0.0730244368314743, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.00036891622585244477, + "grad_norm": 8.44698429107666, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8620551824569702, + "num_tokens": 615000579.0, + "step": 16120 + }, + { + "epoch": 2.0507569011576137, + "ewc_loss": 0.07303456962108612, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003690175653900951, + "grad_norm": 8.5230131149292, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8598065376281738, + "num_tokens": 615037371.0, + "step": 16121 + }, + { + "epoch": 2.050884111436204, + "ewc_loss": 0.0732433944940567, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003686643613036722, + "grad_norm": 8.515992164611816, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8548204898834229, + "num_tokens": 615075948.0, + "step": 16122 + }, + { + "epoch": 2.0510113217147947, + "ewc_loss": 0.07305465638637543, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003692184400279075, + "grad_norm": 8.427512168884277, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8806782960891724, + "num_tokens": 615114619.0, + "step": 16123 + }, + { + "epoch": 2.0511385319933853, + "ewc_loss": 0.07316967844963074, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036792727769352496, + "grad_norm": 8.463419914245605, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8656927943229675, + "num_tokens": 615149260.0, + "step": 16124 + }, + { + "epoch": 2.051265742271976, + "ewc_loss": 0.0728776603937149, + "ewc_loss_diag": 3.62396240234375e-05, + "ewc_loss_parallel": 0.0003674485196825117, + "grad_norm": 8.442002296447754, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.867813229560852, + "num_tokens": 615183284.0, + "step": 16125 + }, + { + "epoch": 2.0513929525505663, + "ewc_loss": 0.0732252299785614, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003684827242977917, + "grad_norm": 8.475081443786621, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8729208707809448, + "num_tokens": 615220015.0, + "step": 16126 + }, + { + "epoch": 2.0515201628291564, + "ewc_loss": 0.07310251146554947, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000367255590390414, + "grad_norm": 8.42891788482666, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.858373761177063, + "num_tokens": 615255614.0, + "step": 16127 + }, + { + "epoch": 2.051647373107747, + "ewc_loss": 0.07332906126976013, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036952103255316615, + "grad_norm": 8.480584144592285, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8667252063751221, + "num_tokens": 615290232.0, + "step": 16128 + }, + { + "epoch": 2.0517745833863374, + "ewc_loss": 0.0729975625872612, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003662061062641442, + "grad_norm": 8.335951805114746, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8711529970169067, + "num_tokens": 615328204.0, + "step": 16129 + }, + { + "epoch": 2.051901793664928, + "ewc_loss": 0.07343701273202896, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003706005809362978, + "grad_norm": 8.536537170410156, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8583173155784607, + "num_tokens": 615362846.0, + "step": 16130 + }, + { + "epoch": 2.0520290039435185, + "ewc_loss": 0.07291635870933533, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003653941093944013, + "grad_norm": 8.3056058883667, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8647568821907043, + "num_tokens": 615405404.0, + "step": 16131 + }, + { + "epoch": 2.052156214222109, + "ewc_loss": 0.07362548261880875, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037248528678901494, + "grad_norm": 8.613018989562988, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8539583086967468, + "num_tokens": 615439835.0, + "step": 16132 + }, + { + "epoch": 2.0522834245006996, + "ewc_loss": 0.07283248007297516, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036455527879297733, + "grad_norm": 8.36775016784668, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8760510087013245, + "num_tokens": 615477832.0, + "step": 16133 + }, + { + "epoch": 2.05241063477929, + "ewc_loss": 0.07372720539569855, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037350255297496915, + "grad_norm": 8.528618812561035, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8567906022071838, + "num_tokens": 615518089.0, + "step": 16134 + }, + { + "epoch": 2.0525378450578806, + "ewc_loss": 0.07291261851787567, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003653566527646035, + "grad_norm": 8.384407043457031, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8700519800186157, + "num_tokens": 615557286.0, + "step": 16135 + }, + { + "epoch": 2.052665055336471, + "ewc_loss": 0.07357671856880188, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037199765210971236, + "grad_norm": 8.540510177612305, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.870526909828186, + "num_tokens": 615593284.0, + "step": 16136 + }, + { + "epoch": 2.0527922656150617, + "ewc_loss": 0.0730798989534378, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003670294536277652, + "grad_norm": 8.428327560424805, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8622349500656128, + "num_tokens": 615633579.0, + "step": 16137 + }, + { + "epoch": 2.052919475893652, + "ewc_loss": 0.0734231248497963, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003704617265611887, + "grad_norm": 8.485440254211426, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8709882497787476, + "num_tokens": 615680135.0, + "step": 16138 + }, + { + "epoch": 2.0530466861722427, + "ewc_loss": 0.07310307770967484, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003672612365335226, + "grad_norm": 8.463531494140625, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8538039922714233, + "num_tokens": 615714801.0, + "step": 16139 + }, + { + "epoch": 2.0531738964508333, + "ewc_loss": 0.07322625815868378, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003684930852614343, + "grad_norm": 8.495970726013184, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8582556247711182, + "num_tokens": 615750538.0, + "step": 16140 + }, + { + "epoch": 2.053301106729424, + "ewc_loss": 0.07322613149881363, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036849177558906376, + "grad_norm": 8.481524467468262, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8688000440597534, + "num_tokens": 615789465.0, + "step": 16141 + }, + { + "epoch": 2.0534283170080143, + "ewc_loss": 0.07298097014427185, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036604018532671034, + "grad_norm": 8.39736557006836, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.851291298866272, + "num_tokens": 615832553.0, + "step": 16142 + }, + { + "epoch": 2.053555527286605, + "ewc_loss": 0.07318440824747086, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036807454307563603, + "grad_norm": 8.42960262298584, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8622390627861023, + "num_tokens": 615868341.0, + "step": 16143 + }, + { + "epoch": 2.0536827375651954, + "ewc_loss": 0.07315415889024734, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003677720669656992, + "grad_norm": 8.447490692138672, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8755884766578674, + "num_tokens": 615904324.0, + "step": 16144 + }, + { + "epoch": 2.053809947843786, + "ewc_loss": 0.07311739027500153, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003674043982755393, + "grad_norm": 8.398052215576172, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8728320598602295, + "num_tokens": 615947656.0, + "step": 16145 + }, + { + "epoch": 2.0539371581223764, + "ewc_loss": 0.0732058510184288, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036828898009844124, + "grad_norm": 8.458075523376465, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8598092794418335, + "num_tokens": 615989050.0, + "step": 16146 + }, + { + "epoch": 2.054064368400967, + "ewc_loss": 0.07312141358852386, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036744456156156957, + "grad_norm": 8.42025089263916, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8688004016876221, + "num_tokens": 616027638.0, + "step": 16147 + }, + { + "epoch": 2.0541915786795575, + "ewc_loss": 0.07337164878845215, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036994690890423954, + "grad_norm": 8.511194229125977, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8733952045440674, + "num_tokens": 616066392.0, + "step": 16148 + }, + { + "epoch": 2.054318788958148, + "ewc_loss": 0.07315253466367722, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036775582702830434, + "grad_norm": 8.360976219177246, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8740525245666504, + "num_tokens": 616108303.0, + "step": 16149 + }, + { + "epoch": 2.0544459992367385, + "ewc_loss": 0.07350873947143555, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003713179030455649, + "grad_norm": 8.529630661010742, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8579056262969971, + "num_tokens": 616154174.0, + "step": 16150 + }, + { + "epoch": 2.0545732095153286, + "ewc_loss": 0.07306629419326782, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003668934223242104, + "grad_norm": 8.377479553222656, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8620051741600037, + "num_tokens": 616193954.0, + "step": 16151 + }, + { + "epoch": 2.054700419793919, + "ewc_loss": 0.0737319216132164, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037354970118030906, + "grad_norm": 8.566640853881836, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8702294826507568, + "num_tokens": 616232771.0, + "step": 16152 + }, + { + "epoch": 2.0548276300725097, + "ewc_loss": 0.07315555214881897, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036778594949282706, + "grad_norm": 8.46462345123291, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8767609596252441, + "num_tokens": 616265151.0, + "step": 16153 + }, + { + "epoch": 2.0549548403511, + "ewc_loss": 0.07340510189533234, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037028148653917015, + "grad_norm": 8.514196395874023, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8592983484268188, + "num_tokens": 616303886.0, + "step": 16154 + }, + { + "epoch": 2.0550820506296907, + "ewc_loss": 0.07326478511095047, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036887830356135964, + "grad_norm": 8.464290618896484, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8763221502304077, + "num_tokens": 616340040.0, + "step": 16155 + }, + { + "epoch": 2.0552092609082813, + "ewc_loss": 0.07349303364753723, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003711608296725899, + "grad_norm": 8.624792098999023, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8703826069831848, + "num_tokens": 616371329.0, + "step": 16156 + }, + { + "epoch": 2.055336471186872, + "ewc_loss": 0.07297077775001526, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036593826371245086, + "grad_norm": 8.354364395141602, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8791172504425049, + "num_tokens": 616406383.0, + "step": 16157 + }, + { + "epoch": 2.0554636814654623, + "ewc_loss": 0.0736987292766571, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037321780109778047, + "grad_norm": 8.583650588989258, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8527370095252991, + "num_tokens": 616445600.0, + "step": 16158 + }, + { + "epoch": 2.055590891744053, + "ewc_loss": 0.07282872498035431, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003645177639555186, + "grad_norm": 8.349555969238281, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8577953577041626, + "num_tokens": 616485376.0, + "step": 16159 + }, + { + "epoch": 2.0557181020226434, + "ewc_loss": 0.07377469539642334, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003739774110727012, + "grad_norm": 8.573556900024414, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.855691134929657, + "num_tokens": 616527104.0, + "step": 16160 + }, + { + "epoch": 2.055845312301234, + "ewc_loss": 0.07313500344753265, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003651391016319394, + "grad_norm": 8.345470428466797, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8675560355186462, + "num_tokens": 616567557.0, + "step": 16161 + }, + { + "epoch": 2.0559725225798244, + "ewc_loss": 0.07395230233669281, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003757534723263234, + "grad_norm": 8.577529907226562, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8704248666763306, + "num_tokens": 616601612.0, + "step": 16162 + }, + { + "epoch": 2.056099732858415, + "ewc_loss": 0.07294247299432755, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036565519985742867, + "grad_norm": 8.341014862060547, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8637923002243042, + "num_tokens": 616638019.0, + "step": 16163 + }, + { + "epoch": 2.0562269431370055, + "ewc_loss": 0.07387734949588776, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003750039613805711, + "grad_norm": 9.24167251586914, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8643123507499695, + "num_tokens": 616675224.0, + "step": 16164 + }, + { + "epoch": 2.056354153415596, + "ewc_loss": 0.07237043976783752, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003599348128773272, + "grad_norm": 8.209113121032715, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8636542558670044, + "num_tokens": 616713628.0, + "step": 16165 + }, + { + "epoch": 2.0564813636941865, + "ewc_loss": 0.07492208480834961, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003854513051919639, + "grad_norm": 8.759401321411133, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8636904954910278, + "num_tokens": 616755985.0, + "step": 16166 + }, + { + "epoch": 2.056608573972777, + "ewc_loss": 0.07257449626922607, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036197545705363154, + "grad_norm": 8.25866413116455, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8672105073928833, + "num_tokens": 616786564.0, + "step": 16167 + }, + { + "epoch": 2.0567357842513676, + "ewc_loss": 0.07452701777219772, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00038150063483044505, + "grad_norm": 8.713247299194336, + "learning_rate": 1e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.836700975894928, + "num_tokens": 616823285.0, + "step": 16168 + }, + { + "epoch": 2.056862994529958, + "ewc_loss": 0.07317569106817245, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003679873771034181, + "grad_norm": 8.306713104248047, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8739731311798096, + "num_tokens": 616864465.0, + "step": 16169 + }, + { + "epoch": 2.0569902048085487, + "ewc_loss": 0.07431714236736298, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003794019285123795, + "grad_norm": 8.627584457397461, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8554995059967041, + "num_tokens": 616906670.0, + "step": 16170 + }, + { + "epoch": 2.057117415087139, + "ewc_loss": 0.07325080037117004, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036873851786367595, + "grad_norm": 8.383963584899902, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8646931648254395, + "num_tokens": 616947656.0, + "step": 16171 + }, + { + "epoch": 2.0572446253657297, + "ewc_loss": 0.07414358109235764, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003776662633754313, + "grad_norm": 8.604009628295898, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8590439558029175, + "num_tokens": 616980161.0, + "step": 16172 + }, + { + "epoch": 2.0573718356443202, + "ewc_loss": 0.07340948283672333, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037032534601166844, + "grad_norm": 8.442388534545898, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8651527166366577, + "num_tokens": 617015281.0, + "step": 16173 + }, + { + "epoch": 2.0574990459229108, + "ewc_loss": 0.07374972105026245, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003737276419997215, + "grad_norm": 8.515543937683105, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8665656447410583, + "num_tokens": 617054174.0, + "step": 16174 + }, + { + "epoch": 2.057626256201501, + "ewc_loss": 0.0733395367860794, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003696258645504713, + "grad_norm": 8.35391902923584, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8650834560394287, + "num_tokens": 617095160.0, + "step": 16175 + }, + { + "epoch": 2.0577534664800914, + "ewc_loss": 0.07373489439487457, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037357944529503584, + "grad_norm": 8.557825088500977, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8700800538063049, + "num_tokens": 617130781.0, + "step": 16176 + }, + { + "epoch": 2.057880676758682, + "ewc_loss": 0.07324565201997757, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003686869749799371, + "grad_norm": 8.400983810424805, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8639894723892212, + "num_tokens": 617169473.0, + "step": 16177 + }, + { + "epoch": 2.0580078870372724, + "ewc_loss": 0.0738627165555954, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003748576564248651, + "grad_norm": 8.535134315490723, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8584780097007751, + "num_tokens": 617212208.0, + "step": 16178 + }, + { + "epoch": 2.058135097315863, + "ewc_loss": 0.07312478125095367, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003674783220048994, + "grad_norm": 8.379638671875, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8660517930984497, + "num_tokens": 617246055.0, + "step": 16179 + }, + { + "epoch": 2.0582623075944535, + "ewc_loss": 0.0738351047039032, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037458151928149164, + "grad_norm": 8.493087768554688, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8606688976287842, + "num_tokens": 617285614.0, + "step": 16180 + }, + { + "epoch": 2.058389517873044, + "ewc_loss": 0.07337231934070587, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036995363188907504, + "grad_norm": 8.434404373168945, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8722803592681885, + "num_tokens": 617323401.0, + "step": 16181 + }, + { + "epoch": 2.0585167281516346, + "ewc_loss": 0.07361546158790588, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037238511140458286, + "grad_norm": 8.448765754699707, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8515280485153198, + "num_tokens": 617372550.0, + "step": 16182 + }, + { + "epoch": 2.058643938430225, + "ewc_loss": 0.07351285219192505, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037135896855033934, + "grad_norm": 8.467482566833496, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.870539665222168, + "num_tokens": 617407894.0, + "step": 16183 + }, + { + "epoch": 2.0587711487088156, + "ewc_loss": 0.07344034314155579, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003706338757183403, + "grad_norm": 8.513484001159668, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8569240570068359, + "num_tokens": 617450117.0, + "step": 16184 + }, + { + "epoch": 2.058898358987406, + "ewc_loss": 0.07331421226263046, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003693726030178368, + "grad_norm": 8.459766387939453, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8704344034194946, + "num_tokens": 617485885.0, + "step": 16185 + }, + { + "epoch": 2.0590255692659967, + "ewc_loss": 0.07351325452327728, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003713630430866033, + "grad_norm": 8.535662651062012, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8484501838684082, + "num_tokens": 617520511.0, + "step": 16186 + }, + { + "epoch": 2.059152779544587, + "ewc_loss": 0.07333828508853912, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003696133499033749, + "grad_norm": 8.445971488952637, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.857333779335022, + "num_tokens": 617558288.0, + "step": 16187 + }, + { + "epoch": 2.0592799898231777, + "ewc_loss": 0.07334060221910477, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000369636487448588, + "grad_norm": 8.428390502929688, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8643077611923218, + "num_tokens": 617602991.0, + "step": 16188 + }, + { + "epoch": 2.0594072001017683, + "ewc_loss": 0.07339233160018921, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037015380803495646, + "grad_norm": 8.500861167907715, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8634935617446899, + "num_tokens": 617639629.0, + "step": 16189 + }, + { + "epoch": 2.059534410380359, + "ewc_loss": 0.07327185571193695, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036894905497319996, + "grad_norm": 8.384714126586914, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8585022687911987, + "num_tokens": 617676218.0, + "step": 16190 + }, + { + "epoch": 2.0596616206589493, + "ewc_loss": 0.07352522015571594, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037148265982978046, + "grad_norm": 8.448155403137207, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8578272461891174, + "num_tokens": 617716270.0, + "step": 16191 + }, + { + "epoch": 2.05978883093754, + "ewc_loss": 0.073272705078125, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036895746598020196, + "grad_norm": 8.489206314086914, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8739384412765503, + "num_tokens": 617755559.0, + "step": 16192 + }, + { + "epoch": 2.0599160412161304, + "ewc_loss": 0.07343527674674988, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000370583264157176, + "grad_norm": 8.522793769836426, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8681819438934326, + "num_tokens": 617793522.0, + "step": 16193 + }, + { + "epoch": 2.060043251494721, + "ewc_loss": 0.0732155293226242, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003683857212308794, + "grad_norm": 8.434962272644043, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.869406521320343, + "num_tokens": 617828005.0, + "step": 16194 + }, + { + "epoch": 2.0601704617733114, + "ewc_loss": 0.07337748259305954, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037000529118813574, + "grad_norm": 9.532190322875977, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8609314560890198, + "num_tokens": 617863824.0, + "step": 16195 + }, + { + "epoch": 2.060297672051902, + "ewc_loss": 0.07238581776618958, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036008862662129104, + "grad_norm": 8.141223907470703, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8622210621833801, + "num_tokens": 617903648.0, + "step": 16196 + }, + { + "epoch": 2.0604248823304925, + "ewc_loss": 0.07521937042474747, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003884241741616279, + "grad_norm": 8.807443618774414, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8611291646957397, + "num_tokens": 617946229.0, + "step": 16197 + }, + { + "epoch": 2.060552092609083, + "ewc_loss": 0.0724136009812355, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003603664808906615, + "grad_norm": 8.166887283325195, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8664910197257996, + "num_tokens": 617990486.0, + "step": 16198 + }, + { + "epoch": 2.0606793028876735, + "ewc_loss": 0.07545600086450577, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003907904902007431, + "grad_norm": 8.874438285827637, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8782409429550171, + "num_tokens": 618031743.0, + "step": 16199 + }, + { + "epoch": 2.0608065131662636, + "ewc_loss": 0.07284919917583466, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003647225094027817, + "grad_norm": 8.296162605285645, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8613479137420654, + "num_tokens": 618070181.0, + "step": 16200 + }, + { + "epoch": 2.060933723444854, + "ewc_loss": 0.07494291663169861, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003856596304103732, + "grad_norm": 8.912381172180176, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8711666464805603, + "num_tokens": 618106396.0, + "step": 16201 + }, + { + "epoch": 2.0610609337234447, + "ewc_loss": 0.07314790785312653, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003677095810417086, + "grad_norm": 8.338293075561523, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.870999813079834, + "num_tokens": 618148281.0, + "step": 16202 + }, + { + "epoch": 2.061188144002035, + "ewc_loss": 0.07469155639410019, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003831460198853165, + "grad_norm": 8.76119327545166, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8596286773681641, + "num_tokens": 618182721.0, + "step": 16203 + }, + { + "epoch": 2.0613153542806257, + "ewc_loss": 0.07314793765544891, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036770987208001316, + "grad_norm": 8.384840965270996, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8818385601043701, + "num_tokens": 618219497.0, + "step": 16204 + }, + { + "epoch": 2.0614425645592163, + "ewc_loss": 0.07446913421154022, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00038092181785032153, + "grad_norm": 8.670753479003906, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8822336196899414, + "num_tokens": 618258643.0, + "step": 16205 + }, + { + "epoch": 2.061569774837807, + "ewc_loss": 0.07328790426254272, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036910950439050794, + "grad_norm": 9.520118713378906, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8801142573356628, + "num_tokens": 618292682.0, + "step": 16206 + }, + { + "epoch": 2.0616969851163973, + "ewc_loss": 0.07265171408653259, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003627475816756487, + "grad_norm": 8.330801010131836, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8508296012878418, + "num_tokens": 618331375.0, + "step": 16207 + }, + { + "epoch": 2.061824195394988, + "ewc_loss": 0.07465294748544693, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00038275992847047746, + "grad_norm": 8.799689292907715, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8699116110801697, + "num_tokens": 618367452.0, + "step": 16208 + }, + { + "epoch": 2.0619514056735784, + "ewc_loss": 0.0722491592168808, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00035872208536602557, + "grad_norm": 8.194669723510742, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8631857633590698, + "num_tokens": 618410470.0, + "step": 16209 + }, + { + "epoch": 2.062078615952169, + "ewc_loss": 0.07508912682533264, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003871217486448586, + "grad_norm": 8.898255348205566, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.858940839767456, + "num_tokens": 618447524.0, + "step": 16210 + }, + { + "epoch": 2.0622058262307594, + "ewc_loss": 0.07257486879825592, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003619791241362691, + "grad_norm": 8.267401695251465, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8466118574142456, + "num_tokens": 618481793.0, + "step": 16211 + }, + { + "epoch": 2.06233303650935, + "ewc_loss": 0.07502101361751556, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003864406025968492, + "grad_norm": 8.829768180847168, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8705854415893555, + "num_tokens": 618517097.0, + "step": 16212 + }, + { + "epoch": 2.0624602467879405, + "ewc_loss": 0.07308436930179596, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003670741571113467, + "grad_norm": 8.38721752166748, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8622626066207886, + "num_tokens": 618554221.0, + "step": 16213 + }, + { + "epoch": 2.062587457066531, + "ewc_loss": 0.07522629201412201, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038116920040920377, + "grad_norm": 8.885805130004883, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8757752776145935, + "num_tokens": 618591279.0, + "step": 16214 + }, + { + "epoch": 2.0627146673451215, + "ewc_loss": 0.07317689061164856, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036799939698539674, + "grad_norm": 8.434154510498047, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8530595302581787, + "num_tokens": 618628168.0, + "step": 16215 + }, + { + "epoch": 2.062841877623712, + "ewc_loss": 0.07415477186441422, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003777781967073679, + "grad_norm": 8.661948204040527, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.870991587638855, + "num_tokens": 618664580.0, + "step": 16216 + }, + { + "epoch": 2.0629690879023026, + "ewc_loss": 0.07307794690132141, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036700995406135917, + "grad_norm": 8.410759925842285, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8538182973861694, + "num_tokens": 618699007.0, + "step": 16217 + }, + { + "epoch": 2.063096298180893, + "ewc_loss": 0.07401798665523529, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003764103166759014, + "grad_norm": 8.602088928222656, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8672384023666382, + "num_tokens": 618736630.0, + "step": 16218 + }, + { + "epoch": 2.0632235084594837, + "ewc_loss": 0.07323670387268066, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003685975098051131, + "grad_norm": 8.40535831451416, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8740252256393433, + "num_tokens": 618777868.0, + "step": 16219 + }, + { + "epoch": 2.063350718738074, + "ewc_loss": 0.07392212748527527, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000375451723812148, + "grad_norm": 8.559737205505371, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8686855435371399, + "num_tokens": 618819134.0, + "step": 16220 + }, + { + "epoch": 2.0634779290166647, + "ewc_loss": 0.07327325642108917, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003689630830194801, + "grad_norm": 8.388477325439453, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8647782206535339, + "num_tokens": 618862231.0, + "step": 16221 + }, + { + "epoch": 2.0636051392952552, + "ewc_loss": 0.07379548251628876, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037418529973365366, + "grad_norm": 8.50669002532959, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8588496446609497, + "num_tokens": 618906816.0, + "step": 16222 + }, + { + "epoch": 2.0637323495738458, + "ewc_loss": 0.07349427044391632, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037117316969670355, + "grad_norm": 8.461886405944824, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8652249574661255, + "num_tokens": 618943239.0, + "step": 16223 + }, + { + "epoch": 2.0638595598524363, + "ewc_loss": 0.07370235025882721, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037325400626286864, + "grad_norm": 8.524575233459473, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8782405853271484, + "num_tokens": 618978641.0, + "step": 16224 + }, + { + "epoch": 2.0639867701310264, + "ewc_loss": 0.0736130028963089, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037236048956401646, + "grad_norm": 8.466465950012207, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8669012188911438, + "num_tokens": 619020522.0, + "step": 16225 + }, + { + "epoch": 2.064113980409617, + "ewc_loss": 0.07371953874826431, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037342586438171566, + "grad_norm": 8.481378555297852, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8772740364074707, + "num_tokens": 619060769.0, + "step": 16226 + }, + { + "epoch": 2.0642411906882074, + "ewc_loss": 0.07362233102321625, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037245373823679984, + "grad_norm": 8.444609642028809, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.876083493232727, + "num_tokens": 619098711.0, + "step": 16227 + }, + { + "epoch": 2.064368400966798, + "ewc_loss": 0.07384470105171204, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037467750371433794, + "grad_norm": 8.57844352722168, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8620509505271912, + "num_tokens": 619133673.0, + "step": 16228 + }, + { + "epoch": 2.0644956112453885, + "ewc_loss": 0.07339595258235931, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003701899549923837, + "grad_norm": 8.41249942779541, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8607783317565918, + "num_tokens": 619174554.0, + "step": 16229 + }, + { + "epoch": 2.064622821523979, + "ewc_loss": 0.07388067245483398, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003750371979549527, + "grad_norm": 8.542405128479004, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8554179668426514, + "num_tokens": 619211095.0, + "step": 16230 + }, + { + "epoch": 2.0647500318025696, + "ewc_loss": 0.07340486347675323, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003702791000250727, + "grad_norm": 8.457198143005371, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8748432993888855, + "num_tokens": 619246346.0, + "step": 16231 + }, + { + "epoch": 2.06487724208116, + "ewc_loss": 0.07381545007228851, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037438501021824777, + "grad_norm": 8.528582572937012, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8639302253723145, + "num_tokens": 619280629.0, + "step": 16232 + }, + { + "epoch": 2.0650044523597506, + "ewc_loss": 0.07337497174739838, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003699801454786211, + "grad_norm": 8.430204391479492, + "learning_rate": 1e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8461405038833618, + "num_tokens": 619320928.0, + "step": 16233 + }, + { + "epoch": 2.065131662638341, + "ewc_loss": 0.07379041612148285, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003741346299648285, + "grad_norm": 8.471989631652832, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8674150705337524, + "num_tokens": 619369747.0, + "step": 16234 + }, + { + "epoch": 2.0652588729169317, + "ewc_loss": 0.0734802857041359, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037103332579135895, + "grad_norm": 8.419023513793945, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8682565689086914, + "num_tokens": 619408694.0, + "step": 16235 + }, + { + "epoch": 2.065386083195522, + "ewc_loss": 0.07362313568592072, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003724617708940059, + "grad_norm": 8.438924789428711, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8736610412597656, + "num_tokens": 619446858.0, + "step": 16236 + }, + { + "epoch": 2.0655132934741127, + "ewc_loss": 0.07377311587333679, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037396157858893275, + "grad_norm": 8.546268463134766, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8643575310707092, + "num_tokens": 619482281.0, + "step": 16237 + }, + { + "epoch": 2.0656405037527032, + "ewc_loss": 0.07350149750709534, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037124540540389717, + "grad_norm": 8.408570289611816, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8747590184211731, + "num_tokens": 619520309.0, + "step": 16238 + }, + { + "epoch": 2.065767714031294, + "ewc_loss": 0.0739181637763977, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037541205529123545, + "grad_norm": 8.54743766784668, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8691970109939575, + "num_tokens": 619554662.0, + "step": 16239 + }, + { + "epoch": 2.0658949243098843, + "ewc_loss": 0.07357756793498993, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003720061795320362, + "grad_norm": 8.466529846191406, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8614590764045715, + "num_tokens": 619599632.0, + "step": 16240 + }, + { + "epoch": 2.066022134588475, + "ewc_loss": 0.07380920648574829, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037432252429425716, + "grad_norm": 8.478204727172852, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8797791004180908, + "num_tokens": 619639414.0, + "step": 16241 + }, + { + "epoch": 2.0661493448670654, + "ewc_loss": 0.07361658662557602, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003723963163793087, + "grad_norm": 8.491644859313965, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.867922306060791, + "num_tokens": 619675878.0, + "step": 16242 + }, + { + "epoch": 2.066276555145656, + "ewc_loss": 0.07368162274360657, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003730466414708644, + "grad_norm": 8.524147987365723, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8687745332717896, + "num_tokens": 619709486.0, + "step": 16243 + }, + { + "epoch": 2.0664037654242464, + "ewc_loss": 0.07365387678146362, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037276928196661174, + "grad_norm": 8.461498260498047, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8664243221282959, + "num_tokens": 619753576.0, + "step": 16244 + }, + { + "epoch": 2.066530975702837, + "ewc_loss": 0.07382406294345856, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003744710993487388, + "grad_norm": 8.605298042297363, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8662503957748413, + "num_tokens": 619792520.0, + "step": 16245 + }, + { + "epoch": 2.0666581859814275, + "ewc_loss": 0.0733458399772644, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003696889034472406, + "grad_norm": 8.49553108215332, + "learning_rate": 1e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.8437516689300537, + "num_tokens": 619827871.0, + "step": 16246 + }, + { + "epoch": 2.066785396260018, + "ewc_loss": 0.07376465201377869, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037387697375379503, + "grad_norm": 8.509832382202148, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8611977100372314, + "num_tokens": 619864527.0, + "step": 16247 + }, + { + "epoch": 2.0669126065386085, + "ewc_loss": 0.07344013452529907, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037063180934637785, + "grad_norm": 8.426332473754883, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8670325875282288, + "num_tokens": 619904934.0, + "step": 16248 + }, + { + "epoch": 2.0670398168171986, + "ewc_loss": 0.07363098114728928, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037254029302857816, + "grad_norm": 8.526510238647461, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8735578060150146, + "num_tokens": 619945472.0, + "step": 16249 + }, + { + "epoch": 2.067167027095789, + "ewc_loss": 0.073464035987854, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037087081000208855, + "grad_norm": 8.542620658874512, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8789569139480591, + "num_tokens": 619981738.0, + "step": 16250 + }, + { + "epoch": 2.0672942373743797, + "ewc_loss": 0.07343491911888123, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003705797134898603, + "grad_norm": 8.435391426086426, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8670287728309631, + "num_tokens": 620024593.0, + "step": 16251 + }, + { + "epoch": 2.06742144765297, + "ewc_loss": 0.0736112892627716, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003723433765117079, + "grad_norm": 8.517095565795898, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8716729879379272, + "num_tokens": 620064059.0, + "step": 16252 + }, + { + "epoch": 2.0675486579315607, + "ewc_loss": 0.07352833449840546, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003715138300321996, + "grad_norm": 8.459914207458496, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8648812770843506, + "num_tokens": 620100028.0, + "step": 16253 + }, + { + "epoch": 2.0676758682101513, + "ewc_loss": 0.07375915348529816, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037382199661806226, + "grad_norm": 8.546727180480957, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8782954812049866, + "num_tokens": 620135181.0, + "step": 16254 + }, + { + "epoch": 2.067803078488742, + "ewc_loss": 0.07345990836620331, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003708295407705009, + "grad_norm": 8.45400619506836, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8659154176712036, + "num_tokens": 620174887.0, + "step": 16255 + }, + { + "epoch": 2.0679302887673323, + "ewc_loss": 0.0737241804599762, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037347222678363323, + "grad_norm": 8.524763107299805, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8662437796592712, + "num_tokens": 620212924.0, + "step": 16256 + }, + { + "epoch": 2.068057499045923, + "ewc_loss": 0.0735267624258995, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037149808485992253, + "grad_norm": 8.475428581237793, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8555328249931335, + "num_tokens": 620251088.0, + "step": 16257 + }, + { + "epoch": 2.0681847093245134, + "ewc_loss": 0.07378734648227692, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037410392542369664, + "grad_norm": 8.603950500488281, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8648792505264282, + "num_tokens": 620287888.0, + "step": 16258 + }, + { + "epoch": 2.068311919603104, + "ewc_loss": 0.07342421263456345, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003704726113937795, + "grad_norm": 8.481708526611328, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.864687442779541, + "num_tokens": 620324937.0, + "step": 16259 + }, + { + "epoch": 2.0684391298816944, + "ewc_loss": 0.07369564473628998, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037318695103749633, + "grad_norm": 8.545306205749512, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8666151762008667, + "num_tokens": 620363390.0, + "step": 16260 + }, + { + "epoch": 2.068566340160285, + "ewc_loss": 0.07351426780223846, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037137314211577177, + "grad_norm": 8.477280616760254, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8728690147399902, + "num_tokens": 620403707.0, + "step": 16261 + }, + { + "epoch": 2.0686935504388755, + "ewc_loss": 0.07365719974040985, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037280251854099333, + "grad_norm": 8.46617317199707, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8703509569168091, + "num_tokens": 620445994.0, + "step": 16262 + }, + { + "epoch": 2.068820760717466, + "ewc_loss": 0.07360613346099854, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037229180452413857, + "grad_norm": 8.530323028564453, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8743187189102173, + "num_tokens": 620480883.0, + "step": 16263 + }, + { + "epoch": 2.0689479709960565, + "ewc_loss": 0.07352328300476074, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037146330578252673, + "grad_norm": 8.470735549926758, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8788650631904602, + "num_tokens": 620518935.0, + "step": 16264 + }, + { + "epoch": 2.069075181274647, + "ewc_loss": 0.07370807975530624, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037331125349737704, + "grad_norm": 8.55568790435791, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8600596189498901, + "num_tokens": 620551985.0, + "step": 16265 + }, + { + "epoch": 2.0692023915532376, + "ewc_loss": 0.07346302270889282, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037086071097292006, + "grad_norm": 8.460400581359863, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8614888787269592, + "num_tokens": 620592809.0, + "step": 16266 + }, + { + "epoch": 2.069329601831828, + "ewc_loss": 0.0737743079662323, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003739735984709114, + "grad_norm": 8.59289264678955, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8505303859710693, + "num_tokens": 620627951.0, + "step": 16267 + }, + { + "epoch": 2.0694568121104187, + "ewc_loss": 0.07343344390392303, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003705648996401578, + "grad_norm": 8.461394309997559, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8696023225784302, + "num_tokens": 620660892.0, + "step": 16268 + }, + { + "epoch": 2.069584022389009, + "ewc_loss": 0.07384263724088669, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037465683999471366, + "grad_norm": 8.611144065856934, + "learning_rate": 1e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8415250778198242, + "num_tokens": 620698165.0, + "step": 16269 + }, + { + "epoch": 2.0697112326675997, + "ewc_loss": 0.07328236103057861, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036905406159348786, + "grad_norm": 8.485566139221191, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8567334413528442, + "num_tokens": 620734758.0, + "step": 16270 + }, + { + "epoch": 2.0698384429461902, + "ewc_loss": 0.07360953092575073, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003723257395904511, + "grad_norm": 8.55673599243164, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.844773530960083, + "num_tokens": 620770224.0, + "step": 16271 + }, + { + "epoch": 2.0699656532247808, + "ewc_loss": 0.0734105035662651, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037033550324849784, + "grad_norm": 8.540687561035156, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8733621835708618, + "num_tokens": 620805724.0, + "step": 16272 + }, + { + "epoch": 2.070092863503371, + "ewc_loss": 0.07342486083507538, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037047904334031045, + "grad_norm": 8.491226196289062, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.853624701499939, + "num_tokens": 620847622.0, + "step": 16273 + }, + { + "epoch": 2.0702200737819614, + "ewc_loss": 0.07347756624221802, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037100608460605145, + "grad_norm": 8.53773307800293, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8623810410499573, + "num_tokens": 620879042.0, + "step": 16274 + }, + { + "epoch": 2.070347284060552, + "ewc_loss": 0.07335826754570007, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003698131476994604, + "grad_norm": 8.51024341583252, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8623033165931702, + "num_tokens": 620922685.0, + "step": 16275 + }, + { + "epoch": 2.0704744943391424, + "ewc_loss": 0.0734209343791008, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003704398113768548, + "grad_norm": 8.516844749450684, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.860439658164978, + "num_tokens": 620954854.0, + "step": 16276 + }, + { + "epoch": 2.070601704617733, + "ewc_loss": 0.07334718108177185, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003697022330015898, + "grad_norm": 8.482274055480957, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8619240522384644, + "num_tokens": 620994581.0, + "step": 16277 + }, + { + "epoch": 2.0707289148963235, + "ewc_loss": 0.073355533182621, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036978579009883106, + "grad_norm": 8.597277641296387, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.865734875202179, + "num_tokens": 621030052.0, + "step": 16278 + }, + { + "epoch": 2.070856125174914, + "ewc_loss": 0.07310429215431213, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003672733437269926, + "grad_norm": 8.421957969665527, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8747376799583435, + "num_tokens": 621072474.0, + "step": 16279 + }, + { + "epoch": 2.0709833354535045, + "ewc_loss": 0.07345245778560638, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003707550640683621, + "grad_norm": 8.606776237487793, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.87041836977005, + "num_tokens": 621104365.0, + "step": 16280 + }, + { + "epoch": 2.071110545732095, + "ewc_loss": 0.0729159265756607, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036538971471600235, + "grad_norm": 8.415496826171875, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8709495067596436, + "num_tokens": 621140497.0, + "step": 16281 + }, + { + "epoch": 2.0712377560106856, + "ewc_loss": 0.07406294345855713, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037197713390924037, + "grad_norm": 8.60225772857666, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8681239485740662, + "num_tokens": 621177431.0, + "step": 16282 + }, + { + "epoch": 2.071364966289276, + "ewc_loss": 0.07288233935832977, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003650538856163621, + "grad_norm": 8.5015287399292, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8701879978179932, + "num_tokens": 621212874.0, + "step": 16283 + }, + { + "epoch": 2.0714921765678667, + "ewc_loss": 0.0734168291091919, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003703987458720803, + "grad_norm": 8.591690063476562, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8711408376693726, + "num_tokens": 621249927.0, + "step": 16284 + }, + { + "epoch": 2.071619386846457, + "ewc_loss": 0.0734395682811737, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003657433844637126, + "grad_norm": 8.481282234191895, + "learning_rate": 1e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8427538871765137, + "num_tokens": 621280794.0, + "step": 16285 + }, + { + "epoch": 2.0717465971250477, + "ewc_loss": 0.07371969521045685, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003685446281451732, + "grad_norm": 8.56479263305664, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8488097190856934, + "num_tokens": 621321167.0, + "step": 16286 + }, + { + "epoch": 2.0718738074036382, + "ewc_loss": 0.07347704470157623, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036611815448850393, + "grad_norm": 8.445796966552734, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8775997161865234, + "num_tokens": 621357962.0, + "step": 16287 + }, + { + "epoch": 2.0720010176822288, + "ewc_loss": 0.07322683930397034, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003684988769236952, + "grad_norm": 8.524748802185059, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8839054107666016, + "num_tokens": 621398933.0, + "step": 16288 + }, + { + "epoch": 2.0721282279608193, + "ewc_loss": 0.07303917407989502, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036662223283201456, + "grad_norm": 8.476696014404297, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8855232000350952, + "num_tokens": 621442334.0, + "step": 16289 + }, + { + "epoch": 2.07225543823941, + "ewc_loss": 0.07326187193393707, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036884917062707245, + "grad_norm": 8.640172004699707, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8688122034072876, + "num_tokens": 621478706.0, + "step": 16290 + }, + { + "epoch": 2.0723826485180004, + "ewc_loss": 0.07283030450344086, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003645335673354566, + "grad_norm": 8.400197982788086, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8717606067657471, + "num_tokens": 621513911.0, + "step": 16291 + }, + { + "epoch": 2.072509858796591, + "ewc_loss": 0.0734744593501091, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037097505992278457, + "grad_norm": 8.622620582580566, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8606725931167603, + "num_tokens": 621554080.0, + "step": 16292 + }, + { + "epoch": 2.0726370690751814, + "ewc_loss": 0.07286051660776138, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036483563599176705, + "grad_norm": 8.433856010437012, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.867802619934082, + "num_tokens": 621592059.0, + "step": 16293 + }, + { + "epoch": 2.072764279353772, + "ewc_loss": 0.07336638867855072, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036989437649026513, + "grad_norm": 8.584676742553711, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8791760802268982, + "num_tokens": 621629626.0, + "step": 16294 + }, + { + "epoch": 2.0728914896323625, + "ewc_loss": 0.07274093478918076, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003636398178059608, + "grad_norm": 8.446479797363281, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8703848123550415, + "num_tokens": 621671341.0, + "step": 16295 + }, + { + "epoch": 2.073018699910953, + "ewc_loss": 0.07336404919624329, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003698710061144084, + "grad_norm": 8.585233688354492, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8808603286743164, + "num_tokens": 621708756.0, + "step": 16296 + }, + { + "epoch": 2.0731459101895435, + "ewc_loss": 0.07274572551250458, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036368772271089256, + "grad_norm": 8.445988655090332, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8714545965194702, + "num_tokens": 621748009.0, + "step": 16297 + }, + { + "epoch": 2.0732731204681336, + "ewc_loss": 0.07338292896747589, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037005971535108984, + "grad_norm": 8.583232879638672, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.874954104423523, + "num_tokens": 621788695.0, + "step": 16298 + }, + { + "epoch": 2.073400330746724, + "ewc_loss": 0.07275697588920593, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003638001799117774, + "grad_norm": 8.4616117477417, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.864228367805481, + "num_tokens": 621828099.0, + "step": 16299 + }, + { + "epoch": 2.0735275410253147, + "ewc_loss": 0.07330766320228577, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036930711939930916, + "grad_norm": 8.792655944824219, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8505105376243591, + "num_tokens": 621867033.0, + "step": 16300 + }, + { + "epoch": 2.073654751303905, + "ewc_loss": 0.07258717715740204, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036210226244293153, + "grad_norm": 8.389510154724121, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8566474914550781, + "num_tokens": 621907435.0, + "step": 16301 + }, + { + "epoch": 2.0737819615824957, + "ewc_loss": 0.07364834100008011, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003727138973772526, + "grad_norm": 8.591756820678711, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8559091687202454, + "num_tokens": 621948995.0, + "step": 16302 + }, + { + "epoch": 2.0739091718610863, + "ewc_loss": 0.07278239727020264, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003640544891823083, + "grad_norm": 8.399465560913086, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8777372241020203, + "num_tokens": 621991109.0, + "step": 16303 + }, + { + "epoch": 2.074036382139677, + "ewc_loss": 0.07366886734962463, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003729191957972944, + "grad_norm": 8.647944450378418, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8722330927848816, + "num_tokens": 622026585.0, + "step": 16304 + }, + { + "epoch": 2.0741635924182673, + "ewc_loss": 0.07301327586174011, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003663631796371192, + "grad_norm": 8.429323196411133, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8758869171142578, + "num_tokens": 622065208.0, + "step": 16305 + }, + { + "epoch": 2.074290802696858, + "ewc_loss": 0.07364413142204285, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037267175503075123, + "grad_norm": 8.622051239013672, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8558273911476135, + "num_tokens": 622104838.0, + "step": 16306 + }, + { + "epoch": 2.0744180129754484, + "ewc_loss": 0.07298088073730469, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036603931221179664, + "grad_norm": 8.492875099182129, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8582879900932312, + "num_tokens": 622136879.0, + "step": 16307 + }, + { + "epoch": 2.074545223254039, + "ewc_loss": 0.07354618608951569, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003716923529282212, + "grad_norm": 8.559708595275879, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8777917623519897, + "num_tokens": 622175938.0, + "step": 16308 + }, + { + "epoch": 2.0746724335326294, + "ewc_loss": 0.07333815097808838, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000369611952919513, + "grad_norm": 8.499496459960938, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8744369745254517, + "num_tokens": 622216663.0, + "step": 16309 + }, + { + "epoch": 2.07479964381122, + "ewc_loss": 0.07334396243095398, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000369670131476596, + "grad_norm": 8.556842803955078, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8743773698806763, + "num_tokens": 622250450.0, + "step": 16310 + }, + { + "epoch": 2.0749268540898105, + "ewc_loss": 0.07334916293621063, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003697221400216222, + "grad_norm": 8.552522659301758, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8596101999282837, + "num_tokens": 622291919.0, + "step": 16311 + }, + { + "epoch": 2.075054064368401, + "ewc_loss": 0.07335445284843445, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036977502168156207, + "grad_norm": 8.47775936126709, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8673895597457886, + "num_tokens": 622325892.0, + "step": 16312 + }, + { + "epoch": 2.0751812746469915, + "ewc_loss": 0.0734734833240509, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000370965339243412, + "grad_norm": 8.57477855682373, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.864889919757843, + "num_tokens": 622356966.0, + "step": 16313 + }, + { + "epoch": 2.075308484925582, + "ewc_loss": 0.07320237904787064, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036825425922870636, + "grad_norm": 8.489789962768555, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8693130016326904, + "num_tokens": 622389931.0, + "step": 16314 + }, + { + "epoch": 2.0754356952041726, + "ewc_loss": 0.07355716824531555, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037180210347287357, + "grad_norm": 8.563031196594238, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8638240694999695, + "num_tokens": 622427817.0, + "step": 16315 + }, + { + "epoch": 2.075562905482763, + "ewc_loss": 0.07311709225177765, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003674013423733413, + "grad_norm": 8.422930717468262, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8571453094482422, + "num_tokens": 622470353.0, + "step": 16316 + }, + { + "epoch": 2.0756901157613536, + "ewc_loss": 0.07362207025289536, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037245117709971964, + "grad_norm": 8.496468544006348, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8834856152534485, + "num_tokens": 622507577.0, + "step": 16317 + }, + { + "epoch": 2.075817326039944, + "ewc_loss": 0.07314062118530273, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036763670505024493, + "grad_norm": 8.45031452178955, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8607499003410339, + "num_tokens": 622544746.0, + "step": 16318 + }, + { + "epoch": 2.0759445363185347, + "ewc_loss": 0.07351054251194, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003713359183166176, + "grad_norm": 8.522491455078125, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8596941232681274, + "num_tokens": 622583688.0, + "step": 16319 + }, + { + "epoch": 2.0760717465971252, + "ewc_loss": 0.07329908013343811, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003692213213071227, + "grad_norm": 8.478900909423828, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8673295974731445, + "num_tokens": 622619911.0, + "step": 16320 + }, + { + "epoch": 2.0761989568757158, + "ewc_loss": 0.07341572642326355, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003703877446241677, + "grad_norm": 8.463343620300293, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8746387958526611, + "num_tokens": 622665732.0, + "step": 16321 + }, + { + "epoch": 2.0763261671543063, + "ewc_loss": 0.07334845513105392, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003697150095831603, + "grad_norm": 8.450164794921875, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8740659356117249, + "num_tokens": 622705746.0, + "step": 16322 + }, + { + "epoch": 2.0764533774328964, + "ewc_loss": 0.07340596616268158, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037029015948064625, + "grad_norm": 8.513853073120117, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8672001957893372, + "num_tokens": 622745481.0, + "step": 16323 + }, + { + "epoch": 2.076580587711487, + "ewc_loss": 0.07320629060268402, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036829340388067067, + "grad_norm": 8.444343566894531, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8695354461669922, + "num_tokens": 622786910.0, + "step": 16324 + }, + { + "epoch": 2.0767077979900774, + "ewc_loss": 0.07348807156085968, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037111114943400025, + "grad_norm": 8.492300033569336, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8619011044502258, + "num_tokens": 622826976.0, + "step": 16325 + }, + { + "epoch": 2.076835008268668, + "ewc_loss": 0.07321162521839142, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036834675120189786, + "grad_norm": 8.451072692871094, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8662245273590088, + "num_tokens": 622858945.0, + "step": 16326 + }, + { + "epoch": 2.0769622185472585, + "ewc_loss": 0.07342687249183655, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003704991831909865, + "grad_norm": 8.512773513793945, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8788919448852539, + "num_tokens": 622893229.0, + "step": 16327 + }, + { + "epoch": 2.077089428825849, + "ewc_loss": 0.07319940626621246, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003682245733216405, + "grad_norm": 8.394777297973633, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8764511346817017, + "num_tokens": 622933701.0, + "step": 16328 + }, + { + "epoch": 2.0772166391044395, + "ewc_loss": 0.07361941039562225, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003724245761986822, + "grad_norm": 8.618424415588379, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.853798508644104, + "num_tokens": 622971226.0, + "step": 16329 + }, + { + "epoch": 2.07734384938303, + "ewc_loss": 0.07302588224411011, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003664892865344882, + "grad_norm": 8.419381141662598, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8623405694961548, + "num_tokens": 623009232.0, + "step": 16330 + }, + { + "epoch": 2.0774710596616206, + "ewc_loss": 0.0736648291349411, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037287879968062043, + "grad_norm": 8.509805679321289, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8586916327476501, + "num_tokens": 623048005.0, + "step": 16331 + }, + { + "epoch": 2.077598269940211, + "ewc_loss": 0.07321514189243317, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003683818504214287, + "grad_norm": 8.429404258728027, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8649627566337585, + "num_tokens": 623088474.0, + "step": 16332 + }, + { + "epoch": 2.0777254802188017, + "ewc_loss": 0.07359908521175385, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037222131504677236, + "grad_norm": 8.503185272216797, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8673831224441528, + "num_tokens": 623123700.0, + "step": 16333 + }, + { + "epoch": 2.077852690497392, + "ewc_loss": 0.07331100106239319, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003693404432851821, + "grad_norm": 8.453140258789062, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8694384098052979, + "num_tokens": 623161406.0, + "step": 16334 + }, + { + "epoch": 2.0779799007759827, + "ewc_loss": 0.07350674271583557, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003712979087140411, + "grad_norm": 8.566810607910156, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8562198281288147, + "num_tokens": 623199770.0, + "step": 16335 + }, + { + "epoch": 2.0781071110545732, + "ewc_loss": 0.07323247194290161, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036855522193945944, + "grad_norm": 8.443469047546387, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8694949150085449, + "num_tokens": 623234635.0, + "step": 16336 + }, + { + "epoch": 2.0782343213331638, + "ewc_loss": 0.07368306815624237, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003730611642822623, + "grad_norm": 8.514100074768066, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8677231073379517, + "num_tokens": 623273695.0, + "step": 16337 + }, + { + "epoch": 2.0783615316117543, + "ewc_loss": 0.0734596848487854, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037082729977555573, + "grad_norm": 8.50355339050293, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8505797982215881, + "num_tokens": 623315278.0, + "step": 16338 + }, + { + "epoch": 2.078488741890345, + "ewc_loss": 0.07351119071245193, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000371342379366979, + "grad_norm": 8.488195419311523, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8750031590461731, + "num_tokens": 623351753.0, + "step": 16339 + }, + { + "epoch": 2.0786159521689354, + "ewc_loss": 0.07363151013851166, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037254556082189083, + "grad_norm": 8.59941577911377, + "learning_rate": 1e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8486530780792236, + "num_tokens": 623387812.0, + "step": 16340 + }, + { + "epoch": 2.078743162447526, + "ewc_loss": 0.07322803884744644, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003685108677018434, + "grad_norm": 8.45996379852295, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8699564933776855, + "num_tokens": 623426784.0, + "step": 16341 + }, + { + "epoch": 2.0788703727261164, + "ewc_loss": 0.07354962825775146, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037172678275965154, + "grad_norm": 8.5850248336792, + "learning_rate": 1e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8437244892120361, + "num_tokens": 623463362.0, + "step": 16342 + }, + { + "epoch": 2.078997583004707, + "ewc_loss": 0.07313834130764008, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036761388764716685, + "grad_norm": 8.465655326843262, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8589266538619995, + "num_tokens": 623500108.0, + "step": 16343 + }, + { + "epoch": 2.0791247932832975, + "ewc_loss": 0.07357741892337799, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003720046952366829, + "grad_norm": 8.573355674743652, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8626422882080078, + "num_tokens": 623537159.0, + "step": 16344 + }, + { + "epoch": 2.079252003561888, + "ewc_loss": 0.07313921302556992, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003676225896924734, + "grad_norm": 8.37661361694336, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8847320079803467, + "num_tokens": 623576256.0, + "step": 16345 + }, + { + "epoch": 2.0793792138404785, + "ewc_loss": 0.07379357516765594, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037416620762087405, + "grad_norm": 8.638915061950684, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8617815375328064, + "num_tokens": 623612454.0, + "step": 16346 + }, + { + "epoch": 2.0795064241190686, + "ewc_loss": 0.07306487113237381, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003668791614472866, + "grad_norm": 8.382108688354492, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.880171537399292, + "num_tokens": 623654392.0, + "step": 16347 + }, + { + "epoch": 2.079633634397659, + "ewc_loss": 0.07383118569850922, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037454228731803596, + "grad_norm": 8.63463020324707, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.847907543182373, + "num_tokens": 623697350.0, + "step": 16348 + }, + { + "epoch": 2.0797608446762497, + "ewc_loss": 0.07309460639953613, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003671764861792326, + "grad_norm": 8.452154159545898, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8693352341651917, + "num_tokens": 623736272.0, + "step": 16349 + }, + { + "epoch": 2.07988805495484, + "ewc_loss": 0.07422040402889252, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037355170934461057, + "grad_norm": 8.864700317382812, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8573547005653381, + "num_tokens": 623781319.0, + "step": 16350 + }, + { + "epoch": 2.0800152652334307, + "ewc_loss": 0.07278695702552795, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003641000366769731, + "grad_norm": 8.389897346496582, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8659276366233826, + "num_tokens": 623821944.0, + "step": 16351 + }, + { + "epoch": 2.0801424755120212, + "ewc_loss": 0.07404042780399323, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003766347363125533, + "grad_norm": 8.709659576416016, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8619227409362793, + "num_tokens": 623858399.0, + "step": 16352 + }, + { + "epoch": 2.0802696857906118, + "ewc_loss": 0.07276001572608948, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003638306516222656, + "grad_norm": 8.334710121154785, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8684860467910767, + "num_tokens": 623897839.0, + "step": 16353 + }, + { + "epoch": 2.0803968960692023, + "ewc_loss": 0.07424206286668777, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037865107879042625, + "grad_norm": 8.760820388793945, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8492739200592041, + "num_tokens": 623933127.0, + "step": 16354 + }, + { + "epoch": 2.080524106347793, + "ewc_loss": 0.07289906591176987, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003652211162261665, + "grad_norm": 8.48656940460205, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8647070527076721, + "num_tokens": 623972140.0, + "step": 16355 + }, + { + "epoch": 2.0806513166263834, + "ewc_loss": 0.07387226819992065, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003749532043002546, + "grad_norm": 8.761421203613281, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8557190895080566, + "num_tokens": 624008875.0, + "step": 16356 + }, + { + "epoch": 2.080778526904974, + "ewc_loss": 0.07291274517774582, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003653579333331436, + "grad_norm": 8.483918190002441, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.863832950592041, + "num_tokens": 624045162.0, + "step": 16357 + }, + { + "epoch": 2.0809057371835644, + "ewc_loss": 0.07375971972942352, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003738276718650013, + "grad_norm": 8.653451919555664, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8778393268585205, + "num_tokens": 624085275.0, + "step": 16358 + }, + { + "epoch": 2.081032947462155, + "ewc_loss": 0.07306423038244247, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003668727586045861, + "grad_norm": 8.543932914733887, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8637343645095825, + "num_tokens": 624127479.0, + "step": 16359 + }, + { + "epoch": 2.0811601577407455, + "ewc_loss": 0.07341168820858002, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003703473194036633, + "grad_norm": 8.633344650268555, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8688819408416748, + "num_tokens": 624165231.0, + "step": 16360 + }, + { + "epoch": 2.081287368019336, + "ewc_loss": 0.07316868007183075, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036791726597584784, + "grad_norm": 8.527143478393555, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8566617965698242, + "num_tokens": 624205109.0, + "step": 16361 + }, + { + "epoch": 2.0814145782979265, + "ewc_loss": 0.0733506977558136, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036973741953261197, + "grad_norm": 8.513526916503906, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8614034056663513, + "num_tokens": 624241849.0, + "step": 16362 + }, + { + "epoch": 2.081541788576517, + "ewc_loss": 0.07324954867362976, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003687259159050882, + "grad_norm": 8.551727294921875, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.867055356502533, + "num_tokens": 624274174.0, + "step": 16363 + }, + { + "epoch": 2.0816689988551076, + "ewc_loss": 0.07329636067152023, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003691940801218152, + "grad_norm": 8.556200981140137, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8644169569015503, + "num_tokens": 624311526.0, + "step": 16364 + }, + { + "epoch": 2.081796209133698, + "ewc_loss": 0.07329816371202469, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003692120953928679, + "grad_norm": 8.571244239807129, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8515832424163818, + "num_tokens": 624346657.0, + "step": 16365 + }, + { + "epoch": 2.0819234194122886, + "ewc_loss": 0.0733071118593216, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003693015896715224, + "grad_norm": 8.50493049621582, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8504717350006104, + "num_tokens": 624386563.0, + "step": 16366 + }, + { + "epoch": 2.082050629690879, + "ewc_loss": 0.07331567257642746, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003693872131407261, + "grad_norm": 8.48328685760498, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8748359680175781, + "num_tokens": 624427299.0, + "step": 16367 + }, + { + "epoch": 2.0821778399694697, + "ewc_loss": 0.07333843410015106, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036961480509489775, + "grad_norm": 8.559320449829102, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.847868025302887, + "num_tokens": 624461253.0, + "step": 16368 + }, + { + "epoch": 2.0823050502480602, + "ewc_loss": 0.07324031740427017, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003686336276587099, + "grad_norm": 8.473779678344727, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8739700317382812, + "num_tokens": 624498699.0, + "step": 16369 + }, + { + "epoch": 2.0824322605266508, + "ewc_loss": 0.07350823283195496, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037131275166757405, + "grad_norm": 8.582499504089355, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.863653838634491, + "num_tokens": 624534655.0, + "step": 16370 + }, + { + "epoch": 2.082559470805241, + "ewc_loss": 0.07315954566001892, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003678258799482137, + "grad_norm": 8.434329986572266, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8696664571762085, + "num_tokens": 624575342.0, + "step": 16371 + }, + { + "epoch": 2.0826866810838314, + "ewc_loss": 0.0735832154750824, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003720626118592918, + "grad_norm": 8.587721824645996, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8710152506828308, + "num_tokens": 624608493.0, + "step": 16372 + }, + { + "epoch": 2.082813891362422, + "ewc_loss": 0.07309773564338684, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036720786010846496, + "grad_norm": 8.410842895507812, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8469306826591492, + "num_tokens": 624655777.0, + "step": 16373 + }, + { + "epoch": 2.0829411016410124, + "ewc_loss": 0.07380214333534241, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037425191840156913, + "grad_norm": 8.579079627990723, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8752222061157227, + "num_tokens": 624699349.0, + "step": 16374 + }, + { + "epoch": 2.083068311919603, + "ewc_loss": 0.07303640246391296, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003665944968815893, + "grad_norm": 8.47448444366455, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8733770847320557, + "num_tokens": 624730146.0, + "step": 16375 + }, + { + "epoch": 2.0831955221981935, + "ewc_loss": 0.07369984686374664, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037322891876101494, + "grad_norm": 8.579314231872559, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8575973510742188, + "num_tokens": 624773186.0, + "step": 16376 + }, + { + "epoch": 2.083322732476784, + "ewc_loss": 0.0733119398355484, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003693498147185892, + "grad_norm": 8.482097625732422, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8631980419158936, + "num_tokens": 624810256.0, + "step": 16377 + }, + { + "epoch": 2.0834499427553745, + "ewc_loss": 0.07366317510604858, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003728622104972601, + "grad_norm": 8.585018157958984, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8552592396736145, + "num_tokens": 624848166.0, + "step": 16378 + }, + { + "epoch": 2.083577153033965, + "ewc_loss": 0.07339446246623993, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037017514114268124, + "grad_norm": 8.488810539245605, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8639969229698181, + "num_tokens": 624887601.0, + "step": 16379 + }, + { + "epoch": 2.0837043633125556, + "ewc_loss": 0.07358455657958984, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003720759996213019, + "grad_norm": 8.515480041503906, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8640686273574829, + "num_tokens": 624921160.0, + "step": 16380 + }, + { + "epoch": 2.083831573591146, + "ewc_loss": 0.07351624965667725, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037139299092814326, + "grad_norm": 8.56275749206543, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8574578762054443, + "num_tokens": 624959053.0, + "step": 16381 + }, + { + "epoch": 2.0839587838697367, + "ewc_loss": 0.07353892922401428, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003716197097674012, + "grad_norm": 8.599255561828613, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8788480758666992, + "num_tokens": 624995272.0, + "step": 16382 + }, + { + "epoch": 2.084085994148327, + "ewc_loss": 0.07378559559583664, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036920359707437456, + "grad_norm": 8.519436836242676, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8817075490951538, + "num_tokens": 625033388.0, + "step": 16383 + }, + { + "epoch": 2.0842132044269177, + "ewc_loss": 0.07347594201564789, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037098993198014796, + "grad_norm": 8.547065734863281, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8547943830490112, + "num_tokens": 625071103.0, + "step": 16384 + }, + { + "epoch": 2.0843404147055082, + "ewc_loss": 0.07387660443782806, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003701137029565871, + "grad_norm": 8.637948989868164, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8769657611846924, + "num_tokens": 625109562.0, + "step": 16385 + }, + { + "epoch": 2.0844676249840988, + "ewc_loss": 0.07307387888431549, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036696926690638065, + "grad_norm": 8.522346496582031, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8704270124435425, + "num_tokens": 625147438.0, + "step": 16386 + }, + { + "epoch": 2.0845948352626893, + "ewc_loss": 0.07348980754613876, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003711285535246134, + "grad_norm": 8.582756996154785, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8738674521446228, + "num_tokens": 625184293.0, + "step": 16387 + }, + { + "epoch": 2.08472204554128, + "ewc_loss": 0.07301598787307739, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036639030440710485, + "grad_norm": 8.528644561767578, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8669110536575317, + "num_tokens": 625218201.0, + "step": 16388 + }, + { + "epoch": 2.0848492558198704, + "ewc_loss": 0.07340167462825775, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037024717312306166, + "grad_norm": 8.593871116638184, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8645660877227783, + "num_tokens": 625252072.0, + "step": 16389 + }, + { + "epoch": 2.084976466098461, + "ewc_loss": 0.07352238148450851, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000366571475751698, + "grad_norm": 8.486433982849121, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8791371583938599, + "num_tokens": 625286990.0, + "step": 16390 + }, + { + "epoch": 2.0851036763770514, + "ewc_loss": 0.07345838844776154, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003708144067786634, + "grad_norm": 8.597199440002441, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8763847947120667, + "num_tokens": 625326579.0, + "step": 16391 + }, + { + "epoch": 2.085230886655642, + "ewc_loss": 0.07321526110172272, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003683831309899688, + "grad_norm": 8.53917121887207, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8719053268432617, + "num_tokens": 625367317.0, + "step": 16392 + }, + { + "epoch": 2.0853580969342325, + "ewc_loss": 0.07338802516460419, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003701107343658805, + "grad_norm": 8.575820922851562, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8584074378013611, + "num_tokens": 625401047.0, + "step": 16393 + }, + { + "epoch": 2.085485307212823, + "ewc_loss": 0.07308322191238403, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003670626610983163, + "grad_norm": 8.546914100646973, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8497824668884277, + "num_tokens": 625441878.0, + "step": 16394 + }, + { + "epoch": 2.0856125174914135, + "ewc_loss": 0.07353678345680237, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036915691453032196, + "grad_norm": 8.581310272216797, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8636879920959473, + "num_tokens": 625477007.0, + "step": 16395 + }, + { + "epoch": 2.0857397277700036, + "ewc_loss": 0.07300949096679688, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003663254319690168, + "grad_norm": 8.405065536499023, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8775792121887207, + "num_tokens": 625519908.0, + "step": 16396 + }, + { + "epoch": 2.085866938048594, + "ewc_loss": 0.0735052227973938, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037128268741071224, + "grad_norm": 8.602343559265137, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8690083622932434, + "num_tokens": 625554946.0, + "step": 16397 + }, + { + "epoch": 2.0859941483271847, + "ewc_loss": 0.07307508587837219, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003669813449960202, + "grad_norm": 8.44914722442627, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8752573728561401, + "num_tokens": 625592121.0, + "step": 16398 + }, + { + "epoch": 2.086121358605775, + "ewc_loss": 0.07367859780788422, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037301648990251124, + "grad_norm": 8.587983131408691, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8642665147781372, + "num_tokens": 625630596.0, + "step": 16399 + }, + { + "epoch": 2.0862485688843657, + "ewc_loss": 0.07309886813163757, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003672191814985126, + "grad_norm": 8.500814437866211, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8481807708740234, + "num_tokens": 625669535.0, + "step": 16400 + }, + { + "epoch": 2.0863757791629562, + "ewc_loss": 0.0738750696182251, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037253971095196903, + "grad_norm": 8.607895851135254, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8637300729751587, + "num_tokens": 625710072.0, + "step": 16401 + }, + { + "epoch": 2.0865029894415468, + "ewc_loss": 0.07319703698158264, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003682008245959878, + "grad_norm": 8.474139213562012, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8742828369140625, + "num_tokens": 625746328.0, + "step": 16402 + }, + { + "epoch": 2.0866301997201373, + "ewc_loss": 0.07366696745157242, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003729001327883452, + "grad_norm": 8.665735244750977, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8658709526062012, + "num_tokens": 625787108.0, + "step": 16403 + }, + { + "epoch": 2.086757409998728, + "ewc_loss": 0.07308505475521088, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036708099651150405, + "grad_norm": 8.422004699707031, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8693428635597229, + "num_tokens": 625827887.0, + "step": 16404 + }, + { + "epoch": 2.0868846202773184, + "ewc_loss": 0.07378347963094711, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003740652755368501, + "grad_norm": 8.666321754455566, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8663201928138733, + "num_tokens": 625863475.0, + "step": 16405 + }, + { + "epoch": 2.087011830555909, + "ewc_loss": 0.07322163879871368, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003660054935608059, + "grad_norm": 8.476545333862305, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8809593319892883, + "num_tokens": 625902369.0, + "step": 16406 + }, + { + "epoch": 2.0871390408344994, + "ewc_loss": 0.07364800572395325, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037271049222908914, + "grad_norm": 8.64726734161377, + "learning_rate": 1e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.839394748210907, + "num_tokens": 625941054.0, + "step": 16407 + }, + { + "epoch": 2.08726625111309, + "ewc_loss": 0.0730968564748764, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003671990125440061, + "grad_norm": 8.470797538757324, + "learning_rate": 1e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8476773500442505, + "num_tokens": 625981113.0, + "step": 16408 + }, + { + "epoch": 2.0873934613916805, + "ewc_loss": 0.0736478865146637, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037270932807587087, + "grad_norm": 8.693785667419434, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8609074950218201, + "num_tokens": 626014758.0, + "step": 16409 + }, + { + "epoch": 2.087520671670271, + "ewc_loss": 0.0729706883430481, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036593739059753716, + "grad_norm": 8.471325874328613, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8499203324317932, + "num_tokens": 626054442.0, + "step": 16410 + }, + { + "epoch": 2.0876478819488615, + "ewc_loss": 0.07375521957874298, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003737826773431152, + "grad_norm": 8.683626174926758, + "learning_rate": 1e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8465783596038818, + "num_tokens": 626093993.0, + "step": 16411 + }, + { + "epoch": 2.087775092227452, + "ewc_loss": 0.07295650988817215, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003657955676317215, + "grad_norm": 8.493326187133789, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8477356433868408, + "num_tokens": 626131242.0, + "step": 16412 + }, + { + "epoch": 2.0879023025060426, + "ewc_loss": 0.07362854480743408, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037251587491482496, + "grad_norm": 8.705500602722168, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8503298759460449, + "num_tokens": 626162137.0, + "step": 16413 + }, + { + "epoch": 2.088029512784633, + "ewc_loss": 0.07290299981832504, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000365260464604944, + "grad_norm": 8.46938419342041, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8653895854949951, + "num_tokens": 626201031.0, + "step": 16414 + }, + { + "epoch": 2.0881567230632236, + "ewc_loss": 0.0737432986497879, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003736634098459035, + "grad_norm": 8.677817344665527, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8735430240631104, + "num_tokens": 626243647.0, + "step": 16415 + }, + { + "epoch": 2.088283933341814, + "ewc_loss": 0.07323115319013596, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036610057577490807, + "grad_norm": 8.503183364868164, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8713752627372742, + "num_tokens": 626281346.0, + "step": 16416 + }, + { + "epoch": 2.0884111436204047, + "ewc_loss": 0.07380452752113342, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037427572533488274, + "grad_norm": 8.709633827209473, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8743475079536438, + "num_tokens": 626315964.0, + "step": 16417 + }, + { + "epoch": 2.0885383538989952, + "ewc_loss": 0.07328660041093826, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036665506195276976, + "grad_norm": 8.41401195526123, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8584761023521423, + "num_tokens": 626367920.0, + "step": 16418 + }, + { + "epoch": 2.0886655641775858, + "ewc_loss": 0.07410799711942673, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003748690360225737, + "grad_norm": 8.649325370788574, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8614710569381714, + "num_tokens": 626412123.0, + "step": 16419 + }, + { + "epoch": 2.0887927744561763, + "ewc_loss": 0.07313354313373566, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036756592453457415, + "grad_norm": 8.499155044555664, + "learning_rate": 1e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.8358022570610046, + "num_tokens": 626451560.0, + "step": 16420 + }, + { + "epoch": 2.0889199847347664, + "ewc_loss": 0.07370664179325104, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037329690530896187, + "grad_norm": 8.633392333984375, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8586501479148865, + "num_tokens": 626489361.0, + "step": 16421 + }, + { + "epoch": 2.089047195013357, + "ewc_loss": 0.0733736902475357, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003675259940791875, + "grad_norm": 8.47558879852295, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8801543712615967, + "num_tokens": 626527150.0, + "step": 16422 + }, + { + "epoch": 2.0891744052919474, + "ewc_loss": 0.07403922826051712, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037418134161271155, + "grad_norm": 8.570730209350586, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8572086095809937, + "num_tokens": 626568110.0, + "step": 16423 + }, + { + "epoch": 2.089301615570538, + "ewc_loss": 0.07362984865903854, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037008753861300647, + "grad_norm": 8.52280044555664, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8658831119537354, + "num_tokens": 626607215.0, + "step": 16424 + }, + { + "epoch": 2.0894288258491285, + "ewc_loss": 0.07364976406097412, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000372728070942685, + "grad_norm": 8.5668363571167, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8705563545227051, + "num_tokens": 626644424.0, + "step": 16425 + }, + { + "epoch": 2.089556036127719, + "ewc_loss": 0.07372811436653137, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037107017124071717, + "grad_norm": 8.51734447479248, + "learning_rate": 1e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8437485694885254, + "num_tokens": 626689509.0, + "step": 16426 + }, + { + "epoch": 2.0896832464063095, + "ewc_loss": 0.07359002530574799, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003721306857187301, + "grad_norm": 8.497797012329102, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8536034822463989, + "num_tokens": 626733866.0, + "step": 16427 + }, + { + "epoch": 2.0898104566849, + "ewc_loss": 0.07389186322689056, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037270772736519575, + "grad_norm": 8.572898864746094, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8719909191131592, + "num_tokens": 626767465.0, + "step": 16428 + }, + { + "epoch": 2.0899376669634906, + "ewc_loss": 0.07347889244556427, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037101941416040063, + "grad_norm": 8.522459030151367, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8513599038124084, + "num_tokens": 626806049.0, + "step": 16429 + }, + { + "epoch": 2.090064877242081, + "ewc_loss": 0.07367773354053497, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003730077878572047, + "grad_norm": 8.498167991638184, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8501695394515991, + "num_tokens": 626848917.0, + "step": 16430 + }, + { + "epoch": 2.0901920875206716, + "ewc_loss": 0.07376618683338165, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003738923405762762, + "grad_norm": 8.634559631347656, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8517343997955322, + "num_tokens": 626884442.0, + "step": 16431 + }, + { + "epoch": 2.090319297799262, + "ewc_loss": 0.07365049421787262, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037029400118626654, + "grad_norm": 8.52482795715332, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8615060448646545, + "num_tokens": 626922517.0, + "step": 16432 + }, + { + "epoch": 2.0904465080778527, + "ewc_loss": 0.0741720050573349, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037550911656580865, + "grad_norm": 8.645609855651855, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8623312711715698, + "num_tokens": 626962880.0, + "step": 16433 + }, + { + "epoch": 2.0905737183564432, + "ewc_loss": 0.07338480651378632, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003700785746332258, + "grad_norm": 8.462888717651367, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8715819716453552, + "num_tokens": 627008115.0, + "step": 16434 + }, + { + "epoch": 2.0907009286350338, + "ewc_loss": 0.07404454797506332, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037667594733648, + "grad_norm": 8.646608352661133, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8762357234954834, + "num_tokens": 627043935.0, + "step": 16435 + }, + { + "epoch": 2.0908281389136243, + "ewc_loss": 0.07349306344985962, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036871968768537045, + "grad_norm": 8.520998001098633, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8648761510848999, + "num_tokens": 627078495.0, + "step": 16436 + }, + { + "epoch": 2.090955349192215, + "ewc_loss": 0.07387490570545197, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003749795723706484, + "grad_norm": 8.598390579223633, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8683454990386963, + "num_tokens": 627116652.0, + "step": 16437 + }, + { + "epoch": 2.0910825594708053, + "ewc_loss": 0.07342803478240967, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037051079561933875, + "grad_norm": 8.503165245056152, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8723207712173462, + "num_tokens": 627152326.0, + "step": 16438 + }, + { + "epoch": 2.091209769749396, + "ewc_loss": 0.07373431324958801, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037357365363277495, + "grad_norm": 8.66256332397461, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8738327026367188, + "num_tokens": 627189502.0, + "step": 16439 + }, + { + "epoch": 2.0913369800279864, + "ewc_loss": 0.07322900742292404, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.000368520530173555, + "grad_norm": 8.478668212890625, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8777852654457092, + "num_tokens": 627225438.0, + "step": 16440 + }, + { + "epoch": 2.091464190306577, + "ewc_loss": 0.0739726573228836, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037351559149101377, + "grad_norm": 8.611766815185547, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8604592084884644, + "num_tokens": 627258154.0, + "step": 16441 + }, + { + "epoch": 2.0915914005851675, + "ewc_loss": 0.07325990498065948, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00036882952554151416, + "grad_norm": 8.491708755493164, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8574381470680237, + "num_tokens": 627301804.0, + "step": 16442 + }, + { + "epoch": 2.091718610863758, + "ewc_loss": 0.0737571120262146, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003738015366252512, + "grad_norm": 8.553576469421387, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8696062564849854, + "num_tokens": 627344301.0, + "step": 16443 + }, + { + "epoch": 2.0918458211423485, + "ewc_loss": 0.0735277608036995, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003715080674737692, + "grad_norm": 8.507556915283203, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8726457357406616, + "num_tokens": 627381068.0, + "step": 16444 + }, + { + "epoch": 2.0919730314209386, + "ewc_loss": 0.0740368515253067, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037171615986153483, + "grad_norm": 8.523521423339844, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8571548461914062, + "num_tokens": 627423410.0, + "step": 16445 + }, + { + "epoch": 2.092100241699529, + "ewc_loss": 0.07359194755554199, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037214989424683154, + "grad_norm": 8.560700416564941, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8744502067565918, + "num_tokens": 627456593.0, + "step": 16446 + }, + { + "epoch": 2.0922274519781197, + "ewc_loss": 0.07389993965625763, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003727884322870523, + "grad_norm": 8.551183700561523, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.866563081741333, + "num_tokens": 627493742.0, + "step": 16447 + }, + { + "epoch": 2.09235466225671, + "ewc_loss": 0.07381750643253326, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003719641244970262, + "grad_norm": 8.536874771118164, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8700383901596069, + "num_tokens": 627529809.0, + "step": 16448 + }, + { + "epoch": 2.0924818725353007, + "ewc_loss": 0.07388067245483398, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003725958231370896, + "grad_norm": 8.525954246520996, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8671937584877014, + "num_tokens": 627570693.0, + "step": 16449 + }, + { + "epoch": 2.0926090828138912, + "ewc_loss": 0.07395295798778534, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003733186749741435, + "grad_norm": 8.546323776245117, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8733410835266113, + "num_tokens": 627607608.0, + "step": 16450 + }, + { + "epoch": 2.0927362930924818, + "ewc_loss": 0.07391272485256195, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037291631451807916, + "grad_norm": 8.546578407287598, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8632830381393433, + "num_tokens": 627642102.0, + "step": 16451 + }, + { + "epoch": 2.0928635033710723, + "ewc_loss": 0.0739525780081749, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003733148332685232, + "grad_norm": 8.579115867614746, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8579450249671936, + "num_tokens": 627678838.0, + "step": 16452 + }, + { + "epoch": 2.092990713649663, + "ewc_loss": 0.07370370626449585, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000370826106518507, + "grad_norm": 8.451727867126465, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.86870276927948, + "num_tokens": 627714952.0, + "step": 16453 + }, + { + "epoch": 2.0931179239282534, + "ewc_loss": 0.07375240325927734, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003737544466275722, + "grad_norm": 8.5919771194458, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8561620116233826, + "num_tokens": 627754710.0, + "step": 16454 + }, + { + "epoch": 2.093245134206844, + "ewc_loss": 0.07369452714920044, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003707343712449074, + "grad_norm": 8.524431228637695, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8689097166061401, + "num_tokens": 627791504.0, + "step": 16455 + }, + { + "epoch": 2.0933723444854344, + "ewc_loss": 0.07397057116031647, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037349481135606766, + "grad_norm": 8.519203186035156, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8787142038345337, + "num_tokens": 627829683.0, + "step": 16456 + }, + { + "epoch": 2.093499554764025, + "ewc_loss": 0.07400822639465332, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037142986548133194, + "grad_norm": 8.636575698852539, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8654325008392334, + "num_tokens": 627869826.0, + "step": 16457 + }, + { + "epoch": 2.0936267650426155, + "ewc_loss": 0.0736839771270752, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037062884075567126, + "grad_norm": 8.50722599029541, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8607511520385742, + "num_tokens": 627904165.0, + "step": 16458 + }, + { + "epoch": 2.093753975321206, + "ewc_loss": 0.07428716123104095, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003742193221114576, + "grad_norm": 8.574297904968262, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8696821331977844, + "num_tokens": 627936559.0, + "step": 16459 + }, + { + "epoch": 2.0938811855997965, + "ewc_loss": 0.07338877767324448, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.0003701182431541383, + "grad_norm": 8.469522476196289, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.870175838470459, + "num_tokens": 627971742.0, + "step": 16460 + }, + { + "epoch": 2.094008395878387, + "ewc_loss": 0.07408814132213593, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003746704605873674, + "grad_norm": 8.540491104125977, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8743487000465393, + "num_tokens": 628013104.0, + "step": 16461 + }, + { + "epoch": 2.0941356061569776, + "ewc_loss": 0.0737333595752716, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003711227036546916, + "grad_norm": 8.555392265319824, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8614650964736938, + "num_tokens": 628049378.0, + "step": 16462 + }, + { + "epoch": 2.094262816435568, + "ewc_loss": 0.07355014234781265, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037173187592998147, + "grad_norm": 8.518425941467285, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8825533390045166, + "num_tokens": 628087213.0, + "step": 16463 + }, + { + "epoch": 2.0943900267141586, + "ewc_loss": 0.07359996438026428, + "ewc_loss_diag": 3.647804260253906e-05, + "ewc_loss_parallel": 0.00037223013350740075, + "grad_norm": 8.52700424194336, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8696534037590027, + "num_tokens": 628122060.0, + "step": 16464 + }, + { + "epoch": 2.094517236992749, + "ewc_loss": 0.07379674911499023, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037175658508203924, + "grad_norm": 8.488245010375977, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8678220510482788, + "num_tokens": 628166554.0, + "step": 16465 + }, + { + "epoch": 2.0946444472713397, + "ewc_loss": 0.07394202798604965, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037320933188311756, + "grad_norm": 8.672805786132812, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8636835813522339, + "num_tokens": 628199953.0, + "step": 16466 + }, + { + "epoch": 2.09477165754993, + "ewc_loss": 0.07351648807525635, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036895391531288624, + "grad_norm": 8.453264236450195, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8662698864936829, + "num_tokens": 628237044.0, + "step": 16467 + }, + { + "epoch": 2.0948988678285207, + "ewc_loss": 0.074042409658432, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037421315209940076, + "grad_norm": 8.625988960266113, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.868350625038147, + "num_tokens": 628275773.0, + "step": 16468 + }, + { + "epoch": 2.095026078107111, + "ewc_loss": 0.07362647354602814, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036761234514415264, + "grad_norm": 8.50294017791748, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.848892331123352, + "num_tokens": 628314549.0, + "step": 16469 + }, + { + "epoch": 2.0951532883857014, + "ewc_loss": 0.07393354922533035, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003731245524249971, + "grad_norm": 8.583025932312012, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8590854406356812, + "num_tokens": 628349417.0, + "step": 16470 + }, + { + "epoch": 2.095280498664292, + "ewc_loss": 0.07359067350625992, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036969580105505884, + "grad_norm": 8.522198677062988, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.86902916431427, + "num_tokens": 628384435.0, + "step": 16471 + }, + { + "epoch": 2.0954077089428824, + "ewc_loss": 0.07392755150794983, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037062313640490174, + "grad_norm": 8.541213035583496, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8520534038543701, + "num_tokens": 628424124.0, + "step": 16472 + }, + { + "epoch": 2.095534919221473, + "ewc_loss": 0.07367236912250519, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003705127746798098, + "grad_norm": 8.520977973937988, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8677641153335571, + "num_tokens": 628462412.0, + "step": 16473 + }, + { + "epoch": 2.0956621295000635, + "ewc_loss": 0.07366857677698135, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037047482328489423, + "grad_norm": 8.597392082214355, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8476602435112, + "num_tokens": 628499066.0, + "step": 16474 + }, + { + "epoch": 2.095789339778654, + "ewc_loss": 0.07351456582546234, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003689347649924457, + "grad_norm": 8.475296974182129, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8800622224807739, + "num_tokens": 628532379.0, + "step": 16475 + }, + { + "epoch": 2.0959165500572445, + "ewc_loss": 0.0740399956703186, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000371747650206089, + "grad_norm": 8.602804183959961, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8748670220375061, + "num_tokens": 628570497.0, + "step": 16476 + }, + { + "epoch": 2.096043760335835, + "ewc_loss": 0.07364450395107269, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003677927306853235, + "grad_norm": 8.516057968139648, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8687537908554077, + "num_tokens": 628608298.0, + "step": 16477 + }, + { + "epoch": 2.0961709706144256, + "ewc_loss": 0.07383628189563751, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003697104402817786, + "grad_norm": 8.56673812866211, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8838781118392944, + "num_tokens": 628641217.0, + "step": 16478 + }, + { + "epoch": 2.096298180893016, + "ewc_loss": 0.0736696720123291, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003680444206111133, + "grad_norm": 8.431072235107422, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8666948080062866, + "num_tokens": 628682480.0, + "step": 16479 + }, + { + "epoch": 2.0964253911716066, + "ewc_loss": 0.07411827147006989, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037253039772622287, + "grad_norm": 8.5499906539917, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8633439540863037, + "num_tokens": 628718359.0, + "step": 16480 + }, + { + "epoch": 2.096552601450197, + "ewc_loss": 0.07362829148769379, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036763062234967947, + "grad_norm": 8.441032409667969, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8723962306976318, + "num_tokens": 628752214.0, + "step": 16481 + }, + { + "epoch": 2.0966798117287877, + "ewc_loss": 0.07399912178516388, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037133891601115465, + "grad_norm": 8.572724342346191, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8781867027282715, + "num_tokens": 628787882.0, + "step": 16482 + }, + { + "epoch": 2.0968070220073782, + "ewc_loss": 0.07363283634185791, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036767605342902243, + "grad_norm": 8.387290000915527, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.861172616481781, + "num_tokens": 628824910.0, + "step": 16483 + }, + { + "epoch": 2.0969342322859688, + "ewc_loss": 0.07401242852210999, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037391329533420503, + "grad_norm": 8.548616409301758, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8783478140830994, + "num_tokens": 628860657.0, + "step": 16484 + }, + { + "epoch": 2.0970614425645593, + "ewc_loss": 0.07346883416175842, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036847745650447905, + "grad_norm": 8.462486267089844, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8479065895080566, + "num_tokens": 628900619.0, + "step": 16485 + }, + { + "epoch": 2.09718865284315, + "ewc_loss": 0.07397738099098206, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037356288521550596, + "grad_norm": 8.503592491149902, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8769673109054565, + "num_tokens": 628944304.0, + "step": 16486 + }, + { + "epoch": 2.0973158631217403, + "ewc_loss": 0.07371830195188522, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000370972091332078, + "grad_norm": 8.5471830368042, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8695213794708252, + "num_tokens": 628980490.0, + "step": 16487 + }, + { + "epoch": 2.097443073400331, + "ewc_loss": 0.07366260886192322, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000370415102224797, + "grad_norm": 8.476725578308105, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8703173398971558, + "num_tokens": 629011924.0, + "step": 16488 + }, + { + "epoch": 2.0975702836789214, + "ewc_loss": 0.07387736439704895, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003725627320818603, + "grad_norm": 8.584993362426758, + "learning_rate": 1e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.840346097946167, + "num_tokens": 629049514.0, + "step": 16489 + }, + { + "epoch": 2.097697493957512, + "ewc_loss": 0.07350276410579681, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003688166616484523, + "grad_norm": 8.466629981994629, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8829410672187805, + "num_tokens": 629085374.0, + "step": 16490 + }, + { + "epoch": 2.0978247042361025, + "ewc_loss": 0.07419069111347198, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003732545592356473, + "grad_norm": 8.593413352966309, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8623393774032593, + "num_tokens": 629126307.0, + "step": 16491 + }, + { + "epoch": 2.097951914514693, + "ewc_loss": 0.07368093729019165, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036815699422731996, + "grad_norm": 8.489518165588379, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8653402328491211, + "num_tokens": 629159693.0, + "step": 16492 + }, + { + "epoch": 2.0980791247932835, + "ewc_loss": 0.07388794422149658, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003726684662979096, + "grad_norm": 8.588717460632324, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8712185621261597, + "num_tokens": 629198544.0, + "step": 16493 + }, + { + "epoch": 2.0982063350718736, + "ewc_loss": 0.07370761781930923, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036842384724877775, + "grad_norm": 8.419218063354492, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8575229048728943, + "num_tokens": 629240956.0, + "step": 16494 + }, + { + "epoch": 2.098333545350464, + "ewc_loss": 0.07404571771621704, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037424618494696915, + "grad_norm": 8.558943748474121, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8664673566818237, + "num_tokens": 629276196.0, + "step": 16495 + }, + { + "epoch": 2.0984607556290547, + "ewc_loss": 0.07336589694023132, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003674480540212244, + "grad_norm": 8.556567192077637, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8527548313140869, + "num_tokens": 629311398.0, + "step": 16496 + }, + { + "epoch": 2.098587965907645, + "ewc_loss": 0.07392852753400803, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003730743483174592, + "grad_norm": 8.527143478393555, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.860108494758606, + "num_tokens": 629349488.0, + "step": 16497 + }, + { + "epoch": 2.0987151761862357, + "ewc_loss": 0.07372258603572845, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037101490306667984, + "grad_norm": 8.584700584411621, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.880152702331543, + "num_tokens": 629388023.0, + "step": 16498 + }, + { + "epoch": 2.0988423864648262, + "ewc_loss": 0.07350867241621017, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036887580063194036, + "grad_norm": 8.529622077941895, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8661969304084778, + "num_tokens": 629432140.0, + "step": 16499 + }, + { + "epoch": 2.0989695967434168, + "ewc_loss": 0.07397270947694778, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003710747405420989, + "grad_norm": 8.509934425354004, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.851714015007019, + "num_tokens": 629480690.0, + "step": 16500 + }, + { + "epoch": 2.0990968070220073, + "ewc_loss": 0.07372377812862396, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037102680653333664, + "grad_norm": 8.625097274780273, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8828582763671875, + "num_tokens": 629522500.0, + "step": 16501 + }, + { + "epoch": 2.099224017300598, + "ewc_loss": 0.07339955866336823, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036778461071662605, + "grad_norm": 8.450905799865723, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8673686981201172, + "num_tokens": 629563450.0, + "step": 16502 + }, + { + "epoch": 2.0993512275791884, + "ewc_loss": 0.0739949494600296, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037373852683231235, + "grad_norm": 8.602080345153809, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8753551244735718, + "num_tokens": 629603753.0, + "step": 16503 + }, + { + "epoch": 2.099478437857779, + "ewc_loss": 0.07337392866611481, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003675283514894545, + "grad_norm": 8.552653312683105, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.868067741394043, + "num_tokens": 629637828.0, + "step": 16504 + }, + { + "epoch": 2.0996056481363694, + "ewc_loss": 0.0738440528512001, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037222960963845253, + "grad_norm": 8.525352478027344, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8720033168792725, + "num_tokens": 629674636.0, + "step": 16505 + }, + { + "epoch": 2.09973285841496, + "ewc_loss": 0.07352553308010101, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003690443991217762, + "grad_norm": 8.550543785095215, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8562980890274048, + "num_tokens": 629717046.0, + "step": 16506 + }, + { + "epoch": 2.0998600686935505, + "ewc_loss": 0.0735120177268982, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003689092118293047, + "grad_norm": 8.467013359069824, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8770703673362732, + "num_tokens": 629750331.0, + "step": 16507 + }, + { + "epoch": 2.099987278972141, + "ewc_loss": 0.0739261582493782, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000373050628695637, + "grad_norm": 8.63547134399414, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8471773266792297, + "num_tokens": 629790095.0, + "step": 16508 + }, + { + "epoch": 2.1001144892507315, + "ewc_loss": 0.0732228234410286, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003660172806121409, + "grad_norm": 8.489052772521973, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8803101778030396, + "num_tokens": 629823035.0, + "step": 16509 + }, + { + "epoch": 2.100241699529322, + "ewc_loss": 0.07416295260190964, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003729771706275642, + "grad_norm": 8.61122989654541, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8610923290252686, + "num_tokens": 629857096.0, + "step": 16510 + }, + { + "epoch": 2.1003689098079126, + "ewc_loss": 0.07331056892871857, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003668947028927505, + "grad_norm": 8.55673599243164, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8642168045043945, + "num_tokens": 629893448.0, + "step": 16511 + }, + { + "epoch": 2.100496120086503, + "ewc_loss": 0.0737655758857727, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003714448248501867, + "grad_norm": 8.542280197143555, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8643560409545898, + "num_tokens": 629931301.0, + "step": 16512 + }, + { + "epoch": 2.1006233303650936, + "ewc_loss": 0.07351874560117722, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003689765289891511, + "grad_norm": 8.560585021972656, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8726707696914673, + "num_tokens": 629963481.0, + "step": 16513 + }, + { + "epoch": 2.100750540643684, + "ewc_loss": 0.0734572634100914, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003683617105707526, + "grad_norm": 8.50651741027832, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8617061972618103, + "num_tokens": 630002739.0, + "step": 16514 + }, + { + "epoch": 2.1008777509222747, + "ewc_loss": 0.07360590994358063, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003698481887113303, + "grad_norm": 8.515874862670898, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8580313920974731, + "num_tokens": 630041206.0, + "step": 16515 + }, + { + "epoch": 2.101004961200865, + "ewc_loss": 0.07356428354978561, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003694318875204772, + "grad_norm": 8.530861854553223, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8523557186126709, + "num_tokens": 630083811.0, + "step": 16516 + }, + { + "epoch": 2.1011321714794557, + "ewc_loss": 0.07361553609371185, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003699444350786507, + "grad_norm": 8.498799324035645, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8626962900161743, + "num_tokens": 630119823.0, + "step": 16517 + }, + { + "epoch": 2.1012593817580463, + "ewc_loss": 0.07359229028224945, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003697119536809623, + "grad_norm": 8.45893383026123, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.866847038269043, + "num_tokens": 630163807.0, + "step": 16518 + }, + { + "epoch": 2.1013865920366364, + "ewc_loss": 0.0736895203590393, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003706842253450304, + "grad_norm": 8.5709867477417, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8606423139572144, + "num_tokens": 630203038.0, + "step": 16519 + }, + { + "epoch": 2.101513802315227, + "ewc_loss": 0.07357507944107056, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036953980452381074, + "grad_norm": 8.516682624816895, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8700746297836304, + "num_tokens": 630238505.0, + "step": 16520 + }, + { + "epoch": 2.1016410125938174, + "ewc_loss": 0.07378280162811279, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003716170322149992, + "grad_norm": 8.572022438049316, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8630528450012207, + "num_tokens": 630275279.0, + "step": 16521 + }, + { + "epoch": 2.101768222872408, + "ewc_loss": 0.0735512524843216, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003693015896715224, + "grad_norm": 8.54356575012207, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8621264696121216, + "num_tokens": 630317288.0, + "step": 16522 + }, + { + "epoch": 2.1018954331509985, + "ewc_loss": 0.07381001114845276, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037188915302976966, + "grad_norm": 8.584067344665527, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8654739856719971, + "num_tokens": 630352786.0, + "step": 16523 + }, + { + "epoch": 2.102022643429589, + "ewc_loss": 0.07350975275039673, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003688865399453789, + "grad_norm": 8.46859359741211, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.866551399230957, + "num_tokens": 630391138.0, + "step": 16524 + }, + { + "epoch": 2.1021498537081795, + "ewc_loss": 0.07414895296096802, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003728372394107282, + "grad_norm": 8.59837818145752, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8529960513114929, + "num_tokens": 630426567.0, + "step": 16525 + }, + { + "epoch": 2.10227706398677, + "ewc_loss": 0.07349740713834763, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036876313970424235, + "grad_norm": 8.529539108276367, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8542725443840027, + "num_tokens": 630467308.0, + "step": 16526 + }, + { + "epoch": 2.1024042742653606, + "ewc_loss": 0.07380388677120209, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037182788946665823, + "grad_norm": 8.571627616882324, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8753635883331299, + "num_tokens": 630503333.0, + "step": 16527 + }, + { + "epoch": 2.102531484543951, + "ewc_loss": 0.07358972728252411, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036968637141399086, + "grad_norm": 8.54396915435791, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8611229658126831, + "num_tokens": 630536838.0, + "step": 16528 + }, + { + "epoch": 2.1026586948225416, + "ewc_loss": 0.073820099234581, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003719900851137936, + "grad_norm": 8.597847938537598, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.878678023815155, + "num_tokens": 630570833.0, + "step": 16529 + }, + { + "epoch": 2.102785905101132, + "ewc_loss": 0.0735757052898407, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003695461491588503, + "grad_norm": 8.47376823425293, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8740262985229492, + "num_tokens": 630615043.0, + "step": 16530 + }, + { + "epoch": 2.1029131153797227, + "ewc_loss": 0.07382437586784363, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003720328095369041, + "grad_norm": 8.663055419921875, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.888098418712616, + "num_tokens": 630651368.0, + "step": 16531 + }, + { + "epoch": 2.1030403256583132, + "ewc_loss": 0.07370224595069885, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003683700633700937, + "grad_norm": 8.52590560913086, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8578606247901917, + "num_tokens": 630690341.0, + "step": 16532 + }, + { + "epoch": 2.1031675359369038, + "ewc_loss": 0.07385147362947464, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037230379530228674, + "grad_norm": 8.590797424316406, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8618786334991455, + "num_tokens": 630727168.0, + "step": 16533 + }, + { + "epoch": 2.1032947462154943, + "ewc_loss": 0.07334873825311661, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000367276428733021, + "grad_norm": 8.512043952941895, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8699541091918945, + "num_tokens": 630766737.0, + "step": 16534 + }, + { + "epoch": 2.103421956494085, + "ewc_loss": 0.0737336277961731, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003711253229994327, + "grad_norm": 8.598073959350586, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.862599790096283, + "num_tokens": 630803721.0, + "step": 16535 + }, + { + "epoch": 2.1035491667726753, + "ewc_loss": 0.07349307835102081, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036871980410069227, + "grad_norm": 8.473185539245605, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8710795640945435, + "num_tokens": 630843697.0, + "step": 16536 + }, + { + "epoch": 2.103676377051266, + "ewc_loss": 0.07380601763725281, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037184928078204393, + "grad_norm": 8.67794132232666, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8657909631729126, + "num_tokens": 630874325.0, + "step": 16537 + }, + { + "epoch": 2.1038035873298564, + "ewc_loss": 0.07342840731143951, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036807311698794365, + "grad_norm": 8.48005199432373, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8627586364746094, + "num_tokens": 630911456.0, + "step": 16538 + }, + { + "epoch": 2.103930797608447, + "ewc_loss": 0.0739230215549469, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003730192838702351, + "grad_norm": 8.603340148925781, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.864224374294281, + "num_tokens": 630954418.0, + "step": 16539 + }, + { + "epoch": 2.1040580078870375, + "ewc_loss": 0.07343624532222748, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036815155181102455, + "grad_norm": 8.544812202453613, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8654161095619202, + "num_tokens": 630991826.0, + "step": 16540 + }, + { + "epoch": 2.104185218165628, + "ewc_loss": 0.07387620210647583, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037255106144584715, + "grad_norm": 8.62918758392334, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8668336868286133, + "num_tokens": 631025831.0, + "step": 16541 + }, + { + "epoch": 2.1043124284442185, + "ewc_loss": 0.07385878264904022, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003699355002027005, + "grad_norm": 8.529003143310547, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8523335456848145, + "num_tokens": 631062896.0, + "step": 16542 + }, + { + "epoch": 2.1044396387228086, + "ewc_loss": 0.07412119209766388, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037255953066051006, + "grad_norm": 8.630836486816406, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.871614933013916, + "num_tokens": 631099291.0, + "step": 16543 + }, + { + "epoch": 2.104566849001399, + "ewc_loss": 0.07355984300374985, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003693874750752002, + "grad_norm": 8.587730407714844, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8557051420211792, + "num_tokens": 631141570.0, + "step": 16544 + }, + { + "epoch": 2.1046940592799896, + "ewc_loss": 0.07394252717494965, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037077293382026255, + "grad_norm": 8.536911964416504, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8582180142402649, + "num_tokens": 631178882.0, + "step": 16545 + }, + { + "epoch": 2.10482126955858, + "ewc_loss": 0.0740346759557724, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003716943901963532, + "grad_norm": 8.575069427490234, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8619427680969238, + "num_tokens": 631218059.0, + "step": 16546 + }, + { + "epoch": 2.1049484798371707, + "ewc_loss": 0.07389280200004578, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037027563666924834, + "grad_norm": 8.556249618530273, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8668500185012817, + "num_tokens": 631252827.0, + "step": 16547 + }, + { + "epoch": 2.1050756901157612, + "ewc_loss": 0.07396996766328812, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037104732473380864, + "grad_norm": 8.539674758911133, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8724563121795654, + "num_tokens": 631294370.0, + "step": 16548 + }, + { + "epoch": 2.1052029003943518, + "ewc_loss": 0.07374313473701477, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003712204343173653, + "grad_norm": 8.51823616027832, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.86191725730896, + "num_tokens": 631331091.0, + "step": 16549 + }, + { + "epoch": 2.1053301106729423, + "ewc_loss": 0.07375733554363251, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037136240280233324, + "grad_norm": 8.577005386352539, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8683264851570129, + "num_tokens": 631367285.0, + "step": 16550 + }, + { + "epoch": 2.105457320951533, + "ewc_loss": 0.07371354103088379, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003709244483616203, + "grad_norm": 8.535812377929688, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8527932167053223, + "num_tokens": 631402471.0, + "step": 16551 + }, + { + "epoch": 2.1055845312301233, + "ewc_loss": 0.07387056946754456, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003724948037415743, + "grad_norm": 8.560013771057129, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8639379739761353, + "num_tokens": 631439706.0, + "step": 16552 + }, + { + "epoch": 2.105711741508714, + "ewc_loss": 0.07376116514205933, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037140073254704475, + "grad_norm": 8.555046081542969, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8710922002792358, + "num_tokens": 631471437.0, + "step": 16553 + }, + { + "epoch": 2.1058389517873044, + "ewc_loss": 0.07401193678379059, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037146706017665565, + "grad_norm": 8.525699615478516, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8618862628936768, + "num_tokens": 631515028.0, + "step": 16554 + }, + { + "epoch": 2.105966162065895, + "ewc_loss": 0.07379709184169769, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037176001933403313, + "grad_norm": 8.567499160766602, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8601893186569214, + "num_tokens": 631552946.0, + "step": 16555 + }, + { + "epoch": 2.1060933723444855, + "ewc_loss": 0.07392913848161697, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003706390270963311, + "grad_norm": 8.556700706481934, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8693467378616333, + "num_tokens": 631596045.0, + "step": 16556 + }, + { + "epoch": 2.106220582623076, + "ewc_loss": 0.0740242451429367, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003715901402756572, + "grad_norm": 8.483988761901855, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8746007680892944, + "num_tokens": 631630233.0, + "step": 16557 + }, + { + "epoch": 2.1063477929016665, + "ewc_loss": 0.07413724064826965, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037272003828547895, + "grad_norm": 8.559388160705566, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8712879419326782, + "num_tokens": 631666665.0, + "step": 16558 + }, + { + "epoch": 2.106475003180257, + "ewc_loss": 0.07391378283500671, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003704854752868414, + "grad_norm": 8.561966896057129, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8624647259712219, + "num_tokens": 631709536.0, + "step": 16559 + }, + { + "epoch": 2.1066022134588476, + "ewc_loss": 0.07384048402309418, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037219395744614303, + "grad_norm": 8.522537231445312, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8795171976089478, + "num_tokens": 631744333.0, + "step": 16560 + }, + { + "epoch": 2.106729423737438, + "ewc_loss": 0.07391129434108734, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003729020245373249, + "grad_norm": 8.570940971374512, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8504173159599304, + "num_tokens": 631783522.0, + "step": 16561 + }, + { + "epoch": 2.1068566340160286, + "ewc_loss": 0.07373341917991638, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003711232275236398, + "grad_norm": 8.49577522277832, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8489980101585388, + "num_tokens": 631822579.0, + "step": 16562 + }, + { + "epoch": 2.106983844294619, + "ewc_loss": 0.07422178238630295, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003735654754564166, + "grad_norm": 8.562995910644531, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8731880784034729, + "num_tokens": 631862642.0, + "step": 16563 + }, + { + "epoch": 2.1071110545732097, + "ewc_loss": 0.07397471368312836, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037109479308128357, + "grad_norm": 8.507499694824219, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8761888742446899, + "num_tokens": 631903161.0, + "step": 16564 + }, + { + "epoch": 2.1072382648518, + "ewc_loss": 0.0741809830069542, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037315746885724366, + "grad_norm": 8.568197250366211, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8679214119911194, + "num_tokens": 631938149.0, + "step": 16565 + }, + { + "epoch": 2.1073654751303907, + "ewc_loss": 0.07398802787065506, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003712279431056231, + "grad_norm": 8.483907699584961, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8672634363174438, + "num_tokens": 631974717.0, + "step": 16566 + }, + { + "epoch": 2.107492685408981, + "ewc_loss": 0.0743236318230629, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037458399310708046, + "grad_norm": 8.55203628540039, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8698124885559082, + "num_tokens": 632011147.0, + "step": 16567 + }, + { + "epoch": 2.1076198956875714, + "ewc_loss": 0.0739932656288147, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003712803591042757, + "grad_norm": 8.519790649414062, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8723370432853699, + "num_tokens": 632051915.0, + "step": 16568 + }, + { + "epoch": 2.107747105966162, + "ewc_loss": 0.07421689480543137, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037351661012507975, + "grad_norm": 8.639457702636719, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8557807207107544, + "num_tokens": 632083047.0, + "step": 16569 + }, + { + "epoch": 2.1078743162447524, + "ewc_loss": 0.07405975461006165, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037194520700722933, + "grad_norm": 8.529520034790039, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8775527477264404, + "num_tokens": 632119250.0, + "step": 16570 + }, + { + "epoch": 2.108001526523343, + "ewc_loss": 0.07454396784305573, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037434595287777483, + "grad_norm": 8.586080551147461, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.870722770690918, + "num_tokens": 632152696.0, + "step": 16571 + }, + { + "epoch": 2.1081287368019335, + "ewc_loss": 0.07427101582288742, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037161639193072915, + "grad_norm": 8.495490074157715, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8871172666549683, + "num_tokens": 632188167.0, + "step": 16572 + }, + { + "epoch": 2.108255947080524, + "ewc_loss": 0.07412089407444, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037499796599149704, + "grad_norm": 8.582136154174805, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8626903295516968, + "num_tokens": 632229567.0, + "step": 16573 + }, + { + "epoch": 2.1083831573591145, + "ewc_loss": 0.07376793771982193, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003714684280566871, + "grad_norm": 8.487533569335938, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8755971193313599, + "num_tokens": 632266870.0, + "step": 16574 + }, + { + "epoch": 2.108510367637705, + "ewc_loss": 0.07409780472517014, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037476711440831423, + "grad_norm": 8.54514217376709, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8653533458709717, + "num_tokens": 632303844.0, + "step": 16575 + }, + { + "epoch": 2.1086375779162956, + "ewc_loss": 0.07386691868305206, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003724582202266902, + "grad_norm": 8.540339469909668, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8675808310508728, + "num_tokens": 632339343.0, + "step": 16576 + }, + { + "epoch": 2.108764788194886, + "ewc_loss": 0.07392340898513794, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037302315467968583, + "grad_norm": 8.503599166870117, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8690015077590942, + "num_tokens": 632385516.0, + "step": 16577 + }, + { + "epoch": 2.1088919984734766, + "ewc_loss": 0.07403402030467987, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000374129245756194, + "grad_norm": 8.573949813842773, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8708081841468811, + "num_tokens": 632420664.0, + "step": 16578 + }, + { + "epoch": 2.109019208752067, + "ewc_loss": 0.07372109591960907, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037100003100931644, + "grad_norm": 8.49364185333252, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8713302612304688, + "num_tokens": 632458082.0, + "step": 16579 + }, + { + "epoch": 2.1091464190306577, + "ewc_loss": 0.07405833899974823, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037437243736349046, + "grad_norm": 8.571526527404785, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8539396524429321, + "num_tokens": 632495963.0, + "step": 16580 + }, + { + "epoch": 2.109273629309248, + "ewc_loss": 0.07382351160049438, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037202422390691936, + "grad_norm": 8.52839183807373, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8672831654548645, + "num_tokens": 632529968.0, + "step": 16581 + }, + { + "epoch": 2.1094008395878387, + "ewc_loss": 0.07406578958034515, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003744469431694597, + "grad_norm": 8.603472709655762, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8733206987380981, + "num_tokens": 632567527.0, + "step": 16582 + }, + { + "epoch": 2.1095280498664293, + "ewc_loss": 0.07362008094787598, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003699898661579937, + "grad_norm": 8.47605037689209, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8648724555969238, + "num_tokens": 632606192.0, + "step": 16583 + }, + { + "epoch": 2.10965526014502, + "ewc_loss": 0.07412682473659515, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003750573087017983, + "grad_norm": 8.56794548034668, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8768061399459839, + "num_tokens": 632645940.0, + "step": 16584 + }, + { + "epoch": 2.1097824704236103, + "ewc_loss": 0.07377780973911285, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003715671773534268, + "grad_norm": 8.521367073059082, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8741273880004883, + "num_tokens": 632689385.0, + "step": 16585 + }, + { + "epoch": 2.109909680702201, + "ewc_loss": 0.07401570677757263, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003739461535587907, + "grad_norm": 8.647656440734863, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8709214329719543, + "num_tokens": 632720178.0, + "step": 16586 + }, + { + "epoch": 2.1100368909807914, + "ewc_loss": 0.07395347952842712, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003708824224304408, + "grad_norm": 8.525979042053223, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.857886016368866, + "num_tokens": 632763249.0, + "step": 16587 + }, + { + "epoch": 2.110164101259382, + "ewc_loss": 0.0740971565246582, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037476065335795283, + "grad_norm": 8.617274284362793, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8466590642929077, + "num_tokens": 632803293.0, + "step": 16588 + }, + { + "epoch": 2.1102913115379724, + "ewc_loss": 0.0736897736787796, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003706868155859411, + "grad_norm": 8.572941780090332, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8578050136566162, + "num_tokens": 632838731.0, + "step": 16589 + }, + { + "epoch": 2.110418521816563, + "ewc_loss": 0.07432051002979279, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037455270648933947, + "grad_norm": 8.64887523651123, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8556190133094788, + "num_tokens": 632875044.0, + "step": 16590 + }, + { + "epoch": 2.1105457320951535, + "ewc_loss": 0.07379413396120071, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003717303916346282, + "grad_norm": 8.574222564697266, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8454599976539612, + "num_tokens": 632911330.0, + "step": 16591 + }, + { + "epoch": 2.1106729423737436, + "ewc_loss": 0.07436779141426086, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003725841816049069, + "grad_norm": 8.601359367370605, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.855958878993988, + "num_tokens": 632952122.0, + "step": 16592 + }, + { + "epoch": 2.110800152652334, + "ewc_loss": 0.07409151643514633, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037226281710900366, + "grad_norm": 8.643325805664062, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8475010395050049, + "num_tokens": 632985602.0, + "step": 16593 + }, + { + "epoch": 2.1109273629309246, + "ewc_loss": 0.07388357818126678, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037018346483819187, + "grad_norm": 8.541653633117676, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8657453656196594, + "num_tokens": 633023792.0, + "step": 16594 + }, + { + "epoch": 2.111054573209515, + "ewc_loss": 0.07413394749164581, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003726871218532324, + "grad_norm": 8.639679908752441, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8682252168655396, + "num_tokens": 633065457.0, + "step": 16595 + }, + { + "epoch": 2.1111817834881057, + "ewc_loss": 0.07368685305118561, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003682161623146385, + "grad_norm": 8.5543851852417, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8737283945083618, + "num_tokens": 633101402.0, + "step": 16596 + }, + { + "epoch": 2.1113089937666962, + "ewc_loss": 0.07378798723220825, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037166898255236447, + "grad_norm": 8.549042701721191, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8573501706123352, + "num_tokens": 633147002.0, + "step": 16597 + }, + { + "epoch": 2.1114362040452868, + "ewc_loss": 0.0738963931798935, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003703115798998624, + "grad_norm": 8.570239067077637, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.864822268486023, + "num_tokens": 633184209.0, + "step": 16598 + }, + { + "epoch": 2.1115634143238773, + "ewc_loss": 0.07390785217285156, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003704262198880315, + "grad_norm": 8.586674690246582, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8697605133056641, + "num_tokens": 633224525.0, + "step": 16599 + }, + { + "epoch": 2.111690624602468, + "ewc_loss": 0.07393354177474976, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003706830320879817, + "grad_norm": 8.55651569366455, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8784434795379639, + "num_tokens": 633253171.0, + "step": 16600 + }, + { + "epoch": 2.1118178348810583, + "ewc_loss": 0.07421447336673737, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037105093360878527, + "grad_norm": 8.589556694030762, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8627716898918152, + "num_tokens": 633294326.0, + "step": 16601 + }, + { + "epoch": 2.111945045159649, + "ewc_loss": 0.07385995984077454, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000369947258150205, + "grad_norm": 8.534969329833984, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8803116679191589, + "num_tokens": 633327236.0, + "step": 16602 + }, + { + "epoch": 2.1120722554382394, + "ewc_loss": 0.07430966198444366, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037200291990302503, + "grad_norm": 8.554583549499512, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8760040998458862, + "num_tokens": 633368593.0, + "step": 16603 + }, + { + "epoch": 2.11219946571683, + "ewc_loss": 0.07418425381183624, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003707488358486444, + "grad_norm": 8.696313858032227, + "learning_rate": 1e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.8456741571426392, + "num_tokens": 633402598.0, + "step": 16604 + }, + { + "epoch": 2.1123266759954205, + "ewc_loss": 0.07370250672101974, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00036837271181866527, + "grad_norm": 8.501797676086426, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8603343367576599, + "num_tokens": 633437069.0, + "step": 16605 + }, + { + "epoch": 2.112453886274011, + "ewc_loss": 0.07427634298801422, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000374111084965989, + "grad_norm": 8.586848258972168, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8676439523696899, + "num_tokens": 633479752.0, + "step": 16606 + }, + { + "epoch": 2.1125810965526015, + "ewc_loss": 0.07380615174770355, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003694092156365514, + "grad_norm": 8.5332670211792, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8628851175308228, + "num_tokens": 633516949.0, + "step": 16607 + }, + { + "epoch": 2.112708306831192, + "ewc_loss": 0.07413721829652786, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037271983455866575, + "grad_norm": 8.610902786254883, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8694874048233032, + "num_tokens": 633547511.0, + "step": 16608 + }, + { + "epoch": 2.1128355171097826, + "ewc_loss": 0.07403380423784256, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003716857172548771, + "grad_norm": 8.597076416015625, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8679858446121216, + "num_tokens": 633583940.0, + "step": 16609 + }, + { + "epoch": 2.112962727388373, + "ewc_loss": 0.07395422458648682, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003708899312186986, + "grad_norm": 8.533136367797852, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8545098304748535, + "num_tokens": 633626245.0, + "step": 16610 + }, + { + "epoch": 2.1130899376669636, + "ewc_loss": 0.07427550852298737, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037166138645261526, + "grad_norm": 8.562843322753906, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8688868284225464, + "num_tokens": 633662889.0, + "step": 16611 + }, + { + "epoch": 2.113217147945554, + "ewc_loss": 0.07425985485315323, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037150480784475803, + "grad_norm": 8.602849960327148, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.862151563167572, + "num_tokens": 633701461.0, + "step": 16612 + }, + { + "epoch": 2.1133443582241447, + "ewc_loss": 0.07424341887235641, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003713404294103384, + "grad_norm": 8.598139762878418, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.866407573223114, + "num_tokens": 633741302.0, + "step": 16613 + }, + { + "epoch": 2.113471568502735, + "ewc_loss": 0.07400566339492798, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003714043414220214, + "grad_norm": 8.537609100341797, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8705185651779175, + "num_tokens": 633780337.0, + "step": 16614 + }, + { + "epoch": 2.1135987787813257, + "ewc_loss": 0.07430075109004974, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037191371666267514, + "grad_norm": 8.615657806396484, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.872194766998291, + "num_tokens": 633814450.0, + "step": 16615 + }, + { + "epoch": 2.1137259890599163, + "ewc_loss": 0.07389113306999207, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037025896017439663, + "grad_norm": 8.560397148132324, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8580396771430969, + "num_tokens": 633856252.0, + "step": 16616 + }, + { + "epoch": 2.1138531993385064, + "ewc_loss": 0.07388971745967865, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037268619053065777, + "grad_norm": 8.61526870727539, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8536559343338013, + "num_tokens": 633895235.0, + "step": 16617 + }, + { + "epoch": 2.113980409617097, + "ewc_loss": 0.07357637584209442, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036955284303985536, + "grad_norm": 8.558029174804688, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.86769700050354, + "num_tokens": 633938948.0, + "step": 16618 + }, + { + "epoch": 2.1141076198956874, + "ewc_loss": 0.0741799920797348, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037314757355488837, + "grad_norm": 8.64188289642334, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8749122619628906, + "num_tokens": 633973393.0, + "step": 16619 + }, + { + "epoch": 2.114234830174278, + "ewc_loss": 0.07358291000127792, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036961815203540027, + "grad_norm": 8.52626895904541, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8713815808296204, + "num_tokens": 634006714.0, + "step": 16620 + }, + { + "epoch": 2.1143620404528685, + "ewc_loss": 0.07387647777795792, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037255382630974054, + "grad_norm": 8.667129516601562, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8712334632873535, + "num_tokens": 634046104.0, + "step": 16621 + }, + { + "epoch": 2.114489250731459, + "ewc_loss": 0.07351294159889221, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036891846684738994, + "grad_norm": 8.475363731384277, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8575353622436523, + "num_tokens": 634086401.0, + "step": 16622 + }, + { + "epoch": 2.1146164610100495, + "ewc_loss": 0.07404523342847824, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003742413828149438, + "grad_norm": 8.688959121704102, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8569968342781067, + "num_tokens": 634125479.0, + "step": 16623 + }, + { + "epoch": 2.11474367128864, + "ewc_loss": 0.07334001362323761, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036718917544931173, + "grad_norm": 8.513461112976074, + "learning_rate": 1e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8472665548324585, + "num_tokens": 634163873.0, + "step": 16624 + }, + { + "epoch": 2.1148708815672306, + "ewc_loss": 0.07406721264123917, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003744612040463835, + "grad_norm": 8.663593292236328, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.863702118396759, + "num_tokens": 634200533.0, + "step": 16625 + }, + { + "epoch": 2.114998091845821, + "ewc_loss": 0.07353570312261581, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036914608790539205, + "grad_norm": 8.549675941467285, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8583500385284424, + "num_tokens": 634235891.0, + "step": 16626 + }, + { + "epoch": 2.1151253021244116, + "ewc_loss": 0.07385508716106415, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000372339942259714, + "grad_norm": 8.607937812805176, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8649500608444214, + "num_tokens": 634266748.0, + "step": 16627 + }, + { + "epoch": 2.115252512403002, + "ewc_loss": 0.0735754519701004, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003695435880217701, + "grad_norm": 8.629044532775879, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8525531888008118, + "num_tokens": 634302603.0, + "step": 16628 + }, + { + "epoch": 2.1153797226815927, + "ewc_loss": 0.07360772043466568, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003698662621900439, + "grad_norm": 8.543850898742676, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8671830892562866, + "num_tokens": 634345471.0, + "step": 16629 + }, + { + "epoch": 2.115506932960183, + "ewc_loss": 0.07361750304698944, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036996410926803946, + "grad_norm": 8.567549705505371, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8639765977859497, + "num_tokens": 634383902.0, + "step": 16630 + }, + { + "epoch": 2.1156341432387737, + "ewc_loss": 0.07353803515434265, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003691694582812488, + "grad_norm": 8.56556510925293, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8567745089530945, + "num_tokens": 634421082.0, + "step": 16631 + }, + { + "epoch": 2.1157613535173643, + "ewc_loss": 0.07356192171573639, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036940822610631585, + "grad_norm": 8.56982707977295, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8606375455856323, + "num_tokens": 634456490.0, + "step": 16632 + }, + { + "epoch": 2.115888563795955, + "ewc_loss": 0.07385017722845078, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003698494110722095, + "grad_norm": 8.613431930541992, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8469104766845703, + "num_tokens": 634498252.0, + "step": 16633 + }, + { + "epoch": 2.1160157740745453, + "ewc_loss": 0.07385715842247009, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003699192020576447, + "grad_norm": 8.566439628601074, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8743458986282349, + "num_tokens": 634540769.0, + "step": 16634 + }, + { + "epoch": 2.116142984353136, + "ewc_loss": 0.07419070601463318, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003708132717292756, + "grad_norm": 8.592804908752441, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8728749752044678, + "num_tokens": 634573807.0, + "step": 16635 + }, + { + "epoch": 2.1162701946317264, + "ewc_loss": 0.07366511970758438, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037044024793431163, + "grad_norm": 8.55892562866211, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.860629141330719, + "num_tokens": 634609542.0, + "step": 16636 + }, + { + "epoch": 2.116397404910317, + "ewc_loss": 0.07360708713531494, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003698599466588348, + "grad_norm": 8.536921501159668, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8728647232055664, + "num_tokens": 634648641.0, + "step": 16637 + }, + { + "epoch": 2.1165246151889074, + "ewc_loss": 0.07387637346982956, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000370111403753981, + "grad_norm": 8.601357460021973, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.863618016242981, + "num_tokens": 634685531.0, + "step": 16638 + }, + { + "epoch": 2.116651825467498, + "ewc_loss": 0.07384496927261353, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003697973443195224, + "grad_norm": 8.594244003295898, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8677394390106201, + "num_tokens": 634717239.0, + "step": 16639 + }, + { + "epoch": 2.116779035746088, + "ewc_loss": 0.07386647909879684, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003700124507304281, + "grad_norm": 8.556170463562012, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8624957799911499, + "num_tokens": 634750058.0, + "step": 16640 + }, + { + "epoch": 2.1169062460246786, + "ewc_loss": 0.07427038252353668, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003716100472956896, + "grad_norm": 8.558769226074219, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.858683705329895, + "num_tokens": 634793480.0, + "step": 16641 + }, + { + "epoch": 2.117033456303269, + "ewc_loss": 0.07362812757492065, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000370070367353037, + "grad_norm": 8.51008129119873, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8563585877418518, + "num_tokens": 634829923.0, + "step": 16642 + }, + { + "epoch": 2.1171606665818596, + "ewc_loss": 0.07404576241970062, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037424673791974783, + "grad_norm": 8.661274909973145, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8597108721733093, + "num_tokens": 634868499.0, + "step": 16643 + }, + { + "epoch": 2.11728787686045, + "ewc_loss": 0.07358338683843613, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003696229250635952, + "grad_norm": 8.551587104797363, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8737996816635132, + "num_tokens": 634905922.0, + "step": 16644 + }, + { + "epoch": 2.1174150871390407, + "ewc_loss": 0.07391241192817688, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037291322951205075, + "grad_norm": 8.626554489135742, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8669989109039307, + "num_tokens": 634946190.0, + "step": 16645 + }, + { + "epoch": 2.1175422974176312, + "ewc_loss": 0.07411366701126099, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037004295154474676, + "grad_norm": 8.59113597869873, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8769116997718811, + "num_tokens": 634981840.0, + "step": 16646 + }, + { + "epoch": 2.1176695076962218, + "ewc_loss": 0.07378710806369781, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037166019319556653, + "grad_norm": 8.633061408996582, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8631640076637268, + "num_tokens": 635024755.0, + "step": 16647 + }, + { + "epoch": 2.1177967179748123, + "ewc_loss": 0.07413873076438904, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003702935064211488, + "grad_norm": 8.565084457397461, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8696558475494385, + "num_tokens": 635063076.0, + "step": 16648 + }, + { + "epoch": 2.117923928253403, + "ewc_loss": 0.07376512140035629, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037144028465263546, + "grad_norm": 8.61331844329834, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8767387866973877, + "num_tokens": 635103052.0, + "step": 16649 + }, + { + "epoch": 2.1180511385319933, + "ewc_loss": 0.07354480773210526, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036923715379089117, + "grad_norm": 8.50200080871582, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8741673231124878, + "num_tokens": 635140917.0, + "step": 16650 + }, + { + "epoch": 2.118178348810584, + "ewc_loss": 0.07440000772476196, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037290636100806296, + "grad_norm": 8.630623817443848, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8692703247070312, + "num_tokens": 635182049.0, + "step": 16651 + }, + { + "epoch": 2.1183055590891744, + "ewc_loss": 0.07372094690799713, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003709985176101327, + "grad_norm": 8.59676456451416, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8583650588989258, + "num_tokens": 635217613.0, + "step": 16652 + }, + { + "epoch": 2.118432769367765, + "ewc_loss": 0.07429715991020203, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037187780253589153, + "grad_norm": 8.592771530151367, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.875009298324585, + "num_tokens": 635252716.0, + "step": 16653 + }, + { + "epoch": 2.1185599796463555, + "ewc_loss": 0.07382522523403168, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003720413660630584, + "grad_norm": 8.578558921813965, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8682506084442139, + "num_tokens": 635288919.0, + "step": 16654 + }, + { + "epoch": 2.118687189924946, + "ewc_loss": 0.07379835844039917, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037177259218879044, + "grad_norm": 8.57858657836914, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8599808216094971, + "num_tokens": 635329781.0, + "step": 16655 + }, + { + "epoch": 2.1188144002035365, + "ewc_loss": 0.07374584674835205, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037124755908735096, + "grad_norm": 8.571922302246094, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.873809278011322, + "num_tokens": 635363844.0, + "step": 16656 + }, + { + "epoch": 2.118941610482127, + "ewc_loss": 0.07375932484865189, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037138230982236564, + "grad_norm": 8.58165168762207, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.861160933971405, + "num_tokens": 635397568.0, + "step": 16657 + }, + { + "epoch": 2.1190688207607176, + "ewc_loss": 0.07376694679260254, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037145850365050137, + "grad_norm": 8.622627258300781, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8510070443153381, + "num_tokens": 635432903.0, + "step": 16658 + }, + { + "epoch": 2.119196031039308, + "ewc_loss": 0.0736112967133522, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00036990203079767525, + "grad_norm": 8.571131706237793, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8659042119979858, + "num_tokens": 635473388.0, + "step": 16659 + }, + { + "epoch": 2.1193232413178986, + "ewc_loss": 0.07371044158935547, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037089348188601434, + "grad_norm": 8.552484512329102, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8681386709213257, + "num_tokens": 635512966.0, + "step": 16660 + }, + { + "epoch": 2.119450451596489, + "ewc_loss": 0.07373106479644775, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003710996825248003, + "grad_norm": 8.592519760131836, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8513896465301514, + "num_tokens": 635547814.0, + "step": 16661 + }, + { + "epoch": 2.1195776618750797, + "ewc_loss": 0.07367591559886932, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037054825224913657, + "grad_norm": 8.504298210144043, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8559993505477905, + "num_tokens": 635583120.0, + "step": 16662 + }, + { + "epoch": 2.11970487215367, + "ewc_loss": 0.07389947772026062, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003727838338818401, + "grad_norm": 8.670843124389648, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8601449131965637, + "num_tokens": 635621531.0, + "step": 16663 + }, + { + "epoch": 2.1198320824322607, + "ewc_loss": 0.07398481667041779, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00036875440855510533, + "grad_norm": 8.51282787322998, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8532145023345947, + "num_tokens": 635659641.0, + "step": 16664 + }, + { + "epoch": 2.119959292710851, + "ewc_loss": 0.0745638906955719, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037454519770108163, + "grad_norm": 8.641729354858398, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8601099848747253, + "num_tokens": 635697953.0, + "step": 16665 + }, + { + "epoch": 2.1200865029894413, + "ewc_loss": 0.07397464662790298, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003686527197714895, + "grad_norm": 8.474498748779297, + "learning_rate": 1e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.840339720249176, + "num_tokens": 635742519.0, + "step": 16666 + }, + { + "epoch": 2.120213713268032, + "ewc_loss": 0.0742986649274826, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037433425313793123, + "grad_norm": 8.641456604003906, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8721109628677368, + "num_tokens": 635777786.0, + "step": 16667 + }, + { + "epoch": 2.1203409235466224, + "ewc_loss": 0.07407616078853607, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00036966786137782037, + "grad_norm": 8.489949226379395, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8661102056503296, + "num_tokens": 635814490.0, + "step": 16668 + }, + { + "epoch": 2.120468133825213, + "ewc_loss": 0.07461418211460114, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000375048111891374, + "grad_norm": 8.657242774963379, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.851921796798706, + "num_tokens": 635858351.0, + "step": 16669 + }, + { + "epoch": 2.1205953441038035, + "ewc_loss": 0.07407639920711517, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003696701896842569, + "grad_norm": 8.499024391174316, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8621878027915955, + "num_tokens": 635895585.0, + "step": 16670 + }, + { + "epoch": 2.120722554382394, + "ewc_loss": 0.07466757297515869, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037558202166110277, + "grad_norm": 8.633234024047852, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8715537190437317, + "num_tokens": 635933591.0, + "step": 16671 + }, + { + "epoch": 2.1208497646609845, + "ewc_loss": 0.074273981153965, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037164604873396456, + "grad_norm": 8.528223991394043, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.862615168094635, + "num_tokens": 635972040.0, + "step": 16672 + }, + { + "epoch": 2.120976974939575, + "ewc_loss": 0.07458033412694931, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037470957613550127, + "grad_norm": 8.622271537780762, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8683385848999023, + "num_tokens": 636013287.0, + "step": 16673 + }, + { + "epoch": 2.1211041852181656, + "ewc_loss": 0.07444250583648682, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003733313351403922, + "grad_norm": 9.222681999206543, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.86004638671875, + "num_tokens": 636046137.0, + "step": 16674 + }, + { + "epoch": 2.121231395496756, + "ewc_loss": 0.07365235686302185, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00036542979069054127, + "grad_norm": 8.403076171875, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.855526328086853, + "num_tokens": 636088870.0, + "step": 16675 + }, + { + "epoch": 2.1213586057753466, + "ewc_loss": 0.07632511854171753, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0003823917650151998, + "grad_norm": 53.964561462402344, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.874187171459198, + "num_tokens": 636130519.0, + "step": 16676 + }, + { + "epoch": 2.121485816053937, + "ewc_loss": 0.12775620818138123, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0009064683690667152, + "grad_norm": 14.326752662658691, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8682864904403687, + "num_tokens": 636165641.0, + "step": 16677 + }, + { + "epoch": 2.1216130263325277, + "ewc_loss": 0.07568292319774628, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003857355040963739, + "grad_norm": 7.767127513885498, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8508614301681519, + "num_tokens": 636203811.0, + "step": 16678 + }, + { + "epoch": 2.121740236611118, + "ewc_loss": 0.10702574253082275, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0006991637055762112, + "grad_norm": 13.076834678649902, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8763826489448547, + "num_tokens": 636239503.0, + "step": 16679 + }, + { + "epoch": 2.1218674468897087, + "ewc_loss": 0.11473090946674347, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0007762153400108218, + "grad_norm": 13.202176094055176, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8682349920272827, + "num_tokens": 636278994.0, + "step": 16680 + }, + { + "epoch": 2.1219946571682993, + "ewc_loss": 0.08556364476680756, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00048454265925101936, + "grad_norm": 9.072876930236816, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8619201183319092, + "num_tokens": 636318025.0, + "step": 16681 + }, + { + "epoch": 2.12212186744689, + "ewc_loss": 0.0912894606590271, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0005418008659034967, + "grad_norm": 11.253923416137695, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8673906326293945, + "num_tokens": 636357289.0, + "step": 16682 + }, + { + "epoch": 2.1222490777254803, + "ewc_loss": 0.09575165808200836, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0005864228587597609, + "grad_norm": 10.947880744934082, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8503193259239197, + "num_tokens": 636399067.0, + "step": 16683 + }, + { + "epoch": 2.122376288004071, + "ewc_loss": 0.08263574540615082, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0004601465188898146, + "grad_norm": 9.364108085632324, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8703283071517944, + "num_tokens": 636442418.0, + "step": 16684 + }, + { + "epoch": 2.1225034982826614, + "ewc_loss": 0.08435162901878357, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00047730538062751293, + "grad_norm": 10.130778312683105, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8525422215461731, + "num_tokens": 636484273.0, + "step": 16685 + }, + { + "epoch": 2.122630708561252, + "ewc_loss": 0.08390025049448013, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00047279158025048673, + "grad_norm": 9.58142375946045, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8725321292877197, + "num_tokens": 636525675.0, + "step": 16686 + }, + { + "epoch": 2.1227579188398424, + "ewc_loss": 0.08027835935354233, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00043657264905050397, + "grad_norm": 9.412078857421875, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8642165064811707, + "num_tokens": 636559652.0, + "step": 16687 + }, + { + "epoch": 2.122885129118433, + "ewc_loss": 0.0800594910979271, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0004343839827924967, + "grad_norm": 9.428857803344727, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8850399255752563, + "num_tokens": 636597900.0, + "step": 16688 + }, + { + "epoch": 2.1230123393970235, + "ewc_loss": 0.07912035286426544, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00042499255505390465, + "grad_norm": 9.229852676391602, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8658138513565063, + "num_tokens": 636635822.0, + "step": 16689 + }, + { + "epoch": 2.1231395496756136, + "ewc_loss": 0.07746769487857819, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00040846600313670933, + "grad_norm": 9.095662117004395, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8551731705665588, + "num_tokens": 636667275.0, + "step": 16690 + }, + { + "epoch": 2.123266759954204, + "ewc_loss": 0.077759750187397, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00041138657252304256, + "grad_norm": 9.139862060546875, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8671908974647522, + "num_tokens": 636704723.0, + "step": 16691 + }, + { + "epoch": 2.1233939702327946, + "ewc_loss": 0.07640233635902405, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00039781242958270013, + "grad_norm": 8.93216323852539, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8703294396400452, + "num_tokens": 636748209.0, + "step": 16692 + }, + { + "epoch": 2.123521180511385, + "ewc_loss": 0.07648008316755295, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00039858988020569086, + "grad_norm": 8.98544692993164, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8695380687713623, + "num_tokens": 636785052.0, + "step": 16693 + }, + { + "epoch": 2.1236483907899757, + "ewc_loss": 0.0756421685218811, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00039021080010570586, + "grad_norm": 8.854931831359863, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8773543834686279, + "num_tokens": 636816073.0, + "step": 16694 + }, + { + "epoch": 2.123775601068566, + "ewc_loss": 0.07560562342405319, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003898453142028302, + "grad_norm": 14.356751441955566, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8714287281036377, + "num_tokens": 636848963.0, + "step": 16695 + }, + { + "epoch": 2.1239028113471567, + "ewc_loss": 0.0841168537735939, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004725161998067051, + "grad_norm": 9.714695930480957, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.877776026725769, + "num_tokens": 636880632.0, + "step": 16696 + }, + { + "epoch": 2.1240300216257473, + "ewc_loss": 0.07442611455917358, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003780501720029861, + "grad_norm": 8.67530632019043, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.853001594543457, + "num_tokens": 636923597.0, + "step": 16697 + }, + { + "epoch": 2.124157231904338, + "ewc_loss": 0.07674499601125717, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000398797623347491, + "grad_norm": 9.044032096862793, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8600625991821289, + "num_tokens": 636962250.0, + "step": 16698 + }, + { + "epoch": 2.1242844421829283, + "ewc_loss": 0.07602716982364655, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038917799247428775, + "grad_norm": 8.765755653381348, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8737539052963257, + "num_tokens": 636997973.0, + "step": 16699 + }, + { + "epoch": 2.124411652461519, + "ewc_loss": 0.07549211382865906, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003887102357111871, + "grad_norm": 8.878281593322754, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8538902997970581, + "num_tokens": 637037320.0, + "step": 16700 + }, + { + "epoch": 2.1245388627401094, + "ewc_loss": 0.07502798736095428, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038406893145293, + "grad_norm": 8.741366386413574, + "learning_rate": 1e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8424457907676697, + "num_tokens": 637072267.0, + "step": 16701 + }, + { + "epoch": 2.1246660730187, + "ewc_loss": 0.07507096230983734, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038449864950962365, + "grad_norm": 8.728607177734375, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8806976079940796, + "num_tokens": 637108782.0, + "step": 16702 + }, + { + "epoch": 2.1247932832972904, + "ewc_loss": 0.07503180205821991, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038410708657465875, + "grad_norm": 8.791339874267578, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8642005920410156, + "num_tokens": 637147368.0, + "step": 16703 + }, + { + "epoch": 2.124920493575881, + "ewc_loss": 0.0743531584739685, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003773205971810967, + "grad_norm": 8.657083511352539, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8738700747489929, + "num_tokens": 637181909.0, + "step": 16704 + }, + { + "epoch": 2.1250477038544715, + "ewc_loss": 0.07487282156944275, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038251731893979013, + "grad_norm": 8.750091552734375, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8634383678436279, + "num_tokens": 637218498.0, + "step": 16705 + }, + { + "epoch": 2.125174914133062, + "ewc_loss": 0.07441885769367218, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037797761615365744, + "grad_norm": 8.72947883605957, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8659114837646484, + "num_tokens": 637254825.0, + "step": 16706 + }, + { + "epoch": 2.1253021244116526, + "ewc_loss": 0.07439543306827545, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003777433594223112, + "grad_norm": 8.643549919128418, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8553723096847534, + "num_tokens": 637301593.0, + "step": 16707 + }, + { + "epoch": 2.125429334690243, + "ewc_loss": 0.07448214292526245, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037861044984310865, + "grad_norm": 8.60368537902832, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8773381114006042, + "num_tokens": 637341676.0, + "step": 16708 + }, + { + "epoch": 2.1255565449688336, + "ewc_loss": 0.07449838519096375, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003787728783208877, + "grad_norm": 8.780173301696777, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8695014119148254, + "num_tokens": 637378840.0, + "step": 16709 + }, + { + "epoch": 2.125683755247424, + "ewc_loss": 0.07419827580451965, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003757717786356807, + "grad_norm": 8.580475807189941, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8468563556671143, + "num_tokens": 637412711.0, + "step": 16710 + }, + { + "epoch": 2.1258109655260147, + "ewc_loss": 0.07459568232297897, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003797458775807172, + "grad_norm": 14.271842002868652, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8629430532455444, + "num_tokens": 637448287.0, + "step": 16711 + }, + { + "epoch": 2.125938175804605, + "ewc_loss": 0.08277401328086853, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004590877506416291, + "grad_norm": 9.496088981628418, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8635991811752319, + "num_tokens": 637487899.0, + "step": 16712 + }, + { + "epoch": 2.1260653860831957, + "ewc_loss": 0.07548308372497559, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003886198974214494, + "grad_norm": 8.795419692993164, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8697091341018677, + "num_tokens": 637528161.0, + "step": 16713 + }, + { + "epoch": 2.1261925963617863, + "ewc_loss": 0.07558213174343109, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000389610358979553, + "grad_norm": 8.859051704406738, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8525592088699341, + "num_tokens": 637565692.0, + "step": 16714 + }, + { + "epoch": 2.1263198066403763, + "ewc_loss": 0.07616987079381943, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00039548776112496853, + "grad_norm": 8.906781196594238, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8527110815048218, + "num_tokens": 637600368.0, + "step": 16715 + }, + { + "epoch": 2.126447016918967, + "ewc_loss": 0.0751802921295166, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003855919640045613, + "grad_norm": 8.794363975524902, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8616840839385986, + "num_tokens": 637635732.0, + "step": 16716 + }, + { + "epoch": 2.1265742271975574, + "ewc_loss": 0.07518687844276428, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003856578259728849, + "grad_norm": 8.665727615356445, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8606733083724976, + "num_tokens": 637677127.0, + "step": 16717 + }, + { + "epoch": 2.126701437476148, + "ewc_loss": 0.07529712468385696, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003867603081744164, + "grad_norm": 8.824773788452148, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8644382953643799, + "num_tokens": 637715278.0, + "step": 16718 + }, + { + "epoch": 2.1268286477547385, + "ewc_loss": 0.07461005449295044, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037988959229551256, + "grad_norm": 8.704290390014648, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8689635992050171, + "num_tokens": 637742937.0, + "step": 16719 + }, + { + "epoch": 2.126955858033329, + "ewc_loss": 0.07478481531143188, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038163724821060896, + "grad_norm": 8.705753326416016, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8684264421463013, + "num_tokens": 637776761.0, + "step": 16720 + }, + { + "epoch": 2.1270830683119195, + "ewc_loss": 0.07455810904502869, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037937017623335123, + "grad_norm": 8.640691757202148, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8680585622787476, + "num_tokens": 637812197.0, + "step": 16721 + }, + { + "epoch": 2.12721027859051, + "ewc_loss": 0.07480283826589584, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037937602610327303, + "grad_norm": 8.620341300964355, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8703070878982544, + "num_tokens": 637850442.0, + "step": 16722 + }, + { + "epoch": 2.1273374888691006, + "ewc_loss": 0.07461704313755035, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037995949969626963, + "grad_norm": 8.684732437133789, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8664438724517822, + "num_tokens": 637889352.0, + "step": 16723 + }, + { + "epoch": 2.127464699147691, + "ewc_loss": 0.07435566186904907, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003773456846829504, + "grad_norm": 8.639408111572266, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8474866151809692, + "num_tokens": 637927195.0, + "step": 16724 + }, + { + "epoch": 2.1275919094262816, + "ewc_loss": 0.07445473968982697, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003783364372793585, + "grad_norm": 8.621630668640137, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8528489470481873, + "num_tokens": 637969431.0, + "step": 16725 + }, + { + "epoch": 2.127719119704872, + "ewc_loss": 0.07424971461296082, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003762862179428339, + "grad_norm": 8.562495231628418, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8817417025566101, + "num_tokens": 638013910.0, + "step": 16726 + }, + { + "epoch": 2.1278463299834627, + "ewc_loss": 0.07446710765361786, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003784601576626301, + "grad_norm": 8.646358489990234, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8538233637809753, + "num_tokens": 638051848.0, + "step": 16727 + }, + { + "epoch": 2.127973540262053, + "ewc_loss": 0.0747108981013298, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037601523217745125, + "grad_norm": 8.60145378112793, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8659771680831909, + "num_tokens": 638092192.0, + "step": 16728 + }, + { + "epoch": 2.1281007505406437, + "ewc_loss": 0.07437294721603394, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037751850322820246, + "grad_norm": 8.58806037902832, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.875743567943573, + "num_tokens": 638134306.0, + "step": 16729 + }, + { + "epoch": 2.1282279608192343, + "ewc_loss": 0.07431218028068542, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037691081524826586, + "grad_norm": 8.635392189025879, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8509225845336914, + "num_tokens": 638177988.0, + "step": 16730 + }, + { + "epoch": 2.128355171097825, + "ewc_loss": 0.07425535470247269, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037634262116625905, + "grad_norm": 8.617104530334473, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8761005401611328, + "num_tokens": 638212237.0, + "step": 16731 + }, + { + "epoch": 2.1284823813764153, + "ewc_loss": 0.07442571967840195, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003780462429858744, + "grad_norm": 8.664332389831543, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8584845066070557, + "num_tokens": 638252492.0, + "step": 16732 + }, + { + "epoch": 2.128609591655006, + "ewc_loss": 0.07414760440587997, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003752651100512594, + "grad_norm": 8.596395492553711, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.864102840423584, + "num_tokens": 638294308.0, + "step": 16733 + }, + { + "epoch": 2.1287368019335964, + "ewc_loss": 0.07465267181396484, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037787441397085786, + "grad_norm": 8.69051742553711, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8692919611930847, + "num_tokens": 638330178.0, + "step": 16734 + }, + { + "epoch": 2.128864012212187, + "ewc_loss": 0.07400570809841156, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003738461236935109, + "grad_norm": 8.534784317016602, + "learning_rate": 1e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8459540605545044, + "num_tokens": 638372731.0, + "step": 16735 + }, + { + "epoch": 2.1289912224907774, + "ewc_loss": 0.07475686818361282, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003789163311012089, + "grad_norm": 8.692246437072754, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8618731498718262, + "num_tokens": 638407522.0, + "step": 16736 + }, + { + "epoch": 2.129118432769368, + "ewc_loss": 0.07441788911819458, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003730851167347282, + "grad_norm": 8.482994079589844, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8672045469284058, + "num_tokens": 638443262.0, + "step": 16737 + }, + { + "epoch": 2.129245643047958, + "ewc_loss": 0.07470112293958664, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038080030935816467, + "grad_norm": 8.780782699584961, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8655123710632324, + "num_tokens": 638482729.0, + "step": 16738 + }, + { + "epoch": 2.129372853326549, + "ewc_loss": 0.07375844568014145, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003713735204655677, + "grad_norm": 8.450262069702148, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8726376295089722, + "num_tokens": 638520034.0, + "step": 16739 + }, + { + "epoch": 2.129500063605139, + "ewc_loss": 0.07557347416877747, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000384641025448218, + "grad_norm": 8.861454010009766, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8729543685913086, + "num_tokens": 638558631.0, + "step": 16740 + }, + { + "epoch": 2.1296272738837296, + "ewc_loss": 0.0736265480518341, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037005459307692945, + "grad_norm": 8.40494155883789, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8767588138580322, + "num_tokens": 638593565.0, + "step": 16741 + }, + { + "epoch": 2.12975448416232, + "ewc_loss": 0.07581828534603119, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003870890650432557, + "grad_norm": 8.96086597442627, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8685746192932129, + "num_tokens": 638625952.0, + "step": 16742 + }, + { + "epoch": 2.1298816944409107, + "ewc_loss": 0.0735405758023262, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003691948077175766, + "grad_norm": 8.389303207397461, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8623316287994385, + "num_tokens": 638662000.0, + "step": 16743 + }, + { + "epoch": 2.130008904719501, + "ewc_loss": 0.07610808312892914, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003899870498571545, + "grad_norm": 8.82902717590332, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8513604998588562, + "num_tokens": 638706020.0, + "step": 16744 + }, + { + "epoch": 2.1301361149980917, + "ewc_loss": 0.07431386411190033, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037204488762654364, + "grad_norm": 8.470684051513672, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8710926175117493, + "num_tokens": 638745354.0, + "step": 16745 + }, + { + "epoch": 2.1302633252766823, + "ewc_loss": 0.07574649155139923, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003863711899612099, + "grad_norm": 8.779314041137695, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8595350980758667, + "num_tokens": 638785497.0, + "step": 16746 + }, + { + "epoch": 2.130390535555273, + "ewc_loss": 0.0746389776468277, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037529607652686536, + "grad_norm": 8.589468955993652, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8450208306312561, + "num_tokens": 638821650.0, + "step": 16747 + }, + { + "epoch": 2.1305177458338633, + "ewc_loss": 0.07545208930969238, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000383427191991359, + "grad_norm": 8.690699577331543, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8511970639228821, + "num_tokens": 638855589.0, + "step": 16748 + }, + { + "epoch": 2.130644956112454, + "ewc_loss": 0.07481426000595093, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003770488838199526, + "grad_norm": 8.554849624633789, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8638432025909424, + "num_tokens": 638895766.0, + "step": 16749 + }, + { + "epoch": 2.1307721663910444, + "ewc_loss": 0.0752561092376709, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000381467369152233, + "grad_norm": 8.661041259765625, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8551547527313232, + "num_tokens": 638932622.0, + "step": 16750 + }, + { + "epoch": 2.130899376669635, + "ewc_loss": 0.07492345571517944, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003781407722271979, + "grad_norm": 8.56260871887207, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8543407320976257, + "num_tokens": 638971588.0, + "step": 16751 + }, + { + "epoch": 2.1310265869482254, + "ewc_loss": 0.07512617111206055, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038016799953766167, + "grad_norm": 8.604740142822266, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8694220781326294, + "num_tokens": 639009921.0, + "step": 16752 + }, + { + "epoch": 2.131153797226816, + "ewc_loss": 0.07503852993249893, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003792915667872876, + "grad_norm": 8.630728721618652, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8654033541679382, + "num_tokens": 639048160.0, + "step": 16753 + }, + { + "epoch": 2.1312810075054065, + "ewc_loss": 0.07499488443136215, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037885509664192796, + "grad_norm": 8.677409172058105, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8764029741287231, + "num_tokens": 639080260.0, + "step": 16754 + }, + { + "epoch": 2.131408217783997, + "ewc_loss": 0.07426583766937256, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003764474531635642, + "grad_norm": 8.635262489318848, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8565413355827332, + "num_tokens": 639118758.0, + "step": 16755 + }, + { + "epoch": 2.1315354280625876, + "ewc_loss": 0.07504082471132278, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003793145006056875, + "grad_norm": 8.600274085998535, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.864666223526001, + "num_tokens": 639159528.0, + "step": 16756 + }, + { + "epoch": 2.131662638341178, + "ewc_loss": 0.07493001967668533, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037820645957253873, + "grad_norm": 8.629877090454102, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8682864308357239, + "num_tokens": 639200394.0, + "step": 16757 + }, + { + "epoch": 2.1317898486197686, + "ewc_loss": 0.07476699352264404, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003765762085095048, + "grad_norm": 8.557964324951172, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8569315671920776, + "num_tokens": 639240938.0, + "step": 16758 + }, + { + "epoch": 2.131917058898359, + "ewc_loss": 0.07506661117076874, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003795723896473646, + "grad_norm": 8.630400657653809, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8592500686645508, + "num_tokens": 639278097.0, + "step": 16759 + }, + { + "epoch": 2.1320442691769497, + "ewc_loss": 0.07480678707361221, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037697411607950926, + "grad_norm": 8.54965877532959, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8660567998886108, + "num_tokens": 639324512.0, + "step": 16760 + }, + { + "epoch": 2.13217147945554, + "ewc_loss": 0.07500386238098145, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037894491106271744, + "grad_norm": 8.615039825439453, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8696367740631104, + "num_tokens": 639366833.0, + "step": 16761 + }, + { + "epoch": 2.1322986897341307, + "ewc_loss": 0.07475925981998444, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003764988505281508, + "grad_norm": 8.718778610229492, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8500654101371765, + "num_tokens": 639405765.0, + "step": 16762 + }, + { + "epoch": 2.132425900012721, + "ewc_loss": 0.0746275931596756, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037518219323828816, + "grad_norm": 8.564105987548828, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.861135721206665, + "num_tokens": 639439622.0, + "step": 16763 + }, + { + "epoch": 2.1325531102913113, + "ewc_loss": 0.07495154440402985, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003784216823987663, + "grad_norm": 8.616114616394043, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8750362396240234, + "num_tokens": 639479943.0, + "step": 16764 + }, + { + "epoch": 2.132680320569902, + "ewc_loss": 0.07419614493846893, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003757505619432777, + "grad_norm": 8.567039489746094, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8586282134056091, + "num_tokens": 639520487.0, + "step": 16765 + }, + { + "epoch": 2.1328075308484924, + "ewc_loss": 0.0745154321193695, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003789433394558728, + "grad_norm": 8.62972354888916, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8634458780288696, + "num_tokens": 639558997.0, + "step": 16766 + }, + { + "epoch": 2.132934741127083, + "ewc_loss": 0.0742223858833313, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037601296207867563, + "grad_norm": 8.561908721923828, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8633480668067932, + "num_tokens": 639594569.0, + "step": 16767 + }, + { + "epoch": 2.1330619514056735, + "ewc_loss": 0.07453249394893646, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003791140334215015, + "grad_norm": 8.622357368469238, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8785462975502014, + "num_tokens": 639629827.0, + "step": 16768 + }, + { + "epoch": 2.133189161684264, + "ewc_loss": 0.07427874207496643, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037657649954780936, + "grad_norm": 8.646772384643555, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8575412034988403, + "num_tokens": 639667634.0, + "step": 16769 + }, + { + "epoch": 2.1333163719628545, + "ewc_loss": 0.07439659535884857, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003777550591621548, + "grad_norm": 8.63964557647705, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.861812174320221, + "num_tokens": 639700536.0, + "step": 16770 + }, + { + "epoch": 2.133443582241445, + "ewc_loss": 0.0743868425488472, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003776575031224638, + "grad_norm": 8.611369132995605, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8672303557395935, + "num_tokens": 639744006.0, + "step": 16771 + }, + { + "epoch": 2.1335707925200356, + "ewc_loss": 0.07444301247596741, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037821923615410924, + "grad_norm": 8.584280967712402, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8713796138763428, + "num_tokens": 639784950.0, + "step": 16772 + }, + { + "epoch": 2.133698002798626, + "ewc_loss": 0.07448157668113708, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003786048328038305, + "grad_norm": 8.613157272338867, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8792359828948975, + "num_tokens": 639823567.0, + "step": 16773 + }, + { + "epoch": 2.1338252130772166, + "ewc_loss": 0.07475097477436066, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003764160501305014, + "grad_norm": 8.558446884155273, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8746660351753235, + "num_tokens": 639863386.0, + "step": 16774 + }, + { + "epoch": 2.133952423355807, + "ewc_loss": 0.07452397048473358, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037902873009443283, + "grad_norm": 8.595531463623047, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8539576530456543, + "num_tokens": 639901219.0, + "step": 16775 + }, + { + "epoch": 2.1340796336343977, + "ewc_loss": 0.07442113757133484, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037800049176439643, + "grad_norm": 8.646232604980469, + "learning_rate": 1e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8492095470428467, + "num_tokens": 639935240.0, + "step": 16776 + }, + { + "epoch": 2.134206843912988, + "ewc_loss": 0.07441940158605576, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003779830876737833, + "grad_norm": 8.624377250671387, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8561269044876099, + "num_tokens": 639975370.0, + "step": 16777 + }, + { + "epoch": 2.1343340541915787, + "ewc_loss": 0.07499216496944427, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037882785545662045, + "grad_norm": 8.537558555603027, + "learning_rate": 1e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8450514674186707, + "num_tokens": 640020463.0, + "step": 16778 + }, + { + "epoch": 2.1344612644701693, + "ewc_loss": 0.07509662210941315, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037987245013937354, + "grad_norm": 8.665637016296387, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.859798789024353, + "num_tokens": 640057142.0, + "step": 16779 + }, + { + "epoch": 2.13458847474876, + "ewc_loss": 0.07431827485561371, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037697184598073363, + "grad_norm": 8.564419746398926, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8611769676208496, + "num_tokens": 640098879.0, + "step": 16780 + }, + { + "epoch": 2.1347156850273503, + "ewc_loss": 0.0747358575463295, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038114763447083533, + "grad_norm": 8.701966285705566, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8679021596908569, + "num_tokens": 640136077.0, + "step": 16781 + }, + { + "epoch": 2.134842895305941, + "ewc_loss": 0.07437346130609512, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003750822797883302, + "grad_norm": 8.553412437438965, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8499230146408081, + "num_tokens": 640175160.0, + "step": 16782 + }, + { + "epoch": 2.1349701055845314, + "ewc_loss": 0.07466588914394379, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003804480074904859, + "grad_norm": 8.713930130004883, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8893246650695801, + "num_tokens": 640216670.0, + "step": 16783 + }, + { + "epoch": 2.135097315863122, + "ewc_loss": 0.07443901896476746, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003732964105438441, + "grad_norm": 8.54694652557373, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8610880374908447, + "num_tokens": 640254872.0, + "step": 16784 + }, + { + "epoch": 2.1352245261417124, + "ewc_loss": 0.07468369603157043, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003806260065175593, + "grad_norm": 8.720473289489746, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8619277477264404, + "num_tokens": 640289319.0, + "step": 16785 + }, + { + "epoch": 2.135351736420303, + "ewc_loss": 0.07406827807426453, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003744718269445002, + "grad_norm": 8.600274085998535, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8777709007263184, + "num_tokens": 640323897.0, + "step": 16786 + }, + { + "epoch": 2.1354789466988935, + "ewc_loss": 0.07445217669010162, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003783108841162175, + "grad_norm": 8.640244483947754, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8715027570724487, + "num_tokens": 640366203.0, + "step": 16787 + }, + { + "epoch": 2.1356061569774836, + "ewc_loss": 0.07413224875926971, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003751115291379392, + "grad_norm": 8.519810676574707, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8747691512107849, + "num_tokens": 640408089.0, + "step": 16788 + }, + { + "epoch": 2.135733367256074, + "ewc_loss": 0.07443621754646301, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037815127870999277, + "grad_norm": 8.655673027038574, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8741479516029358, + "num_tokens": 640443532.0, + "step": 16789 + }, + { + "epoch": 2.1358605775346646, + "ewc_loss": 0.07426974922418594, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003764865396078676, + "grad_norm": 8.585402488708496, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8578321933746338, + "num_tokens": 640480425.0, + "step": 16790 + }, + { + "epoch": 2.135987787813255, + "ewc_loss": 0.0747077614068985, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000378425233066082, + "grad_norm": 8.582183837890625, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8687857389450073, + "num_tokens": 640525658.0, + "step": 16791 + }, + { + "epoch": 2.1361149980918457, + "ewc_loss": 0.07432268559932709, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037701588007621467, + "grad_norm": 8.622634887695312, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8738757371902466, + "num_tokens": 640563276.0, + "step": 16792 + }, + { + "epoch": 2.136242208370436, + "ewc_loss": 0.07434660941362381, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037725517177022994, + "grad_norm": 8.62099838256836, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8565828800201416, + "num_tokens": 640608150.0, + "step": 16793 + }, + { + "epoch": 2.1363694186490267, + "ewc_loss": 0.07436501979827881, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003774392534978688, + "grad_norm": 8.65233325958252, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8662052750587463, + "num_tokens": 640646127.0, + "step": 16794 + }, + { + "epoch": 2.1364966289276173, + "ewc_loss": 0.07446539402008057, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003760016115847975, + "grad_norm": 8.633893013000488, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.871161162853241, + "num_tokens": 640683195.0, + "step": 16795 + }, + { + "epoch": 2.136623839206208, + "ewc_loss": 0.07445786893367767, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037592637818306684, + "grad_norm": 8.613754272460938, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8789336681365967, + "num_tokens": 640724102.0, + "step": 16796 + }, + { + "epoch": 2.1367510494847983, + "ewc_loss": 0.07433348894119263, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037712400080636144, + "grad_norm": 8.580309867858887, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8612288236618042, + "num_tokens": 640762624.0, + "step": 16797 + }, + { + "epoch": 2.136878259763389, + "ewc_loss": 0.07434792816638947, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037726835580542684, + "grad_norm": 8.639823913574219, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.859264612197876, + "num_tokens": 640799437.0, + "step": 16798 + }, + { + "epoch": 2.1370054700419794, + "ewc_loss": 0.07418341934680939, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003756232326850295, + "grad_norm": 8.548981666564941, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.870622992515564, + "num_tokens": 640837115.0, + "step": 16799 + }, + { + "epoch": 2.13713268032057, + "ewc_loss": 0.07445459812879562, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003783350402954966, + "grad_norm": 8.587973594665527, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8669747114181519, + "num_tokens": 640882312.0, + "step": 16800 + }, + { + "epoch": 2.1372598905991604, + "ewc_loss": 0.07430459558963776, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037683502887375653, + "grad_norm": 8.577506065368652, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8644903898239136, + "num_tokens": 640921032.0, + "step": 16801 + }, + { + "epoch": 2.137387100877751, + "ewc_loss": 0.07448442280292511, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037863326724618673, + "grad_norm": 8.628525733947754, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8670114278793335, + "num_tokens": 640957424.0, + "step": 16802 + }, + { + "epoch": 2.1375143111563415, + "ewc_loss": 0.07428260147571564, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037661512033082545, + "grad_norm": 8.615878105163574, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8567618131637573, + "num_tokens": 640992985.0, + "step": 16803 + }, + { + "epoch": 2.137641521434932, + "ewc_loss": 0.07438004016876221, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000377589458366856, + "grad_norm": 8.538229942321777, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.87504643201828, + "num_tokens": 641030379.0, + "step": 16804 + }, + { + "epoch": 2.1377687317135226, + "ewc_loss": 0.07446330785751343, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003784220898523927, + "grad_norm": 8.64266586303711, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8560395240783691, + "num_tokens": 641068526.0, + "step": 16805 + }, + { + "epoch": 2.137895941992113, + "ewc_loss": 0.0743003636598587, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000376792682800442, + "grad_norm": 8.638304710388184, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8566266298294067, + "num_tokens": 641103351.0, + "step": 16806 + }, + { + "epoch": 2.1380231522707036, + "ewc_loss": 0.07435102760791779, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003772993222810328, + "grad_norm": 8.581149101257324, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8671132326126099, + "num_tokens": 641143570.0, + "step": 16807 + }, + { + "epoch": 2.138150362549294, + "ewc_loss": 0.07477926462888718, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037914031418040395, + "grad_norm": 8.629962921142578, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8686770796775818, + "num_tokens": 641180625.0, + "step": 16808 + }, + { + "epoch": 2.1382775728278847, + "ewc_loss": 0.07446109503507614, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037595859612338245, + "grad_norm": 8.563039779663086, + "learning_rate": 1e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8433052897453308, + "num_tokens": 641226770.0, + "step": 16809 + }, + { + "epoch": 2.138404783106475, + "ewc_loss": 0.07492989301681519, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003782051499001682, + "grad_norm": 8.621479034423828, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8765836954116821, + "num_tokens": 641269682.0, + "step": 16810 + }, + { + "epoch": 2.1385319933850653, + "ewc_loss": 0.07430388033390045, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037682786933146417, + "grad_norm": 8.66512680053711, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8772226572036743, + "num_tokens": 641303803.0, + "step": 16811 + }, + { + "epoch": 2.1386592036636562, + "ewc_loss": 0.07475939393043518, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037650021840818226, + "grad_norm": 8.579127311706543, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8589582443237305, + "num_tokens": 641343141.0, + "step": 16812 + }, + { + "epoch": 2.1387864139422463, + "ewc_loss": 0.07482728362083435, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003796204982791096, + "grad_norm": 8.655084609985352, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8693557977676392, + "num_tokens": 641384006.0, + "step": 16813 + }, + { + "epoch": 2.138913624220837, + "ewc_loss": 0.07470803707838058, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003759866231121123, + "grad_norm": 8.593655586242676, + "learning_rate": 1e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8437421321868896, + "num_tokens": 641423998.0, + "step": 16814 + }, + { + "epoch": 2.1390408344994274, + "ewc_loss": 0.07508938759565353, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037980012712068856, + "grad_norm": 8.713406562805176, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8641548156738281, + "num_tokens": 641457886.0, + "step": 16815 + }, + { + "epoch": 2.139168044778018, + "ewc_loss": 0.07458910346031189, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037479729508049786, + "grad_norm": 8.529233932495117, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8655505180358887, + "num_tokens": 641503386.0, + "step": 16816 + }, + { + "epoch": 2.1392952550566084, + "ewc_loss": 0.07520116865634918, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038091797614470124, + "grad_norm": 8.68255615234375, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8746212720870972, + "num_tokens": 641539533.0, + "step": 16817 + }, + { + "epoch": 2.139422465335199, + "ewc_loss": 0.07460695505142212, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037497581797651947, + "grad_norm": 8.532781600952148, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8595582246780396, + "num_tokens": 641574020.0, + "step": 16818 + }, + { + "epoch": 2.1395496756137895, + "ewc_loss": 0.07523500919342041, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003812563663814217, + "grad_norm": 8.765512466430664, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8692291975021362, + "num_tokens": 641605206.0, + "step": 16819 + }, + { + "epoch": 2.13967688589238, + "ewc_loss": 0.07395027577877045, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037329187034629285, + "grad_norm": 8.509723663330078, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8542454242706299, + "num_tokens": 641649857.0, + "step": 16820 + }, + { + "epoch": 2.1398040961709706, + "ewc_loss": 0.07511086016893387, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003824562591034919, + "grad_norm": 8.709261894226074, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.858972430229187, + "num_tokens": 641683360.0, + "step": 16821 + }, + { + "epoch": 2.139931306449561, + "ewc_loss": 0.07429812848567963, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003743289562407881, + "grad_norm": 8.61516284942627, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8647230863571167, + "num_tokens": 641720105.0, + "step": 16822 + }, + { + "epoch": 2.1400585167281516, + "ewc_loss": 0.07469504326581955, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038073951145634055, + "grad_norm": 8.686959266662598, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8671613335609436, + "num_tokens": 641750718.0, + "step": 16823 + }, + { + "epoch": 2.140185727006742, + "ewc_loss": 0.07467673718929291, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003756736114155501, + "grad_norm": 8.525336265563965, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8628318905830383, + "num_tokens": 641791620.0, + "step": 16824 + }, + { + "epoch": 2.1403129372853327, + "ewc_loss": 0.07495811581611633, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038092880276963115, + "grad_norm": 8.740194320678711, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8531572818756104, + "num_tokens": 641833566.0, + "step": 16825 + }, + { + "epoch": 2.140440147563923, + "ewc_loss": 0.07442211359739304, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037556877941824496, + "grad_norm": 8.542150497436523, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8715545535087585, + "num_tokens": 641869080.0, + "step": 16826 + }, + { + "epoch": 2.1405673578425137, + "ewc_loss": 0.07521792501211166, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003810854977928102, + "grad_norm": 8.681842803955078, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8616465330123901, + "num_tokens": 641910195.0, + "step": 16827 + }, + { + "epoch": 2.1406945681211043, + "ewc_loss": 0.07465299963951111, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037543621147051454, + "grad_norm": 8.604567527770996, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.851159930229187, + "num_tokens": 641947270.0, + "step": 16828 + }, + { + "epoch": 2.140821778399695, + "ewc_loss": 0.07513481378555298, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003802544088102877, + "grad_norm": 8.642269134521484, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8660544157028198, + "num_tokens": 641986472.0, + "step": 16829 + }, + { + "epoch": 2.1409489886782853, + "ewc_loss": 0.0749681293964386, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037858757423236966, + "grad_norm": 8.608502388000488, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.852196216583252, + "num_tokens": 642028130.0, + "step": 16830 + }, + { + "epoch": 2.141076198956876, + "ewc_loss": 0.0750635415315628, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003795416560024023, + "grad_norm": 8.637425422668457, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8598604202270508, + "num_tokens": 642067097.0, + "step": 16831 + }, + { + "epoch": 2.1412034092354664, + "ewc_loss": 0.0749754011631012, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037866021739318967, + "grad_norm": 8.57687759399414, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.876410961151123, + "num_tokens": 642099628.0, + "step": 16832 + }, + { + "epoch": 2.141330619514057, + "ewc_loss": 0.0752510279417038, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003814165247604251, + "grad_norm": 8.664565086364746, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8872140645980835, + "num_tokens": 642135941.0, + "step": 16833 + }, + { + "epoch": 2.1414578297926474, + "ewc_loss": 0.07464408874511719, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037778852856718004, + "grad_norm": 8.576618194580078, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8573736548423767, + "num_tokens": 642169465.0, + "step": 16834 + }, + { + "epoch": 2.141585040071238, + "ewc_loss": 0.07511451840400696, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000380051409592852, + "grad_norm": 8.63853645324707, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8669878244400024, + "num_tokens": 642208961.0, + "step": 16835 + }, + { + "epoch": 2.141712250349828, + "ewc_loss": 0.0746731162071228, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003780788683798164, + "grad_norm": 8.5936918258667, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8626948595046997, + "num_tokens": 642249691.0, + "step": 16836 + }, + { + "epoch": 2.141839460628419, + "ewc_loss": 0.07492224127054214, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038057006895542145, + "grad_norm": 8.657108306884766, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8662341237068176, + "num_tokens": 642292425.0, + "step": 16837 + }, + { + "epoch": 2.141966670907009, + "ewc_loss": 0.07458586245775223, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003772062773350626, + "grad_norm": 8.561731338500977, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8754258155822754, + "num_tokens": 642330982.0, + "step": 16838 + }, + { + "epoch": 2.1420938811855996, + "ewc_loss": 0.07528051733970642, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003817114047706127, + "grad_norm": 8.629570007324219, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8640239238739014, + "num_tokens": 642365414.0, + "step": 16839 + }, + { + "epoch": 2.14222109146419, + "ewc_loss": 0.07497796416282654, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003786858869716525, + "grad_norm": 8.609113693237305, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.858303427696228, + "num_tokens": 642398660.0, + "step": 16840 + }, + { + "epoch": 2.1423483017427807, + "ewc_loss": 0.0751587301492691, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038049358408898115, + "grad_norm": 8.703925132751465, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8715760707855225, + "num_tokens": 642432605.0, + "step": 16841 + }, + { + "epoch": 2.142475512021371, + "ewc_loss": 0.07494980096817017, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037840422010049224, + "grad_norm": 8.900636672973633, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8747979998588562, + "num_tokens": 642472041.0, + "step": 16842 + }, + { + "epoch": 2.1426027222999617, + "ewc_loss": 0.07468569278717041, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003757631639018655, + "grad_norm": 14.234045028686523, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8735058307647705, + "num_tokens": 642512399.0, + "step": 16843 + }, + { + "epoch": 2.1427299325785523, + "ewc_loss": 0.08142584562301636, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004431647539604455, + "grad_norm": 9.208736419677734, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8581951260566711, + "num_tokens": 642550829.0, + "step": 16844 + }, + { + "epoch": 2.142857142857143, + "ewc_loss": 0.0767488032579422, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039639428723603487, + "grad_norm": 9.112313270568848, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.863747775554657, + "num_tokens": 642591255.0, + "step": 16845 + }, + { + "epoch": 2.1429843531357333, + "ewc_loss": 0.07552926987409592, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038419896736741066, + "grad_norm": 8.81602954864502, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8662495613098145, + "num_tokens": 642625772.0, + "step": 16846 + }, + { + "epoch": 2.143111563414324, + "ewc_loss": 0.07707380503416061, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003996443119831383, + "grad_norm": 8.988100051879883, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8579285144805908, + "num_tokens": 642667596.0, + "step": 16847 + }, + { + "epoch": 2.1432387736929144, + "ewc_loss": 0.07530553638935089, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003819615812972188, + "grad_norm": 8.796483993530273, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8670857548713684, + "num_tokens": 642702159.0, + "step": 16848 + }, + { + "epoch": 2.143365983971505, + "ewc_loss": 0.0760398656129837, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038930494338274, + "grad_norm": 8.886802673339844, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8785207271575928, + "num_tokens": 642737307.0, + "step": 16849 + }, + { + "epoch": 2.1434931942500954, + "ewc_loss": 0.07524418830871582, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003813481016550213, + "grad_norm": 8.741954803466797, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8556528687477112, + "num_tokens": 642776660.0, + "step": 16850 + }, + { + "epoch": 2.143620404528686, + "ewc_loss": 0.07571884989738464, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003860947035718709, + "grad_norm": 8.8866605758667, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8586816787719727, + "num_tokens": 642814131.0, + "step": 16851 + }, + { + "epoch": 2.1437476148072765, + "ewc_loss": 0.07496323436498642, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000378538592485711, + "grad_norm": 8.648697853088379, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8459103107452393, + "num_tokens": 642854138.0, + "step": 16852 + }, + { + "epoch": 2.143874825085867, + "ewc_loss": 0.07563073933124542, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003852136433124542, + "grad_norm": 8.754121780395508, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8709914088249207, + "num_tokens": 642893767.0, + "step": 16853 + }, + { + "epoch": 2.1440020353644575, + "ewc_loss": 0.07491325587034225, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037803882150910795, + "grad_norm": 8.678671836853027, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8804142475128174, + "num_tokens": 642928286.0, + "step": 16854 + }, + { + "epoch": 2.144129245643048, + "ewc_loss": 0.0753532201051712, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003824384475592524, + "grad_norm": 8.766927719116211, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8729811310768127, + "num_tokens": 642971220.0, + "step": 16855 + }, + { + "epoch": 2.1442564559216386, + "ewc_loss": 0.07481130212545395, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037701925612054765, + "grad_norm": 8.64456844329834, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8692204356193542, + "num_tokens": 643013618.0, + "step": 16856 + }, + { + "epoch": 2.144383666200229, + "ewc_loss": 0.0752779170870781, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038168541505001485, + "grad_norm": 8.79218864440918, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8675122857093811, + "num_tokens": 643058122.0, + "step": 16857 + }, + { + "epoch": 2.1445108764788197, + "ewc_loss": 0.07467535883188248, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003756598453037441, + "grad_norm": 8.629313468933105, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8521935939788818, + "num_tokens": 643096662.0, + "step": 16858 + }, + { + "epoch": 2.14463808675741, + "ewc_loss": 0.07524271309375763, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003813334333244711, + "grad_norm": 8.761818885803223, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8788340091705322, + "num_tokens": 643137923.0, + "step": 16859 + }, + { + "epoch": 2.1447652970360007, + "ewc_loss": 0.07477837800979614, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037669006269425154, + "grad_norm": 8.616740226745605, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8717405200004578, + "num_tokens": 643176918.0, + "step": 16860 + }, + { + "epoch": 2.144892507314591, + "ewc_loss": 0.07470326870679855, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003808217588812113, + "grad_norm": 8.727785110473633, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8834252953529358, + "num_tokens": 643217525.0, + "step": 16861 + }, + { + "epoch": 2.1450197175931813, + "ewc_loss": 0.07432354241609573, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037702449481002986, + "grad_norm": 8.684517860412598, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8831535577774048, + "num_tokens": 643255131.0, + "step": 16862 + }, + { + "epoch": 2.145146927871772, + "ewc_loss": 0.07447992265224457, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003785882727243006, + "grad_norm": 8.712751388549805, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8740125894546509, + "num_tokens": 643292092.0, + "step": 16863 + }, + { + "epoch": 2.1452741381503624, + "ewc_loss": 0.07429631799459457, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003767522284761071, + "grad_norm": 8.688819885253906, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8605493307113647, + "num_tokens": 643330510.0, + "step": 16864 + }, + { + "epoch": 2.145401348428953, + "ewc_loss": 0.0743831992149353, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000377621065126732, + "grad_norm": 8.621118545532227, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8591687083244324, + "num_tokens": 643372900.0, + "step": 16865 + }, + { + "epoch": 2.1455285587075434, + "ewc_loss": 0.07472197711467743, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037856740527786314, + "grad_norm": 8.720992088317871, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8742852210998535, + "num_tokens": 643412866.0, + "step": 16866 + }, + { + "epoch": 2.145655768986134, + "ewc_loss": 0.0743425115942955, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003747727896552533, + "grad_norm": 8.613672256469727, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8484142422676086, + "num_tokens": 643452846.0, + "step": 16867 + }, + { + "epoch": 2.1457829792647245, + "ewc_loss": 0.07466927170753479, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003804817679338157, + "grad_norm": 8.66301155090332, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8717540502548218, + "num_tokens": 643488264.0, + "step": 16868 + }, + { + "epoch": 2.145910189543315, + "ewc_loss": 0.07435083389282227, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003772973723243922, + "grad_norm": 8.678191184997559, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8532443046569824, + "num_tokens": 643528131.0, + "step": 16869 + }, + { + "epoch": 2.1460373998219056, + "ewc_loss": 0.07472091913223267, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003785568114835769, + "grad_norm": 8.658480644226074, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8642532229423523, + "num_tokens": 643564041.0, + "step": 16870 + }, + { + "epoch": 2.146164610100496, + "ewc_loss": 0.07447216659784317, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003785107401199639, + "grad_norm": 8.716361999511719, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8528977632522583, + "num_tokens": 643601507.0, + "step": 16871 + }, + { + "epoch": 2.1462918203790866, + "ewc_loss": 0.07429080456495285, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037669710582122207, + "grad_norm": 8.603267669677734, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8502956032752991, + "num_tokens": 643647161.0, + "step": 16872 + }, + { + "epoch": 2.146419030657677, + "ewc_loss": 0.07467199862003326, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003805090382229537, + "grad_norm": 8.652412414550781, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8708197474479675, + "num_tokens": 643687114.0, + "step": 16873 + }, + { + "epoch": 2.1465462409362677, + "ewc_loss": 0.07442653179168701, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003780543338507414, + "grad_norm": 8.677858352661133, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8512678146362305, + "num_tokens": 643722110.0, + "step": 16874 + }, + { + "epoch": 2.146673451214858, + "ewc_loss": 0.07474266737699509, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003787743335124105, + "grad_norm": 8.715323448181152, + "learning_rate": 1e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8428303003311157, + "num_tokens": 643765770.0, + "step": 16875 + }, + { + "epoch": 2.1468006614934487, + "ewc_loss": 0.0744643434882164, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003784325090236962, + "grad_norm": 8.561761856079102, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8555184602737427, + "num_tokens": 643810097.0, + "step": 16876 + }, + { + "epoch": 2.1469278717720393, + "ewc_loss": 0.07484551519155502, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038224420859478414, + "grad_norm": 8.773123741149902, + "learning_rate": 1e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8380789160728455, + "num_tokens": 643847904.0, + "step": 16877 + }, + { + "epoch": 2.14705508205063, + "ewc_loss": 0.07461386919021606, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003774863143917173, + "grad_norm": 8.59051513671875, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8722230195999146, + "num_tokens": 643889157.0, + "step": 16878 + }, + { + "epoch": 2.1471822923292203, + "ewc_loss": 0.07543227076530457, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003832289658021182, + "grad_norm": 8.73617935180664, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8686593174934387, + "num_tokens": 643931181.0, + "step": 16879 + }, + { + "epoch": 2.147309502607811, + "ewc_loss": 0.07476944476366043, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037660071393474936, + "grad_norm": 8.655660629272461, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8639897108078003, + "num_tokens": 643965415.0, + "step": 16880 + }, + { + "epoch": 2.1474367128864014, + "ewc_loss": 0.0749368667602539, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003807162865996361, + "grad_norm": 8.686286926269531, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8499937653541565, + "num_tokens": 644010647.0, + "step": 16881 + }, + { + "epoch": 2.147563923164992, + "ewc_loss": 0.0746045857667923, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003773935022763908, + "grad_norm": 8.672012329101562, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.881969690322876, + "num_tokens": 644044347.0, + "step": 16882 + }, + { + "epoch": 2.1476911334435824, + "ewc_loss": 0.07483776658773422, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037972533027641475, + "grad_norm": 8.64306640625, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8731252551078796, + "num_tokens": 644083456.0, + "step": 16883 + }, + { + "epoch": 2.147818343722173, + "ewc_loss": 0.07457292079925537, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037951822741888463, + "grad_norm": 8.659549713134766, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8568921089172363, + "num_tokens": 644123851.0, + "step": 16884 + }, + { + "epoch": 2.1479455540007635, + "ewc_loss": 0.07461908459663391, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003799799014814198, + "grad_norm": 8.7073335647583, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8754197359085083, + "num_tokens": 644168880.0, + "step": 16885 + }, + { + "epoch": 2.1480727642793536, + "ewc_loss": 0.0745912492275238, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037970152334310114, + "grad_norm": 8.655000686645508, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8662551641464233, + "num_tokens": 644204532.0, + "step": 16886 + }, + { + "epoch": 2.148199974557944, + "ewc_loss": 0.07512539625167847, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003801601706072688, + "grad_norm": 8.659114837646484, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8842601776123047, + "num_tokens": 644246160.0, + "step": 16887 + }, + { + "epoch": 2.1483271848365346, + "ewc_loss": 0.07523795962333679, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038128584856167436, + "grad_norm": 8.713342666625977, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8779104948043823, + "num_tokens": 644284881.0, + "step": 16888 + }, + { + "epoch": 2.148454395115125, + "ewc_loss": 0.07500586658716202, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003789649053942412, + "grad_norm": 8.677751541137695, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8585857152938843, + "num_tokens": 644326661.0, + "step": 16889 + }, + { + "epoch": 2.1485816053937157, + "ewc_loss": 0.07484474778175354, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037979509215801954, + "grad_norm": 8.670866012573242, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8698008060455322, + "num_tokens": 644364420.0, + "step": 16890 + }, + { + "epoch": 2.148708815672306, + "ewc_loss": 0.07459371536970139, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037972620339132845, + "grad_norm": 8.667134284973145, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8487967252731323, + "num_tokens": 644398626.0, + "step": 16891 + }, + { + "epoch": 2.1488360259508967, + "ewc_loss": 0.07457160204648972, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037950510159134865, + "grad_norm": 8.714815139770508, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8652499914169312, + "num_tokens": 644434772.0, + "step": 16892 + }, + { + "epoch": 2.1489632362294873, + "ewc_loss": 0.0745067298412323, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003788563481066376, + "grad_norm": 8.648465156555176, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8622068166732788, + "num_tokens": 644474425.0, + "step": 16893 + }, + { + "epoch": 2.149090446508078, + "ewc_loss": 0.07499054074287415, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038125310675241053, + "grad_norm": 8.75313949584961, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8559553623199463, + "num_tokens": 644507796.0, + "step": 16894 + }, + { + "epoch": 2.1492176567866683, + "ewc_loss": 0.07463039457798004, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037765156594105065, + "grad_norm": 8.636130332946777, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8603999614715576, + "num_tokens": 644544172.0, + "step": 16895 + }, + { + "epoch": 2.149344867065259, + "ewc_loss": 0.07510380446910858, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038238565321080387, + "grad_norm": 14.40200424194336, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8637562990188599, + "num_tokens": 644582217.0, + "step": 16896 + }, + { + "epoch": 2.1494720773438494, + "ewc_loss": 0.08297310024499893, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004610786563716829, + "grad_norm": 9.502423286437988, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8713991641998291, + "num_tokens": 644620028.0, + "step": 16897 + }, + { + "epoch": 2.14959928762244, + "ewc_loss": 0.07632368803024292, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003945845819544047, + "grad_norm": 8.990781784057617, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8491385579109192, + "num_tokens": 644663743.0, + "step": 16898 + }, + { + "epoch": 2.1497264979010304, + "ewc_loss": 0.07571554183959961, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038850304554216564, + "grad_norm": 8.82834243774414, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8669022917747498, + "num_tokens": 644700914.0, + "step": 16899 + }, + { + "epoch": 2.149853708179621, + "ewc_loss": 0.07729002833366394, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004042479849886149, + "grad_norm": 9.053901672363281, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8606581091880798, + "num_tokens": 644744232.0, + "step": 16900 + }, + { + "epoch": 2.1499809184582115, + "ewc_loss": 0.07528898864984512, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003842375590465963, + "grad_norm": 8.82670783996582, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8576107621192932, + "num_tokens": 644780851.0, + "step": 16901 + }, + { + "epoch": 2.150108128736802, + "ewc_loss": 0.07573642581701279, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003911533276550472, + "grad_norm": 8.894259452819824, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8723806142807007, + "num_tokens": 644822103.0, + "step": 16902 + }, + { + "epoch": 2.1502353390153925, + "ewc_loss": 0.07506437599658966, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003844328166451305, + "grad_norm": 8.768478393554688, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8625092506408691, + "num_tokens": 644861209.0, + "step": 16903 + }, + { + "epoch": 2.150362549293983, + "ewc_loss": 0.07579874247312546, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003893350949510932, + "grad_norm": 8.867719650268555, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8517609238624573, + "num_tokens": 644899860.0, + "step": 16904 + }, + { + "epoch": 2.1504897595725736, + "ewc_loss": 0.07478173077106476, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003816063981503248, + "grad_norm": 8.704853057861328, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8596150875091553, + "num_tokens": 644943101.0, + "step": 16905 + }, + { + "epoch": 2.150616969851164, + "ewc_loss": 0.07567372918128967, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038808491080999374, + "grad_norm": 8.90738582611084, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8533161282539368, + "num_tokens": 644981786.0, + "step": 16906 + }, + { + "epoch": 2.1507441801297547, + "ewc_loss": 0.07464934140443802, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.000380282464902848, + "grad_norm": 8.686358451843262, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8669999241828918, + "num_tokens": 645017949.0, + "step": 16907 + }, + { + "epoch": 2.150871390408345, + "ewc_loss": 0.07532580196857452, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038704712642356753, + "grad_norm": 8.870281219482422, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8637123107910156, + "num_tokens": 645053443.0, + "step": 16908 + }, + { + "epoch": 2.1509986006869353, + "ewc_loss": 0.07444954663515091, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003782845160458237, + "grad_norm": 8.670470237731934, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8661942481994629, + "num_tokens": 645089916.0, + "step": 16909 + }, + { + "epoch": 2.1511258109655262, + "ewc_loss": 0.07525790482759476, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003863681049551815, + "grad_norm": 8.882086753845215, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8488699793815613, + "num_tokens": 645134531.0, + "step": 16910 + }, + { + "epoch": 2.1512530212441163, + "ewc_loss": 0.07428570091724396, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003766460868064314, + "grad_norm": 8.637802124023438, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8593021631240845, + "num_tokens": 645172608.0, + "step": 16911 + }, + { + "epoch": 2.151380231522707, + "ewc_loss": 0.07519493252038956, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038573838537558913, + "grad_norm": 8.830415725708008, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8605238795280457, + "num_tokens": 645209895.0, + "step": 16912 + }, + { + "epoch": 2.1515074418012974, + "ewc_loss": 0.07458396255970001, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003771872725337744, + "grad_norm": 8.650055885314941, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8706686496734619, + "num_tokens": 645245259.0, + "step": 16913 + }, + { + "epoch": 2.151634652079888, + "ewc_loss": 0.07494531571865082, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038324217894114554, + "grad_norm": 8.766424179077148, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8610471487045288, + "num_tokens": 645284962.0, + "step": 16914 + }, + { + "epoch": 2.1517618623584784, + "ewc_loss": 0.0747358649969101, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003787063469644636, + "grad_norm": 8.652154922485352, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8783779144287109, + "num_tokens": 645322352.0, + "step": 16915 + }, + { + "epoch": 2.151889072637069, + "ewc_loss": 0.07517722249031067, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003831199137493968, + "grad_norm": 8.70117473602295, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8639518022537231, + "num_tokens": 645358401.0, + "step": 16916 + }, + { + "epoch": 2.1520162829156595, + "ewc_loss": 0.0746503621339798, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00038029265124350786, + "grad_norm": 8.593864440917969, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8775914907455444, + "num_tokens": 645399147.0, + "step": 16917 + }, + { + "epoch": 2.15214349319425, + "ewc_loss": 0.07529351860284805, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038428284460678697, + "grad_norm": 8.741742134094238, + "learning_rate": 1e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8469904065132141, + "num_tokens": 645437286.0, + "step": 16918 + }, + { + "epoch": 2.1522707034728406, + "ewc_loss": 0.07471867650747299, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037853443063795567, + "grad_norm": 8.632823944091797, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8564559817314148, + "num_tokens": 645477853.0, + "step": 16919 + }, + { + "epoch": 2.152397913751431, + "ewc_loss": 0.07522733509540558, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003836210526060313, + "grad_norm": 8.683130264282227, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8551785945892334, + "num_tokens": 645520476.0, + "step": 16920 + }, + { + "epoch": 2.1525251240300216, + "ewc_loss": 0.07540347427129745, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038049957947805524, + "grad_norm": 9.207090377807617, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8492704033851624, + "num_tokens": 645561539.0, + "step": 16921 + }, + { + "epoch": 2.152652334308612, + "ewc_loss": 0.07417826354503632, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037313028587959707, + "grad_norm": 8.463519096374512, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8599413633346558, + "num_tokens": 645602950.0, + "step": 16922 + }, + { + "epoch": 2.1527795445872027, + "ewc_loss": 0.07600831985473633, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039143083267845213, + "grad_norm": 8.851415634155273, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8547523021697998, + "num_tokens": 645639215.0, + "step": 16923 + }, + { + "epoch": 2.152906754865793, + "ewc_loss": 0.0741019919514656, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037236756179481745, + "grad_norm": 8.447202682495117, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8632106781005859, + "num_tokens": 645677287.0, + "step": 16924 + }, + { + "epoch": 2.1530339651443837, + "ewc_loss": 0.07619751989841461, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039332290180027485, + "grad_norm": 8.933058738708496, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8710423707962036, + "num_tokens": 645710784.0, + "step": 16925 + }, + { + "epoch": 2.1531611754229742, + "ewc_loss": 0.07421044260263443, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037345208693295717, + "grad_norm": 8.512079238891602, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8600879907608032, + "num_tokens": 645750568.0, + "step": 16926 + }, + { + "epoch": 2.1532883857015648, + "ewc_loss": 0.07596094161272049, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003909570805262774, + "grad_norm": 8.926263809204102, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8676365613937378, + "num_tokens": 645783911.0, + "step": 16927 + }, + { + "epoch": 2.1534155959801553, + "ewc_loss": 0.07437995076179504, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037514715222641826, + "grad_norm": 8.513468742370605, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8807029128074646, + "num_tokens": 645822524.0, + "step": 16928 + }, + { + "epoch": 2.153542806258746, + "ewc_loss": 0.07588108628988266, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039015852962620556, + "grad_norm": 8.89615535736084, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8596015572547913, + "num_tokens": 645856924.0, + "step": 16929 + }, + { + "epoch": 2.1536700165373364, + "ewc_loss": 0.07456885278224945, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037703622365370393, + "grad_norm": 8.566584587097168, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8565232753753662, + "num_tokens": 645896995.0, + "step": 16930 + }, + { + "epoch": 2.153797226815927, + "ewc_loss": 0.07575435936450958, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038889123243279755, + "grad_norm": 8.777862548828125, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8705461621284485, + "num_tokens": 645941768.0, + "step": 16931 + }, + { + "epoch": 2.1539244370945174, + "ewc_loss": 0.07507583498954773, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003796645614784211, + "grad_norm": 8.60887336730957, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8619673252105713, + "num_tokens": 645981909.0, + "step": 16932 + }, + { + "epoch": 2.154051647373108, + "ewc_loss": 0.07572831213474274, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003861893492285162, + "grad_norm": 8.789804458618164, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8533343076705933, + "num_tokens": 646022034.0, + "step": 16933 + }, + { + "epoch": 2.154178857651698, + "ewc_loss": 0.07528721541166306, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003817784017883241, + "grad_norm": 8.700416564941406, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8635362386703491, + "num_tokens": 646056714.0, + "step": 16934 + }, + { + "epoch": 2.1543060679302886, + "ewc_loss": 0.07546859234571457, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003835921816062182, + "grad_norm": 8.741695404052734, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8555809259414673, + "num_tokens": 646095668.0, + "step": 16935 + }, + { + "epoch": 2.154433278208879, + "ewc_loss": 0.075299471616745, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003819009871222079, + "grad_norm": 8.680429458618164, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8486632108688354, + "num_tokens": 646136745.0, + "step": 16936 + }, + { + "epoch": 2.1545604884874696, + "ewc_loss": 0.07531219720840454, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003820282290689647, + "grad_norm": 8.690096855163574, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8602745532989502, + "num_tokens": 646180314.0, + "step": 16937 + }, + { + "epoch": 2.15468769876606, + "ewc_loss": 0.07533246278762817, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038223087904043496, + "grad_norm": 8.675363540649414, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8659939765930176, + "num_tokens": 646218191.0, + "step": 16938 + }, + { + "epoch": 2.1548149090446507, + "ewc_loss": 0.07535189390182495, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003824251762125641, + "grad_norm": 8.751314163208008, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8794994950294495, + "num_tokens": 646256350.0, + "step": 16939 + }, + { + "epoch": 2.154942119323241, + "ewc_loss": 0.07509776949882507, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003798839170485735, + "grad_norm": 8.647878646850586, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8864966630935669, + "num_tokens": 646290915.0, + "step": 16940 + }, + { + "epoch": 2.1550693296018317, + "ewc_loss": 0.07553844898939133, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038429073174484074, + "grad_norm": 8.800335884094238, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8575128316879272, + "num_tokens": 646325838.0, + "step": 16941 + }, + { + "epoch": 2.1551965398804223, + "ewc_loss": 0.0748906284570694, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037781253922730684, + "grad_norm": 8.655975341796875, + "learning_rate": 1e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8447973728179932, + "num_tokens": 646362742.0, + "step": 16942 + }, + { + "epoch": 2.155323750159013, + "ewc_loss": 0.07554037123918533, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038430996937677264, + "grad_norm": 8.718547821044922, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8511462807655334, + "num_tokens": 646403958.0, + "step": 16943 + }, + { + "epoch": 2.1554509604376033, + "ewc_loss": 0.07471626996994019, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037851036177016795, + "grad_norm": 8.661635398864746, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.855245053768158, + "num_tokens": 646442872.0, + "step": 16944 + }, + { + "epoch": 2.155578170716194, + "ewc_loss": 0.07512097805738449, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038255745312198997, + "grad_norm": 8.72696304321289, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8700990676879883, + "num_tokens": 646484642.0, + "step": 16945 + }, + { + "epoch": 2.1557053809947844, + "ewc_loss": 0.0745576024055481, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003793650830630213, + "grad_norm": 8.6502103805542, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.873218297958374, + "num_tokens": 646521688.0, + "step": 16946 + }, + { + "epoch": 2.155832591273375, + "ewc_loss": 0.07512372732162476, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003825849562417716, + "grad_norm": 8.694051742553711, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.871621310710907, + "num_tokens": 646560182.0, + "step": 16947 + }, + { + "epoch": 2.1559598015519654, + "ewc_loss": 0.07496944069862366, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003810420457739383, + "grad_norm": 8.750822067260742, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8691074848175049, + "num_tokens": 646593252.0, + "step": 16948 + }, + { + "epoch": 2.156087011830556, + "ewc_loss": 0.07461649179458618, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.00037995396996848285, + "grad_norm": 8.689148902893066, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8622437715530396, + "num_tokens": 646635514.0, + "step": 16949 + }, + { + "epoch": 2.1562142221091465, + "ewc_loss": 0.07511144876480103, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003824621671810746, + "grad_norm": 8.770405769348145, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8523274660110474, + "num_tokens": 646674785.0, + "step": 16950 + }, + { + "epoch": 2.156341432387737, + "ewc_loss": 0.07460561394691467, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003774038050323725, + "grad_norm": 8.612606048583984, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8642182350158691, + "num_tokens": 646713832.0, + "step": 16951 + }, + { + "epoch": 2.1564686426663275, + "ewc_loss": 0.07524020969867706, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038374977884814143, + "grad_norm": 8.72635555267334, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.864723801612854, + "num_tokens": 646752218.0, + "step": 16952 + }, + { + "epoch": 2.156595852944918, + "ewc_loss": 0.07466056942939758, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037795331445522606, + "grad_norm": 8.63123607635498, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8618811964988708, + "num_tokens": 646786196.0, + "step": 16953 + }, + { + "epoch": 2.1567230632235086, + "ewc_loss": 0.07522661983966827, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003836138639599085, + "grad_norm": 8.793184280395508, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8559214472770691, + "num_tokens": 646821462.0, + "step": 16954 + }, + { + "epoch": 2.156850273502099, + "ewc_loss": 0.07465144991874695, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037786219036206603, + "grad_norm": 8.616751670837402, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.871059775352478, + "num_tokens": 646856613.0, + "step": 16955 + }, + { + "epoch": 2.1569774837806897, + "ewc_loss": 0.07536410540342331, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003849887289106846, + "grad_norm": 8.760371208190918, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.875733494758606, + "num_tokens": 646893013.0, + "step": 16956 + }, + { + "epoch": 2.15710469405928, + "ewc_loss": 0.07456530630588531, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003770007169805467, + "grad_norm": 8.573440551757812, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8682543635368347, + "num_tokens": 646936180.0, + "step": 16957 + }, + { + "epoch": 2.1572319043378707, + "ewc_loss": 0.07550179958343506, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003863656602334231, + "grad_norm": 8.820950508117676, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8641631603240967, + "num_tokens": 646979314.0, + "step": 16958 + }, + { + "epoch": 2.157359114616461, + "ewc_loss": 0.07444213330745697, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003757690137717873, + "grad_norm": 8.561372756958008, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8749886751174927, + "num_tokens": 647020136.0, + "step": 16959 + }, + { + "epoch": 2.1574863248950513, + "ewc_loss": 0.07542672753334045, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003856149560306221, + "grad_norm": 8.820505142211914, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8677748441696167, + "num_tokens": 647052970.0, + "step": 16960 + }, + { + "epoch": 2.157613535173642, + "ewc_loss": 0.07491423189640045, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003780485421884805, + "grad_norm": 8.59874439239502, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8574607372283936, + "num_tokens": 647093195.0, + "step": 16961 + }, + { + "epoch": 2.1577407454522324, + "ewc_loss": 0.07541073858737946, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003854550304822624, + "grad_norm": 8.761929512023926, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8642868995666504, + "num_tokens": 647133800.0, + "step": 16962 + }, + { + "epoch": 2.157867955730823, + "ewc_loss": 0.07485638558864594, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000379911478376016, + "grad_norm": 8.688154220581055, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.847906768321991, + "num_tokens": 647167704.0, + "step": 16963 + }, + { + "epoch": 2.1579951660094134, + "ewc_loss": 0.07534366846084595, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038478439091704786, + "grad_norm": 8.759218215942383, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8509777188301086, + "num_tokens": 647208474.0, + "step": 16964 + }, + { + "epoch": 2.158122376288004, + "ewc_loss": 0.07487662136554718, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003801138955168426, + "grad_norm": 8.61529541015625, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8767354488372803, + "num_tokens": 647242131.0, + "step": 16965 + }, + { + "epoch": 2.1582495865665945, + "ewc_loss": 0.07557906210422516, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000384696846595034, + "grad_norm": 8.800185203552246, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8443694114685059, + "num_tokens": 647280952.0, + "step": 16966 + }, + { + "epoch": 2.158376796845185, + "ewc_loss": 0.07503440976142883, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037925035576336086, + "grad_norm": 8.641923904418945, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8585078716278076, + "num_tokens": 647318411.0, + "step": 16967 + }, + { + "epoch": 2.1585040071237755, + "ewc_loss": 0.07532590627670288, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003846067702397704, + "grad_norm": 8.733219146728516, + "learning_rate": 1e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8504221439361572, + "num_tokens": 647361002.0, + "step": 16968 + }, + { + "epoch": 2.158631217402366, + "ewc_loss": 0.07507698237895966, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003796760574914515, + "grad_norm": 8.633655548095703, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8658270835876465, + "num_tokens": 647402995.0, + "step": 16969 + }, + { + "epoch": 2.1587584276809566, + "ewc_loss": 0.07564535737037659, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003853598318528384, + "grad_norm": 8.791486740112305, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8654341697692871, + "num_tokens": 647441220.0, + "step": 16970 + }, + { + "epoch": 2.158885637959547, + "ewc_loss": 0.07483350485563278, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037968269316479564, + "grad_norm": 8.669960975646973, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8567072153091431, + "num_tokens": 647476832.0, + "step": 16971 + }, + { + "epoch": 2.1590128482381377, + "ewc_loss": 0.07555060088634491, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003844122402369976, + "grad_norm": 8.72961139678955, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8666998744010925, + "num_tokens": 647516855.0, + "step": 16972 + }, + { + "epoch": 2.159140058516728, + "ewc_loss": 0.07512505352497101, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038015679456293583, + "grad_norm": 8.709239959716797, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8607262969017029, + "num_tokens": 647556913.0, + "step": 16973 + }, + { + "epoch": 2.1592672687953187, + "ewc_loss": 0.0752842128276825, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003817483375314623, + "grad_norm": 8.698684692382812, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8812583684921265, + "num_tokens": 647591242.0, + "step": 16974 + }, + { + "epoch": 2.1593944790739092, + "ewc_loss": 0.07530718296766281, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038197808316908777, + "grad_norm": 8.697114944458008, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8670861721038818, + "num_tokens": 647632156.0, + "step": 16975 + }, + { + "epoch": 2.1595216893524998, + "ewc_loss": 0.07530690729618073, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003819753183051944, + "grad_norm": 8.710722923278809, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8696180582046509, + "num_tokens": 647669857.0, + "step": 16976 + }, + { + "epoch": 2.1596488996310903, + "ewc_loss": 0.0752338171005249, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003812444629147649, + "grad_norm": 8.666728019714355, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8513794541358948, + "num_tokens": 647715580.0, + "step": 16977 + }, + { + "epoch": 2.159776109909681, + "ewc_loss": 0.07550561428070068, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038396238232962787, + "grad_norm": 8.758227348327637, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.855117917060852, + "num_tokens": 647755917.0, + "step": 16978 + }, + { + "epoch": 2.1599033201882714, + "ewc_loss": 0.07516307383775711, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038053697790019214, + "grad_norm": 8.6735258102417, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8794387578964233, + "num_tokens": 647802205.0, + "step": 16979 + }, + { + "epoch": 2.160030530466862, + "ewc_loss": 0.07538712024688721, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038277747808024287, + "grad_norm": 8.735857009887695, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8508275747299194, + "num_tokens": 647846627.0, + "step": 16980 + }, + { + "epoch": 2.1601577407454524, + "ewc_loss": 0.07496844232082367, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003810320340562612, + "grad_norm": 8.741183280944824, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8547816276550293, + "num_tokens": 647885033.0, + "step": 16981 + }, + { + "epoch": 2.160284951024043, + "ewc_loss": 0.07487516850233078, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038009934360161424, + "grad_norm": 8.665132522583008, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.854121744632721, + "num_tokens": 647923370.0, + "step": 16982 + }, + { + "epoch": 2.1604121613026335, + "ewc_loss": 0.07527214288711548, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003816276730503887, + "grad_norm": 8.687411308288574, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8704582452774048, + "num_tokens": 647963431.0, + "step": 16983 + }, + { + "epoch": 2.1605393715812236, + "ewc_loss": 0.0748589038848877, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003799366531893611, + "grad_norm": 8.664505004882812, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8561236262321472, + "num_tokens": 648002854.0, + "step": 16984 + }, + { + "epoch": 2.160666581859814, + "ewc_loss": 0.07501690089702606, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038151670014485717, + "grad_norm": 8.710722923278809, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8507304191589355, + "num_tokens": 648041082.0, + "step": 16985 + }, + { + "epoch": 2.1607937921384046, + "ewc_loss": 0.07512085884809494, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003825562307611108, + "grad_norm": 8.71204948425293, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8550301194190979, + "num_tokens": 648082109.0, + "step": 16986 + }, + { + "epoch": 2.160921002416995, + "ewc_loss": 0.07477076351642609, + "ewc_loss_diag": 3.6716461181640625e-05, + "ewc_loss_parallel": 0.0003814967058133334, + "grad_norm": 8.78586483001709, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8787147402763367, + "num_tokens": 648117398.0, + "step": 16987 + }, + { + "epoch": 2.1610482126955857, + "ewc_loss": 0.07493182271718979, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000380665878765285, + "grad_norm": 8.674090385437012, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8720110058784485, + "num_tokens": 648158997.0, + "step": 16988 + }, + { + "epoch": 2.161175422974176, + "ewc_loss": 0.0750901997089386, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038224965101107955, + "grad_norm": 8.739744186401367, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8613419532775879, + "num_tokens": 648194334.0, + "step": 16989 + }, + { + "epoch": 2.1613026332527667, + "ewc_loss": 0.07489903271198273, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038033799501135945, + "grad_norm": 8.694597244262695, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8595675826072693, + "num_tokens": 648230601.0, + "step": 16990 + }, + { + "epoch": 2.1614298435313573, + "ewc_loss": 0.07499851286411285, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038133279304020107, + "grad_norm": 8.702160835266113, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8585449457168579, + "num_tokens": 648274979.0, + "step": 16991 + }, + { + "epoch": 2.161557053809948, + "ewc_loss": 0.07501820474863052, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038152970955707133, + "grad_norm": 8.688857078552246, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8478204011917114, + "num_tokens": 648319229.0, + "step": 16992 + }, + { + "epoch": 2.1616842640885383, + "ewc_loss": 0.07495816051959991, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003809292975347489, + "grad_norm": 8.681779861450195, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8767929673194885, + "num_tokens": 648354550.0, + "step": 16993 + }, + { + "epoch": 2.161811474367129, + "ewc_loss": 0.0750860720872879, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038220841088332236, + "grad_norm": 8.735624313354492, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.87651127576828, + "num_tokens": 648389820.0, + "step": 16994 + }, + { + "epoch": 2.1619386846457194, + "ewc_loss": 0.07489872723817825, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003803349391091615, + "grad_norm": 8.642660140991211, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8784819841384888, + "num_tokens": 648423841.0, + "step": 16995 + }, + { + "epoch": 2.16206589492431, + "ewc_loss": 0.07514233142137527, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000382770987926051, + "grad_norm": 8.728750228881836, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8508524894714355, + "num_tokens": 648459865.0, + "step": 16996 + }, + { + "epoch": 2.1621931052029004, + "ewc_loss": 0.07485467940568924, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037989445263519883, + "grad_norm": 8.638884544372559, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8817731142044067, + "num_tokens": 648492639.0, + "step": 16997 + }, + { + "epoch": 2.162320315481491, + "ewc_loss": 0.07526938617229462, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038404151564463973, + "grad_norm": 8.733194351196289, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8695958852767944, + "num_tokens": 648533456.0, + "step": 16998 + }, + { + "epoch": 2.1624475257600815, + "ewc_loss": 0.07495182752609253, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038086590939201415, + "grad_norm": 8.646210670471191, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8782002925872803, + "num_tokens": 648571955.0, + "step": 16999 + }, + { + "epoch": 2.162574736038672, + "ewc_loss": 0.07521942257881165, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038354191929101944, + "grad_norm": 8.727543830871582, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8801876306533813, + "num_tokens": 648614284.0, + "step": 17000 + }, + { + "epoch": 2.1627019463172625, + "ewc_loss": 0.07480201870203018, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003793678479269147, + "grad_norm": 8.647396087646484, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8728798627853394, + "num_tokens": 648654399.0, + "step": 17001 + }, + { + "epoch": 2.162829156595853, + "ewc_loss": 0.07523937523365021, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003837414551526308, + "grad_norm": 8.694217681884766, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8684768676757812, + "num_tokens": 648694490.0, + "step": 17002 + }, + { + "epoch": 2.1629563668744436, + "ewc_loss": 0.0749787762761116, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003811354108620435, + "grad_norm": 8.689501762390137, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8697683215141296, + "num_tokens": 648730665.0, + "step": 17003 + }, + { + "epoch": 2.163083577153034, + "ewc_loss": 0.07514050602912903, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003827527689281851, + "grad_norm": 8.735151290893555, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8682364225387573, + "num_tokens": 648765289.0, + "step": 17004 + }, + { + "epoch": 2.1632107874316246, + "ewc_loss": 0.07505260407924652, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038187368772923946, + "grad_norm": 8.667135238647461, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8655802607536316, + "num_tokens": 648803982.0, + "step": 17005 + }, + { + "epoch": 2.163337997710215, + "ewc_loss": 0.0750080794095993, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038142845733091235, + "grad_norm": 8.702569007873535, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8692020773887634, + "num_tokens": 648840992.0, + "step": 17006 + }, + { + "epoch": 2.1634652079888053, + "ewc_loss": 0.07492496818304062, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003805973392445594, + "grad_norm": 8.638151168823242, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8607723116874695, + "num_tokens": 648883621.0, + "step": 17007 + }, + { + "epoch": 2.1635924182673962, + "ewc_loss": 0.07529479265213013, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003842955338768661, + "grad_norm": 8.724201202392578, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8508473634719849, + "num_tokens": 648922258.0, + "step": 17008 + }, + { + "epoch": 2.1637196285459863, + "ewc_loss": 0.0750148817896843, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038149647298268974, + "grad_norm": 8.673377990722656, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8638720512390137, + "num_tokens": 648965010.0, + "step": 17009 + }, + { + "epoch": 2.163846838824577, + "ewc_loss": 0.07522672414779663, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038361491169780493, + "grad_norm": 8.68364143371582, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8589931130409241, + "num_tokens": 649008124.0, + "step": 17010 + }, + { + "epoch": 2.1639740491031674, + "ewc_loss": 0.07511280477046967, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000382475700462237, + "grad_norm": 8.728591918945312, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8642410635948181, + "num_tokens": 649044743.0, + "step": 17011 + }, + { + "epoch": 2.164101259381758, + "ewc_loss": 0.07509233057498932, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003822709550149739, + "grad_norm": 8.7300443649292, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8781980872154236, + "num_tokens": 649078410.0, + "step": 17012 + }, + { + "epoch": 2.1642284696603484, + "ewc_loss": 0.0750407725572586, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038175538065843284, + "grad_norm": 8.66052532196045, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8741329908370972, + "num_tokens": 649115415.0, + "step": 17013 + }, + { + "epoch": 2.164355679938939, + "ewc_loss": 0.075259268283844, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003839403507299721, + "grad_norm": 8.833854675292969, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8621948957443237, + "num_tokens": 649156266.0, + "step": 17014 + }, + { + "epoch": 2.1644828902175295, + "ewc_loss": 0.07477270066738129, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000379074685042724, + "grad_norm": 8.630783081054688, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8705534338951111, + "num_tokens": 649192552.0, + "step": 17015 + }, + { + "epoch": 2.16461010049612, + "ewc_loss": 0.0753955990076065, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038530369056388736, + "grad_norm": 8.759453773498535, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.866493284702301, + "num_tokens": 649224163.0, + "step": 17016 + }, + { + "epoch": 2.1647373107747105, + "ewc_loss": 0.07501204311847687, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038146806764416397, + "grad_norm": 8.672821998596191, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8854270577430725, + "num_tokens": 649259348.0, + "step": 17017 + }, + { + "epoch": 2.164864521053301, + "ewc_loss": 0.07537670433521271, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003851147193927318, + "grad_norm": 8.759827613830566, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.879347562789917, + "num_tokens": 649296875.0, + "step": 17018 + }, + { + "epoch": 2.1649917313318916, + "ewc_loss": 0.0749540776014328, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038088837754912674, + "grad_norm": 8.654130935668945, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.85096275806427, + "num_tokens": 649333455.0, + "step": 17019 + }, + { + "epoch": 2.165118941610482, + "ewc_loss": 0.07523681968450546, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003837158437818289, + "grad_norm": 8.731232643127441, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8665773868560791, + "num_tokens": 649364567.0, + "step": 17020 + }, + { + "epoch": 2.1652461518890727, + "ewc_loss": 0.07502181828022003, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038156582741066813, + "grad_norm": 8.731639862060547, + "learning_rate": 1e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8461831212043762, + "num_tokens": 649400579.0, + "step": 17021 + }, + { + "epoch": 2.165373362167663, + "ewc_loss": 0.07501918077468872, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003815394302364439, + "grad_norm": 8.728519439697266, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8764747977256775, + "num_tokens": 649432940.0, + "step": 17022 + }, + { + "epoch": 2.1655005724462537, + "ewc_loss": 0.07508330792188644, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000382180733140558, + "grad_norm": 8.672578811645508, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8702688217163086, + "num_tokens": 649466695.0, + "step": 17023 + }, + { + "epoch": 2.1656277827248442, + "ewc_loss": 0.07504524290561676, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038180011324584484, + "grad_norm": 8.719137191772461, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8611733317375183, + "num_tokens": 649504578.0, + "step": 17024 + }, + { + "epoch": 2.1657549930034348, + "ewc_loss": 0.07480473071336746, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037939497269690037, + "grad_norm": 8.681045532226562, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8536641597747803, + "num_tokens": 649541536.0, + "step": 17025 + }, + { + "epoch": 2.1658822032820253, + "ewc_loss": 0.07498110830783844, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003811586939264089, + "grad_norm": 8.708134651184082, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8640468716621399, + "num_tokens": 649581280.0, + "step": 17026 + }, + { + "epoch": 2.166009413560616, + "ewc_loss": 0.07485256344079971, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003798732941504568, + "grad_norm": 8.666988372802734, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8855880498886108, + "num_tokens": 649619837.0, + "step": 17027 + }, + { + "epoch": 2.1661366238392064, + "ewc_loss": 0.07496483623981476, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038099606172181666, + "grad_norm": 8.710885047912598, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.87534099817276, + "num_tokens": 649654708.0, + "step": 17028 + }, + { + "epoch": 2.166263834117797, + "ewc_loss": 0.07489663362503052, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003803139552474022, + "grad_norm": 8.701155662536621, + "learning_rate": 1e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8422049880027771, + "num_tokens": 649691709.0, + "step": 17029 + }, + { + "epoch": 2.1663910443963874, + "ewc_loss": 0.07505272328853607, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003794334770645946, + "grad_norm": 8.65356731414795, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8603691458702087, + "num_tokens": 649734876.0, + "step": 17030 + }, + { + "epoch": 2.166518254674978, + "ewc_loss": 0.07519933581352234, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003808995825238526, + "grad_norm": 8.678784370422363, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.873939573764801, + "num_tokens": 649775477.0, + "step": 17031 + }, + { + "epoch": 2.166645464953568, + "ewc_loss": 0.07488444447517395, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038019215571694076, + "grad_norm": 8.58604621887207, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.87647944688797, + "num_tokens": 649816136.0, + "step": 17032 + }, + { + "epoch": 2.1667726752321586, + "ewc_loss": 0.07538624852895737, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038276874693110585, + "grad_norm": 8.927000999450684, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.873041570186615, + "num_tokens": 649847942.0, + "step": 17033 + }, + { + "epoch": 2.166899885510749, + "ewc_loss": 0.07465295493602753, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037543580401688814, + "grad_norm": 8.59561824798584, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8611599206924438, + "num_tokens": 649887875.0, + "step": 17034 + }, + { + "epoch": 2.1670270957893396, + "ewc_loss": 0.07545742392539978, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038592194323427975, + "grad_norm": 8.796846389770508, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.852392852306366, + "num_tokens": 649920557.0, + "step": 17035 + }, + { + "epoch": 2.16715430606793, + "ewc_loss": 0.07439815253019333, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003753291966859251, + "grad_norm": 8.560962677001953, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8731999397277832, + "num_tokens": 649959704.0, + "step": 17036 + }, + { + "epoch": 2.1672815163465207, + "ewc_loss": 0.07531039416790009, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000384451646823436, + "grad_norm": 8.704723358154297, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.869925856590271, + "num_tokens": 650000433.0, + "step": 17037 + }, + { + "epoch": 2.167408726625111, + "ewc_loss": 0.07464037835597992, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003777514211833477, + "grad_norm": 8.562118530273438, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8764575123786926, + "num_tokens": 650038793.0, + "step": 17038 + }, + { + "epoch": 2.1675359369037017, + "ewc_loss": 0.07530419528484344, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003843895683530718, + "grad_norm": 8.740899085998535, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8743728995323181, + "num_tokens": 650072877.0, + "step": 17039 + }, + { + "epoch": 2.1676631471822922, + "ewc_loss": 0.07480307668447495, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003793784126173705, + "grad_norm": 8.612655639648438, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8530083298683167, + "num_tokens": 650109245.0, + "step": 17040 + }, + { + "epoch": 2.1677903574608828, + "ewc_loss": 0.07517962157726288, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003831439244095236, + "grad_norm": 8.67105770111084, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8784084916114807, + "num_tokens": 650150254.0, + "step": 17041 + }, + { + "epoch": 2.1679175677394733, + "ewc_loss": 0.07500194013118744, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003813670191448182, + "grad_norm": 8.646759986877441, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8442922830581665, + "num_tokens": 650189587.0, + "step": 17042 + }, + { + "epoch": 2.168044778018064, + "ewc_loss": 0.07512108981609344, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003825585008598864, + "grad_norm": 8.853303909301758, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8619025945663452, + "num_tokens": 650225443.0, + "step": 17043 + }, + { + "epoch": 2.1681719882966544, + "ewc_loss": 0.07471806555986404, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037852831883355975, + "grad_norm": 8.600162506103516, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8605427742004395, + "num_tokens": 650262319.0, + "step": 17044 + }, + { + "epoch": 2.168299198575245, + "ewc_loss": 0.07536184787750244, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003849661152344197, + "grad_norm": 8.775322914123535, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8463788032531738, + "num_tokens": 650297137.0, + "step": 17045 + }, + { + "epoch": 2.1684264088538354, + "ewc_loss": 0.07460525631904602, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003774002252612263, + "grad_norm": 8.569890975952148, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8621197938919067, + "num_tokens": 650335521.0, + "step": 17046 + }, + { + "epoch": 2.168553619132426, + "ewc_loss": 0.07552678883075714, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038661554572172463, + "grad_norm": 8.797989845275879, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8589651584625244, + "num_tokens": 650373050.0, + "step": 17047 + }, + { + "epoch": 2.1686808294110165, + "ewc_loss": 0.0745856761932373, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037720438558608294, + "grad_norm": 8.54931926727295, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8712877631187439, + "num_tokens": 650407438.0, + "step": 17048 + }, + { + "epoch": 2.168808039689607, + "ewc_loss": 0.07557173818349838, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003870650252792984, + "grad_norm": 8.77348804473877, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.860169529914856, + "num_tokens": 650447793.0, + "step": 17049 + }, + { + "epoch": 2.1689352499681975, + "ewc_loss": 0.07468332350254059, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003781809355132282, + "grad_norm": 8.59329891204834, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8598440885543823, + "num_tokens": 650487167.0, + "step": 17050 + }, + { + "epoch": 2.169062460246788, + "ewc_loss": 0.07542961090803146, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003856437688227743, + "grad_norm": 8.760071754455566, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8517513871192932, + "num_tokens": 650522960.0, + "step": 17051 + }, + { + "epoch": 2.1691896705253786, + "ewc_loss": 0.07488807290792465, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003802283899858594, + "grad_norm": 8.592544555664062, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.857509434223175, + "num_tokens": 650564161.0, + "step": 17052 + }, + { + "epoch": 2.169316880803969, + "ewc_loss": 0.07552585005760193, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000386606203392148, + "grad_norm": 8.783236503601074, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8703431487083435, + "num_tokens": 650599201.0, + "step": 17053 + }, + { + "epoch": 2.1694440910825596, + "ewc_loss": 0.07498666644096375, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003787729365285486, + "grad_norm": 8.582784652709961, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8455565571784973, + "num_tokens": 650637385.0, + "step": 17054 + }, + { + "epoch": 2.16957130136115, + "ewc_loss": 0.07562734186649323, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038762111216783524, + "grad_norm": 8.780345916748047, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.864703893661499, + "num_tokens": 650671954.0, + "step": 17055 + }, + { + "epoch": 2.1696985116397407, + "ewc_loss": 0.07487666606903076, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003801143611781299, + "grad_norm": 8.631277084350586, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.881592869758606, + "num_tokens": 650708475.0, + "step": 17056 + }, + { + "epoch": 2.169825721918331, + "ewc_loss": 0.07540516555309296, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038539929664693773, + "grad_norm": 8.676084518432617, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8678164482116699, + "num_tokens": 650748089.0, + "step": 17057 + }, + { + "epoch": 2.1699529321969213, + "ewc_loss": 0.07503870129585266, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038173465873114765, + "grad_norm": 8.69486141204834, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.86713707447052, + "num_tokens": 650790869.0, + "step": 17058 + }, + { + "epoch": 2.170080142475512, + "ewc_loss": 0.0753747820854187, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003826541069429368, + "grad_norm": 8.832070350646973, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.850562572479248, + "num_tokens": 650830697.0, + "step": 17059 + }, + { + "epoch": 2.1702073527541024, + "ewc_loss": 0.07476380467414856, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037898574373684824, + "grad_norm": 8.6548433303833, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8744438290596008, + "num_tokens": 650868755.0, + "step": 17060 + }, + { + "epoch": 2.170334563032693, + "ewc_loss": 0.07547848671674728, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003836911346297711, + "grad_norm": 8.724963188171387, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8747779130935669, + "num_tokens": 650908019.0, + "step": 17061 + }, + { + "epoch": 2.1704617733112834, + "ewc_loss": 0.07501421868801117, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037904849159531295, + "grad_norm": 8.636868476867676, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8630672693252563, + "num_tokens": 650954195.0, + "step": 17062 + }, + { + "epoch": 2.170588983589874, + "ewc_loss": 0.07538963109254837, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003828025655820966, + "grad_norm": 8.694910049438477, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8792175054550171, + "num_tokens": 650992740.0, + "step": 17063 + }, + { + "epoch": 2.1707161938684645, + "ewc_loss": 0.075298011302948, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003818863187916577, + "grad_norm": 8.732023239135742, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8781828284263611, + "num_tokens": 651031524.0, + "step": 17064 + }, + { + "epoch": 2.170843404147055, + "ewc_loss": 0.0751686543226242, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003805928281508386, + "grad_norm": 8.781415939331055, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8641796708106995, + "num_tokens": 651070004.0, + "step": 17065 + }, + { + "epoch": 2.1709706144256455, + "ewc_loss": 0.07507593929767609, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003796656965278089, + "grad_norm": 8.725106239318848, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.851207435131073, + "num_tokens": 651104474.0, + "step": 17066 + }, + { + "epoch": 2.171097824704236, + "ewc_loss": 0.07487624883651733, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003801101993303746, + "grad_norm": 8.644554138183594, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8567062616348267, + "num_tokens": 651148913.0, + "step": 17067 + }, + { + "epoch": 2.1712250349828266, + "ewc_loss": 0.07534648478031158, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003823711012955755, + "grad_norm": 8.739418029785156, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8729554414749146, + "num_tokens": 651192072.0, + "step": 17068 + }, + { + "epoch": 2.171352245261417, + "ewc_loss": 0.07485611736774445, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000379908800823614, + "grad_norm": 8.729694366455078, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8567109107971191, + "num_tokens": 651226132.0, + "step": 17069 + }, + { + "epoch": 2.1714794555400077, + "ewc_loss": 0.0752352625131607, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003812589275185019, + "grad_norm": 8.707489967346191, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8662487268447876, + "num_tokens": 651263467.0, + "step": 17070 + }, + { + "epoch": 2.171606665818598, + "ewc_loss": 0.07505254447460175, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038187310565263033, + "grad_norm": 8.653233528137207, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8551336526870728, + "num_tokens": 651309521.0, + "step": 17071 + }, + { + "epoch": 2.1717338760971887, + "ewc_loss": 0.07534801959991455, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003823864390142262, + "grad_norm": 8.690703392028809, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8669565916061401, + "num_tokens": 651347026.0, + "step": 17072 + }, + { + "epoch": 2.1718610863757792, + "ewc_loss": 0.07510319352149963, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038237954140640795, + "grad_norm": 8.744755744934082, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8738257884979248, + "num_tokens": 651380545.0, + "step": 17073 + }, + { + "epoch": 2.1719882966543698, + "ewc_loss": 0.07526367157697678, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003815429809037596, + "grad_norm": 8.70039176940918, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8698247075080872, + "num_tokens": 651409428.0, + "step": 17074 + }, + { + "epoch": 2.1721155069329603, + "ewc_loss": 0.07514212280511856, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038276889245025814, + "grad_norm": 8.674631118774414, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8517792820930481, + "num_tokens": 651447993.0, + "step": 17075 + }, + { + "epoch": 2.172242717211551, + "ewc_loss": 0.07513247430324554, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038267235504463315, + "grad_norm": 8.707537651062012, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.858802855014801, + "num_tokens": 651488210.0, + "step": 17076 + }, + { + "epoch": 2.1723699274901414, + "ewc_loss": 0.07538169622421265, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038272319943644106, + "grad_norm": 8.69222354888916, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8603742122650146, + "num_tokens": 651525742.0, + "step": 17077 + }, + { + "epoch": 2.172497137768732, + "ewc_loss": 0.07536893337965012, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003825955791398883, + "grad_norm": 8.659168243408203, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8711488842964172, + "num_tokens": 651565569.0, + "step": 17078 + }, + { + "epoch": 2.1726243480473224, + "ewc_loss": 0.07547637820243835, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003836700052488595, + "grad_norm": 8.67791748046875, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8592777252197266, + "num_tokens": 651605291.0, + "step": 17079 + }, + { + "epoch": 2.172751558325913, + "ewc_loss": 0.07551581412553787, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003840643912553787, + "grad_norm": 8.672623634338379, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8654923439025879, + "num_tokens": 651641998.0, + "step": 17080 + }, + { + "epoch": 2.1728787686045035, + "ewc_loss": 0.07554636895656586, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038436995237134397, + "grad_norm": 8.714579582214355, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8622772097587585, + "num_tokens": 651683040.0, + "step": 17081 + }, + { + "epoch": 2.1730059788830935, + "ewc_loss": 0.07543511688709259, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003832574002444744, + "grad_norm": 8.673202514648438, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8732889890670776, + "num_tokens": 651718067.0, + "step": 17082 + }, + { + "epoch": 2.173133189161684, + "ewc_loss": 0.07561171054840088, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038502339157275856, + "grad_norm": 8.744465827941895, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8800005912780762, + "num_tokens": 651761995.0, + "step": 17083 + }, + { + "epoch": 2.1732603994402746, + "ewc_loss": 0.07541628181934357, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003830690693575889, + "grad_norm": 8.736543655395508, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8799175024032593, + "num_tokens": 651797780.0, + "step": 17084 + }, + { + "epoch": 2.173387609718865, + "ewc_loss": 0.07527032494544983, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003840508870780468, + "grad_norm": 8.702582359313965, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8523416519165039, + "num_tokens": 651832931.0, + "step": 17085 + }, + { + "epoch": 2.1735148199974557, + "ewc_loss": 0.07548484206199646, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038375466829165816, + "grad_norm": 8.914739608764648, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8586751222610474, + "num_tokens": 651868440.0, + "step": 17086 + }, + { + "epoch": 2.173642030276046, + "ewc_loss": 0.07482683658599854, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003796160744968802, + "grad_norm": 8.651359558105469, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8670040965080261, + "num_tokens": 651900029.0, + "step": 17087 + }, + { + "epoch": 2.1737692405546367, + "ewc_loss": 0.07558006793260574, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003871483204420656, + "grad_norm": 8.71256160736084, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8608073592185974, + "num_tokens": 651944892.0, + "step": 17088 + }, + { + "epoch": 2.1738964508332272, + "ewc_loss": 0.0749518871307373, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038086652057245374, + "grad_norm": 8.787442207336426, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.872118353843689, + "num_tokens": 651979330.0, + "step": 17089 + }, + { + "epoch": 2.1740236611118178, + "ewc_loss": 0.07504506409168243, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000381798337912187, + "grad_norm": 8.590373039245605, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8559350967407227, + "num_tokens": 652017389.0, + "step": 17090 + }, + { + "epoch": 2.1741508713904083, + "ewc_loss": 0.07546447217464447, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038599237450398505, + "grad_norm": 8.759210586547852, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8607777953147888, + "num_tokens": 652052260.0, + "step": 17091 + }, + { + "epoch": 2.174278081668999, + "ewc_loss": 0.07478739321231842, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003792215429712087, + "grad_norm": 8.645751953125, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.849976658821106, + "num_tokens": 652089917.0, + "step": 17092 + }, + { + "epoch": 2.1744052919475894, + "ewc_loss": 0.07547716796398163, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003861193254124373, + "grad_norm": 8.688097953796387, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8679463863372803, + "num_tokens": 652129849.0, + "step": 17093 + }, + { + "epoch": 2.17453250222618, + "ewc_loss": 0.07493538409471512, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038070150185376406, + "grad_norm": 8.65633773803711, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8556418418884277, + "num_tokens": 652163178.0, + "step": 17094 + }, + { + "epoch": 2.1746597125047704, + "ewc_loss": 0.07515527307987213, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003829003544524312, + "grad_norm": 8.707555770874023, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8714195489883423, + "num_tokens": 652201521.0, + "step": 17095 + }, + { + "epoch": 2.174786922783361, + "ewc_loss": 0.07504811137914658, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003818287805188447, + "grad_norm": 8.680079460144043, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8664051294326782, + "num_tokens": 652234820.0, + "step": 17096 + }, + { + "epoch": 2.1749141330619515, + "ewc_loss": 0.07531848549842834, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003820911515504122, + "grad_norm": 8.692131996154785, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8615802526473999, + "num_tokens": 652275686.0, + "step": 17097 + }, + { + "epoch": 2.175041343340542, + "ewc_loss": 0.07504093647003174, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003817570104729384, + "grad_norm": 8.635446548461914, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8550145626068115, + "num_tokens": 652313243.0, + "step": 17098 + }, + { + "epoch": 2.1751685536191325, + "ewc_loss": 0.07514875382184982, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003828351909760386, + "grad_norm": 8.663947105407715, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8796571493148804, + "num_tokens": 652351550.0, + "step": 17099 + }, + { + "epoch": 2.175295763897723, + "ewc_loss": 0.0750570148229599, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003819177800323814, + "grad_norm": 8.575118064880371, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8615602254867554, + "num_tokens": 652399834.0, + "step": 17100 + }, + { + "epoch": 2.1754229741763136, + "ewc_loss": 0.07526887208223343, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003840363642666489, + "grad_norm": 8.739372253417969, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8507584929466248, + "num_tokens": 652435839.0, + "step": 17101 + }, + { + "epoch": 2.175550184454904, + "ewc_loss": 0.07481718808412552, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003795195370912552, + "grad_norm": 8.61458969116211, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8605269193649292, + "num_tokens": 652475747.0, + "step": 17102 + }, + { + "epoch": 2.1756773947334946, + "ewc_loss": 0.07536723464727402, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003850199864245951, + "grad_norm": 8.720712661743164, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8759024739265442, + "num_tokens": 652517147.0, + "step": 17103 + }, + { + "epoch": 2.175804605012085, + "ewc_loss": 0.0748969316482544, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038031695294193923, + "grad_norm": 8.601500511169434, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.876544713973999, + "num_tokens": 652552811.0, + "step": 17104 + }, + { + "epoch": 2.1759318152906753, + "ewc_loss": 0.07550641149282455, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003864117607008666, + "grad_norm": 8.830822944641113, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.871406614780426, + "num_tokens": 652587709.0, + "step": 17105 + }, + { + "epoch": 2.1760590255692662, + "ewc_loss": 0.07463820278644562, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003777296806219965, + "grad_norm": 8.629088401794434, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8630222678184509, + "num_tokens": 652626213.0, + "step": 17106 + }, + { + "epoch": 2.1761862358478563, + "ewc_loss": 0.07555072754621506, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038685492472723126, + "grad_norm": 8.8201265335083, + "learning_rate": 1e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8362502455711365, + "num_tokens": 652665692.0, + "step": 17107 + }, + { + "epoch": 2.176313446126447, + "ewc_loss": 0.07489617168903351, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037786795292049646, + "grad_norm": 8.605706214904785, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8560928106307983, + "num_tokens": 652705515.0, + "step": 17108 + }, + { + "epoch": 2.1764406564050374, + "ewc_loss": 0.07528826594352722, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003842303704004735, + "grad_norm": 8.702825546264648, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8765736818313599, + "num_tokens": 652743343.0, + "step": 17109 + }, + { + "epoch": 2.176567866683628, + "ewc_loss": 0.07481654733419418, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003795131342485547, + "grad_norm": 8.635005950927734, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8584716320037842, + "num_tokens": 652784587.0, + "step": 17110 + }, + { + "epoch": 2.1766950769622184, + "ewc_loss": 0.07525220513343811, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003838697448372841, + "grad_norm": 8.728470802307129, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8565261960029602, + "num_tokens": 652821985.0, + "step": 17111 + }, + { + "epoch": 2.176822287240809, + "ewc_loss": 0.07483133673667908, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003796610690187663, + "grad_norm": 8.592384338378906, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8748975396156311, + "num_tokens": 652855772.0, + "step": 17112 + }, + { + "epoch": 2.1769494975193995, + "ewc_loss": 0.07539033889770508, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003852510417345911, + "grad_norm": 8.78080940246582, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.859991192817688, + "num_tokens": 652890497.0, + "step": 17113 + }, + { + "epoch": 2.17707670779799, + "ewc_loss": 0.07468722015619278, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037821984733454883, + "grad_norm": 8.544514656066895, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8749310970306396, + "num_tokens": 652931611.0, + "step": 17114 + }, + { + "epoch": 2.1772039180765805, + "ewc_loss": 0.07554225623607635, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038677017437294126, + "grad_norm": 8.711323738098145, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.865060567855835, + "num_tokens": 652972129.0, + "step": 17115 + }, + { + "epoch": 2.177331128355171, + "ewc_loss": 0.07483518123626709, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037969951517879963, + "grad_norm": 8.582199096679688, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.857398509979248, + "num_tokens": 653009848.0, + "step": 17116 + }, + { + "epoch": 2.1774583386337616, + "ewc_loss": 0.07570721209049225, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038597831735387444, + "grad_norm": 8.799424171447754, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.861561119556427, + "num_tokens": 653044614.0, + "step": 17117 + }, + { + "epoch": 2.177585548912352, + "ewc_loss": 0.07506172358989716, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037952346610836685, + "grad_norm": 8.505777359008789, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8785247802734375, + "num_tokens": 653087614.0, + "step": 17118 + }, + { + "epoch": 2.1777127591909426, + "ewc_loss": 0.07573573291301727, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038870502612553537, + "grad_norm": 8.819097518920898, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8778446912765503, + "num_tokens": 653120434.0, + "step": 17119 + }, + { + "epoch": 2.177839969469533, + "ewc_loss": 0.07471657544374466, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003785134176723659, + "grad_norm": 8.557652473449707, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8740943670272827, + "num_tokens": 653159390.0, + "step": 17120 + }, + { + "epoch": 2.1779671797481237, + "ewc_loss": 0.07573388516902924, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003886865160893649, + "grad_norm": 8.78748607635498, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8572918176651001, + "num_tokens": 653201045.0, + "step": 17121 + }, + { + "epoch": 2.1780943900267142, + "ewc_loss": 0.07480881363153458, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037943580537103117, + "grad_norm": 8.563616752624512, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8671414852142334, + "num_tokens": 653244968.0, + "step": 17122 + }, + { + "epoch": 2.1782216003053048, + "ewc_loss": 0.07558387517929077, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003871864464599639, + "grad_norm": 8.760848999023438, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8583225011825562, + "num_tokens": 653281157.0, + "step": 17123 + }, + { + "epoch": 2.1783488105838953, + "ewc_loss": 0.07479746639728546, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037932232953608036, + "grad_norm": 8.577004432678223, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8781770467758179, + "num_tokens": 653321055.0, + "step": 17124 + }, + { + "epoch": 2.178476020862486, + "ewc_loss": 0.07565686106681824, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003854748501908034, + "grad_norm": 8.762084007263184, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8526749014854431, + "num_tokens": 653359320.0, + "step": 17125 + }, + { + "epoch": 2.1786032311410763, + "ewc_loss": 0.07510030269622803, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037990923738107085, + "grad_norm": 8.53185749053955, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8854181170463562, + "num_tokens": 653404186.0, + "step": 17126 + }, + { + "epoch": 2.178730441419667, + "ewc_loss": 0.07595022022724152, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038840839988552034, + "grad_norm": 8.828831672668457, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.874423623085022, + "num_tokens": 653438122.0, + "step": 17127 + }, + { + "epoch": 2.1788576516982574, + "ewc_loss": 0.0750521793961525, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003794280346482992, + "grad_norm": 8.556917190551758, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8642762899398804, + "num_tokens": 653474727.0, + "step": 17128 + }, + { + "epoch": 2.178984861976848, + "ewc_loss": 0.07600906491279602, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003889969375450164, + "grad_norm": 8.81981086730957, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8701943159103394, + "num_tokens": 653513531.0, + "step": 17129 + }, + { + "epoch": 2.179112072255438, + "ewc_loss": 0.07474108040332794, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037875850102864206, + "grad_norm": 8.518316268920898, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8780470490455627, + "num_tokens": 653555871.0, + "step": 17130 + }, + { + "epoch": 2.1792392825340285, + "ewc_loss": 0.07594896852970123, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003908373764716089, + "grad_norm": 8.815681457519531, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.866926372051239, + "num_tokens": 653598825.0, + "step": 17131 + }, + { + "epoch": 2.179366492812619, + "ewc_loss": 0.07473129034042358, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003786605375353247, + "grad_norm": 8.53931713104248, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8742954134941101, + "num_tokens": 653639838.0, + "step": 17132 + }, + { + "epoch": 2.1794937030912096, + "ewc_loss": 0.07596038281917572, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003909515216946602, + "grad_norm": 8.86346435546875, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8565961122512817, + "num_tokens": 653678958.0, + "step": 17133 + }, + { + "epoch": 2.1796209133698, + "ewc_loss": 0.0748029351234436, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003793769865296781, + "grad_norm": 8.60173225402832, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8736841082572937, + "num_tokens": 653718043.0, + "step": 17134 + }, + { + "epoch": 2.1797481236483907, + "ewc_loss": 0.07591979950666428, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039054566877894104, + "grad_norm": 8.825495719909668, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8805444836616516, + "num_tokens": 653756308.0, + "step": 17135 + }, + { + "epoch": 2.179875333926981, + "ewc_loss": 0.07489755749702454, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038032326847314835, + "grad_norm": 8.623187065124512, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8733792304992676, + "num_tokens": 653793140.0, + "step": 17136 + }, + { + "epoch": 2.1800025442055717, + "ewc_loss": 0.07571135461330414, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038846119423396885, + "grad_norm": 8.852682113647461, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8697532415390015, + "num_tokens": 653823441.0, + "step": 17137 + }, + { + "epoch": 2.1801297544841622, + "ewc_loss": 0.07485417276620865, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037988938856869936, + "grad_norm": 8.638978958129883, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8552695512771606, + "num_tokens": 653860743.0, + "step": 17138 + }, + { + "epoch": 2.1802569647627528, + "ewc_loss": 0.07563357800245285, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003876834234688431, + "grad_norm": 8.836160659790039, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8766165375709534, + "num_tokens": 653894785.0, + "step": 17139 + }, + { + "epoch": 2.1803841750413433, + "ewc_loss": 0.07489662617444992, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038031392614357173, + "grad_norm": 8.646888732910156, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.861557126045227, + "num_tokens": 653932208.0, + "step": 17140 + }, + { + "epoch": 2.180511385319934, + "ewc_loss": 0.0754907876253128, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003862555022351444, + "grad_norm": 8.769258499145508, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8511887192726135, + "num_tokens": 653968818.0, + "step": 17141 + }, + { + "epoch": 2.1806385955985244, + "ewc_loss": 0.07494135200977325, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003807611938100308, + "grad_norm": 8.671628952026367, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8621520400047302, + "num_tokens": 654010696.0, + "step": 17142 + }, + { + "epoch": 2.180765805877115, + "ewc_loss": 0.07531812787055969, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038452891749329865, + "grad_norm": 8.762889862060547, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8688746690750122, + "num_tokens": 654047632.0, + "step": 17143 + }, + { + "epoch": 2.1808930161557054, + "ewc_loss": 0.07476832717657089, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037903094198554754, + "grad_norm": 8.606380462646484, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8682350516319275, + "num_tokens": 654088200.0, + "step": 17144 + }, + { + "epoch": 2.181020226434296, + "ewc_loss": 0.07550947368144989, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003864424070343375, + "grad_norm": 8.780914306640625, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8616341948509216, + "num_tokens": 654124225.0, + "step": 17145 + }, + { + "epoch": 2.1811474367128865, + "ewc_loss": 0.07488921284675598, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000380239769583568, + "grad_norm": 8.630195617675781, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8676559925079346, + "num_tokens": 654163317.0, + "step": 17146 + }, + { + "epoch": 2.181274646991477, + "ewc_loss": 0.07542023807764053, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038555002538487315, + "grad_norm": 8.775081634521484, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8584920167922974, + "num_tokens": 654199208.0, + "step": 17147 + }, + { + "epoch": 2.1814018572700675, + "ewc_loss": 0.07508011162281036, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037970737321302295, + "grad_norm": 8.60440444946289, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8533064126968384, + "num_tokens": 654239988.0, + "step": 17148 + }, + { + "epoch": 2.181529067548658, + "ewc_loss": 0.07567799091339111, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038568611489608884, + "grad_norm": 8.784600257873535, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8577990531921387, + "num_tokens": 654276361.0, + "step": 17149 + }, + { + "epoch": 2.1816562778272486, + "ewc_loss": 0.07484900951385498, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003798377583734691, + "grad_norm": 8.606196403503418, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8667658567428589, + "num_tokens": 654316690.0, + "step": 17150 + }, + { + "epoch": 2.181783488105839, + "ewc_loss": 0.0754009336233139, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003853569505736232, + "grad_norm": 8.71252155303955, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8745176196098328, + "num_tokens": 654354112.0, + "step": 17151 + }, + { + "epoch": 2.1819106983844296, + "ewc_loss": 0.07496381551027298, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003809858171734959, + "grad_norm": 8.57254409790039, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8769065141677856, + "num_tokens": 654398896.0, + "step": 17152 + }, + { + "epoch": 2.18203790866302, + "ewc_loss": 0.0754709392786026, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038605707231909037, + "grad_norm": 8.756075859069824, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8790050148963928, + "num_tokens": 654435180.0, + "step": 17153 + }, + { + "epoch": 2.1821651189416107, + "ewc_loss": 0.07483536005020142, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003797012323047966, + "grad_norm": 8.593810081481934, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8528559803962708, + "num_tokens": 654473958.0, + "step": 17154 + }, + { + "epoch": 2.1822923292202008, + "ewc_loss": 0.0755806639790535, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003871542867273092, + "grad_norm": 8.773374557495117, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8510637283325195, + "num_tokens": 654512918.0, + "step": 17155 + }, + { + "epoch": 2.1824195394987913, + "ewc_loss": 0.07489141076803207, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038026177207939327, + "grad_norm": 8.597248077392578, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8725186586380005, + "num_tokens": 654552296.0, + "step": 17156 + }, + { + "epoch": 2.182546749777382, + "ewc_loss": 0.07558038830757141, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038715152186341584, + "grad_norm": 8.782255172729492, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8715835213661194, + "num_tokens": 654592088.0, + "step": 17157 + }, + { + "epoch": 2.1826739600559724, + "ewc_loss": 0.0748988687992096, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003803363651968539, + "grad_norm": 8.606382369995117, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8579719066619873, + "num_tokens": 654635119.0, + "step": 17158 + }, + { + "epoch": 2.182801170334563, + "ewc_loss": 0.07563532888889313, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003877009730786085, + "grad_norm": 8.739672660827637, + "learning_rate": 1e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8525460958480835, + "num_tokens": 654671204.0, + "step": 17159 + }, + { + "epoch": 2.1829283806131534, + "ewc_loss": 0.07523512095212936, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038125747232697904, + "grad_norm": 8.714349746704102, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.863641619682312, + "num_tokens": 654702986.0, + "step": 17160 + }, + { + "epoch": 2.183055590891744, + "ewc_loss": 0.075676828622818, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038567453157156706, + "grad_norm": 8.70372200012207, + "learning_rate": 1e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8407256603240967, + "num_tokens": 654745813.0, + "step": 17161 + }, + { + "epoch": 2.1831828011703345, + "ewc_loss": 0.07534265518188477, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000382332771550864, + "grad_norm": 8.643637657165527, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8640137910842896, + "num_tokens": 654783305.0, + "step": 17162 + }, + { + "epoch": 2.183310011448925, + "ewc_loss": 0.07562102377414703, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003851164656225592, + "grad_norm": 8.762432098388672, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8735417127609253, + "num_tokens": 654823427.0, + "step": 17163 + }, + { + "epoch": 2.1834372217275155, + "ewc_loss": 0.07532037049531937, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038210995262488723, + "grad_norm": 8.645374298095703, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8789899349212646, + "num_tokens": 654862400.0, + "step": 17164 + }, + { + "epoch": 2.183564432006106, + "ewc_loss": 0.07566387951374054, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003855450777336955, + "grad_norm": 8.697395324707031, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8690798878669739, + "num_tokens": 654902484.0, + "step": 17165 + }, + { + "epoch": 2.1836916422846966, + "ewc_loss": 0.07536846399307251, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038259089342318475, + "grad_norm": 8.699202537536621, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.871307373046875, + "num_tokens": 654946882.0, + "step": 17166 + }, + { + "epoch": 2.183818852563287, + "ewc_loss": 0.07536937296390533, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003850414068438113, + "grad_norm": 8.73643970489502, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8553186058998108, + "num_tokens": 654977850.0, + "step": 17167 + }, + { + "epoch": 2.1839460628418776, + "ewc_loss": 0.07509434223175049, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003822911239694804, + "grad_norm": 8.694280624389648, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8508323431015015, + "num_tokens": 655015352.0, + "step": 17168 + }, + { + "epoch": 2.184073273120468, + "ewc_loss": 0.07532522082328796, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003845999017357826, + "grad_norm": 8.783113479614258, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8665714263916016, + "num_tokens": 655047488.0, + "step": 17169 + }, + { + "epoch": 2.1842004833990587, + "ewc_loss": 0.07501228898763657, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003814705414697528, + "grad_norm": 8.643311500549316, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8622750639915466, + "num_tokens": 655083410.0, + "step": 17170 + }, + { + "epoch": 2.1843276936776492, + "ewc_loss": 0.0753430724143982, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003847783664241433, + "grad_norm": 8.682439804077148, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8741462826728821, + "num_tokens": 655120594.0, + "step": 17171 + }, + { + "epoch": 2.1844549039562398, + "ewc_loss": 0.07510072737932205, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038235491956584156, + "grad_norm": 8.65435791015625, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8631654977798462, + "num_tokens": 655157310.0, + "step": 17172 + }, + { + "epoch": 2.1845821142348303, + "ewc_loss": 0.07552075386047363, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003841137804556638, + "grad_norm": 8.691383361816406, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8741683959960938, + "num_tokens": 655193749.0, + "step": 17173 + }, + { + "epoch": 2.184709324513421, + "ewc_loss": 0.07535296678543091, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003824359446298331, + "grad_norm": 8.629185676574707, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8791157007217407, + "num_tokens": 655235532.0, + "step": 17174 + }, + { + "epoch": 2.1848365347920113, + "ewc_loss": 0.07552926242351532, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038419890915974975, + "grad_norm": 8.710759162902832, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8725049495697021, + "num_tokens": 655282154.0, + "step": 17175 + }, + { + "epoch": 2.184963745070602, + "ewc_loss": 0.07537876814603806, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038269395008683205, + "grad_norm": 8.680139541625977, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8577855825424194, + "num_tokens": 655321342.0, + "step": 17176 + }, + { + "epoch": 2.1850909553491924, + "ewc_loss": 0.0754319429397583, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038322570617310703, + "grad_norm": 8.69211196899414, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.882321298122406, + "num_tokens": 655358107.0, + "step": 17177 + }, + { + "epoch": 2.185218165627783, + "ewc_loss": 0.07553291320800781, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038423537625931203, + "grad_norm": 8.709334373474121, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8596898317337036, + "num_tokens": 655393462.0, + "step": 17178 + }, + { + "epoch": 2.1853453759063735, + "ewc_loss": 0.07526219636201859, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003815281961578876, + "grad_norm": 8.646584510803223, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8657474517822266, + "num_tokens": 655436114.0, + "step": 17179 + }, + { + "epoch": 2.1854725861849635, + "ewc_loss": 0.07561247050762177, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003850309003610164, + "grad_norm": 8.822251319885254, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8676425218582153, + "num_tokens": 655473119.0, + "step": 17180 + }, + { + "epoch": 2.185599796463554, + "ewc_loss": 0.07512316852807999, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038013793528079987, + "grad_norm": 8.724555969238281, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8603023290634155, + "num_tokens": 655515537.0, + "step": 17181 + }, + { + "epoch": 2.1857270067421446, + "ewc_loss": 0.07545106112957001, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038341686013154685, + "grad_norm": 8.721118927001953, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8587142825126648, + "num_tokens": 655555944.0, + "step": 17182 + }, + { + "epoch": 2.185854217020735, + "ewc_loss": 0.07523816078901291, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038128785672597587, + "grad_norm": 8.86510944366455, + "learning_rate": 1e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8483158349990845, + "num_tokens": 655589952.0, + "step": 17183 + }, + { + "epoch": 2.1859814272993257, + "ewc_loss": 0.07502162456512451, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003791225026361644, + "grad_norm": 8.647507667541504, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8773534297943115, + "num_tokens": 655628269.0, + "step": 17184 + }, + { + "epoch": 2.186108637577916, + "ewc_loss": 0.07554894685745239, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003843956801574677, + "grad_norm": 8.83074951171875, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8688271045684814, + "num_tokens": 655657315.0, + "step": 17185 + }, + { + "epoch": 2.1862358478565067, + "ewc_loss": 0.07496557384729385, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003785619919653982, + "grad_norm": 8.638236045837402, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8652384281158447, + "num_tokens": 655696117.0, + "step": 17186 + }, + { + "epoch": 2.1863630581350972, + "ewc_loss": 0.07538522779941559, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038519990630447865, + "grad_norm": 8.783778190612793, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8462831974029541, + "num_tokens": 655738823.0, + "step": 17187 + }, + { + "epoch": 2.1864902684136878, + "ewc_loss": 0.07491161674261093, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003780224360525608, + "grad_norm": 8.653876304626465, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8549221754074097, + "num_tokens": 655781457.0, + "step": 17188 + }, + { + "epoch": 2.1866174786922783, + "ewc_loss": 0.07553337514400482, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003842400328721851, + "grad_norm": 8.764405250549316, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8595983386039734, + "num_tokens": 655824704.0, + "step": 17189 + }, + { + "epoch": 2.186744688970869, + "ewc_loss": 0.07503481209278107, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003792543720919639, + "grad_norm": 8.70868968963623, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8747265338897705, + "num_tokens": 655860013.0, + "step": 17190 + }, + { + "epoch": 2.1868718992494594, + "ewc_loss": 0.07505093514919281, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038185701123438776, + "grad_norm": 8.690940856933594, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8633245229721069, + "num_tokens": 655897146.0, + "step": 17191 + }, + { + "epoch": 2.18699910952805, + "ewc_loss": 0.07503638416528702, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003817114920821041, + "grad_norm": 8.662690162658691, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8695030808448792, + "num_tokens": 655934299.0, + "step": 17192 + }, + { + "epoch": 2.1871263198066404, + "ewc_loss": 0.0753960907459259, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003828671178780496, + "grad_norm": 8.677902221679688, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.870129406452179, + "num_tokens": 655976554.0, + "step": 17193 + }, + { + "epoch": 2.187253530085231, + "ewc_loss": 0.07497330754995346, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003810807247646153, + "grad_norm": 8.674605369567871, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8768948912620544, + "num_tokens": 656011756.0, + "step": 17194 + }, + { + "epoch": 2.1873807403638215, + "ewc_loss": 0.07516410946846008, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003829887427855283, + "grad_norm": 8.723322868347168, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8623422980308533, + "num_tokens": 656054405.0, + "step": 17195 + }, + { + "epoch": 2.187507950642412, + "ewc_loss": 0.07500537484884262, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038140141987241805, + "grad_norm": 8.672525405883789, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8697096109390259, + "num_tokens": 656094668.0, + "step": 17196 + }, + { + "epoch": 2.1876351609210025, + "ewc_loss": 0.07517468184232712, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003830944770015776, + "grad_norm": 8.753479957580566, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8645521402359009, + "num_tokens": 656137664.0, + "step": 17197 + }, + { + "epoch": 2.187762371199593, + "ewc_loss": 0.07482681423425674, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003796157834585756, + "grad_norm": 8.592254638671875, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8572642207145691, + "num_tokens": 656180611.0, + "step": 17198 + }, + { + "epoch": 2.1878895814781836, + "ewc_loss": 0.07527615129947662, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003841091529466212, + "grad_norm": 8.76888656616211, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8559357523918152, + "num_tokens": 656217789.0, + "step": 17199 + }, + { + "epoch": 2.188016791756774, + "ewc_loss": 0.07477307319641113, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003790784103330225, + "grad_norm": 8.643978118896484, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8620482683181763, + "num_tokens": 656261940.0, + "step": 17200 + }, + { + "epoch": 2.1881440020353646, + "ewc_loss": 0.07528610527515411, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003842086880467832, + "grad_norm": 8.774006843566895, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8741729259490967, + "num_tokens": 656296000.0, + "step": 17201 + }, + { + "epoch": 2.188271212313955, + "ewc_loss": 0.07487298548221588, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038007748662494123, + "grad_norm": 8.77675724029541, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8748917579650879, + "num_tokens": 656340862.0, + "step": 17202 + }, + { + "epoch": 2.1883984225925452, + "ewc_loss": 0.07523168623447418, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003836645046249032, + "grad_norm": 8.690694808959961, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8904443383216858, + "num_tokens": 656376923.0, + "step": 17203 + }, + { + "epoch": 2.188525632871136, + "ewc_loss": 0.07514182478189468, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003827658947557211, + "grad_norm": 8.733293533325195, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.861885666847229, + "num_tokens": 656414058.0, + "step": 17204 + }, + { + "epoch": 2.1886528431497263, + "ewc_loss": 0.07506398856639862, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038198757101781666, + "grad_norm": 8.763379096984863, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8545600175857544, + "num_tokens": 656450077.0, + "step": 17205 + }, + { + "epoch": 2.188780053428317, + "ewc_loss": 0.07529321312904358, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038427975960075855, + "grad_norm": 8.764634132385254, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8564793467521667, + "num_tokens": 656487258.0, + "step": 17206 + }, + { + "epoch": 2.1889072637069074, + "ewc_loss": 0.07509687542915344, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003823163569904864, + "grad_norm": 8.699902534484863, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8780863285064697, + "num_tokens": 656525141.0, + "step": 17207 + }, + { + "epoch": 2.189034473985498, + "ewc_loss": 0.0752732902765274, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038408051477745175, + "grad_norm": 8.789817810058594, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8691288828849792, + "num_tokens": 656561517.0, + "step": 17208 + }, + { + "epoch": 2.1891616842640884, + "ewc_loss": 0.0751049816608429, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038239749846979976, + "grad_norm": 14.314804077148438, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8709904551506042, + "num_tokens": 656595859.0, + "step": 17209 + }, + { + "epoch": 2.189288894542679, + "ewc_loss": 0.08233598619699478, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004547075368463993, + "grad_norm": 9.446710586547852, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8618682622909546, + "num_tokens": 656627652.0, + "step": 17210 + }, + { + "epoch": 2.1894161048212695, + "ewc_loss": 0.07700201123952866, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004013677535112947, + "grad_norm": 9.068843841552734, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8713630437850952, + "num_tokens": 656669951.0, + "step": 17211 + }, + { + "epoch": 2.18954331509986, + "ewc_loss": 0.07578936219215393, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038924123509787023, + "grad_norm": 8.853500366210938, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8769490718841553, + "num_tokens": 656702216.0, + "step": 17212 + }, + { + "epoch": 2.1896705253784505, + "ewc_loss": 0.07762269675731659, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00040757464012131095, + "grad_norm": 9.121028900146484, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8557699918746948, + "num_tokens": 656744816.0, + "step": 17213 + }, + { + "epoch": 2.189797735657041, + "ewc_loss": 0.07546738535165787, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038602150743827224, + "grad_norm": 8.780342102050781, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8603261709213257, + "num_tokens": 656785533.0, + "step": 17214 + }, + { + "epoch": 2.1899249459356316, + "ewc_loss": 0.0765208899974823, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039655654109083116, + "grad_norm": 8.942840576171875, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8641139268875122, + "num_tokens": 656825375.0, + "step": 17215 + }, + { + "epoch": 2.190052156214222, + "ewc_loss": 0.07550549507141113, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003864026512019336, + "grad_norm": 8.841804504394531, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8730834722518921, + "num_tokens": 656857951.0, + "step": 17216 + }, + { + "epoch": 2.1901793664928126, + "ewc_loss": 0.07604806125164032, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039182830369099975, + "grad_norm": 8.893223762512207, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8743484020233154, + "num_tokens": 656896911.0, + "step": 17217 + }, + { + "epoch": 2.190306576771403, + "ewc_loss": 0.07530137896537781, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038436148315668106, + "grad_norm": 8.75323486328125, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8729163408279419, + "num_tokens": 656935723.0, + "step": 17218 + }, + { + "epoch": 2.1904337870499937, + "ewc_loss": 0.07585643231868744, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003899119619745761, + "grad_norm": 8.929183006286621, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8773189783096313, + "num_tokens": 656970881.0, + "step": 17219 + }, + { + "epoch": 2.1905609973285842, + "ewc_loss": 0.07512256503105164, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038257334381341934, + "grad_norm": 8.731164932250977, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.860532283782959, + "num_tokens": 657007721.0, + "step": 17220 + }, + { + "epoch": 2.1906882076071748, + "ewc_loss": 0.07569946348667145, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003883422468788922, + "grad_norm": 8.87489128112793, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8751702308654785, + "num_tokens": 657042359.0, + "step": 17221 + }, + { + "epoch": 2.1908154178857653, + "ewc_loss": 0.07510615140199661, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003824091691058129, + "grad_norm": 8.723671913146973, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8710802793502808, + "num_tokens": 657084662.0, + "step": 17222 + }, + { + "epoch": 2.190942628164356, + "ewc_loss": 0.07554122060537338, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038675987161695957, + "grad_norm": 8.820624351501465, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.86707603931427, + "num_tokens": 657122002.0, + "step": 17223 + }, + { + "epoch": 2.1910698384429463, + "ewc_loss": 0.07510027289390564, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003823504375759512, + "grad_norm": 8.72468376159668, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8657851219177246, + "num_tokens": 657158768.0, + "step": 17224 + }, + { + "epoch": 2.191197048721537, + "ewc_loss": 0.07542245090007782, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003855721151921898, + "grad_norm": 8.794451713562012, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8552061319351196, + "num_tokens": 657193445.0, + "step": 17225 + }, + { + "epoch": 2.1913242590001274, + "ewc_loss": 0.075193852186203, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003832861839327961, + "grad_norm": 8.687456130981445, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8641524910926819, + "num_tokens": 657234922.0, + "step": 17226 + }, + { + "epoch": 2.191451469278718, + "ewc_loss": 0.07536108046770096, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003849584609270096, + "grad_norm": 8.795076370239258, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8562788963317871, + "num_tokens": 657270774.0, + "step": 17227 + }, + { + "epoch": 2.191578679557308, + "ewc_loss": 0.07510164380073547, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038236414548009634, + "grad_norm": 8.67411994934082, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8803396224975586, + "num_tokens": 657307046.0, + "step": 17228 + }, + { + "epoch": 2.1917058898358985, + "ewc_loss": 0.07556190341711044, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003869666834361851, + "grad_norm": 8.826221466064453, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8546873331069946, + "num_tokens": 657352865.0, + "step": 17229 + }, + { + "epoch": 2.191833100114489, + "ewc_loss": 0.07506684958934784, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003820161218754947, + "grad_norm": 8.621793746948242, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8452581167221069, + "num_tokens": 657393372.0, + "step": 17230 + }, + { + "epoch": 2.1919603103930796, + "ewc_loss": 0.07585692405700684, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003899169387295842, + "grad_norm": 8.874497413635254, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.845548689365387, + "num_tokens": 657432445.0, + "step": 17231 + }, + { + "epoch": 2.19208752067167, + "ewc_loss": 0.07503616809844971, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038170936750248075, + "grad_norm": 8.690977096557617, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8646520376205444, + "num_tokens": 657469145.0, + "step": 17232 + }, + { + "epoch": 2.1922147309502606, + "ewc_loss": 0.07563707232475281, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038771837716922164, + "grad_norm": 8.798567771911621, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.878669023513794, + "num_tokens": 657510369.0, + "step": 17233 + }, + { + "epoch": 2.192341941228851, + "ewc_loss": 0.0752544030547142, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038389168912544847, + "grad_norm": 8.698494911193848, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.871172308921814, + "num_tokens": 657550257.0, + "step": 17234 + }, + { + "epoch": 2.1924691515074417, + "ewc_loss": 0.07560397684574127, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038738740840926766, + "grad_norm": 8.770223617553711, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8679960370063782, + "num_tokens": 657587095.0, + "step": 17235 + }, + { + "epoch": 2.1925963617860322, + "ewc_loss": 0.07532045245170593, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003845521423500031, + "grad_norm": 8.691781044006348, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8701808452606201, + "num_tokens": 657619710.0, + "step": 17236 + }, + { + "epoch": 2.1927235720646228, + "ewc_loss": 0.07558880746364594, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003872356901410967, + "grad_norm": 8.760380744934082, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8617733716964722, + "num_tokens": 657653316.0, + "step": 17237 + }, + { + "epoch": 2.1928507823432133, + "ewc_loss": 0.07552023231983185, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038655000389553607, + "grad_norm": 8.706379890441895, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8603833913803101, + "num_tokens": 657693570.0, + "step": 17238 + }, + { + "epoch": 2.192977992621804, + "ewc_loss": 0.0755326896905899, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038667459739372134, + "grad_norm": 8.691300392150879, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8664553165435791, + "num_tokens": 657736781.0, + "step": 17239 + }, + { + "epoch": 2.1931052029003943, + "ewc_loss": 0.07564350217580795, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038778266753070056, + "grad_norm": 8.753610610961914, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8573073744773865, + "num_tokens": 657775603.0, + "step": 17240 + }, + { + "epoch": 2.193232413178985, + "ewc_loss": 0.0753745436668396, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000385093066142872, + "grad_norm": 8.715710639953613, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8637164235115051, + "num_tokens": 657809065.0, + "step": 17241 + }, + { + "epoch": 2.1933596234575754, + "ewc_loss": 0.07568150758743286, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003881627635564655, + "grad_norm": 8.682992935180664, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8633894920349121, + "num_tokens": 657851369.0, + "step": 17242 + }, + { + "epoch": 2.193486833736166, + "ewc_loss": 0.07562573254108429, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038760501774959266, + "grad_norm": 8.715312957763672, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8888692855834961, + "num_tokens": 657889008.0, + "step": 17243 + }, + { + "epoch": 2.1936140440147565, + "ewc_loss": 0.07556221634149551, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038696982664987445, + "grad_norm": 8.6718111038208, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8644335269927979, + "num_tokens": 657925867.0, + "step": 17244 + }, + { + "epoch": 2.193741254293347, + "ewc_loss": 0.07579178363084793, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003892654785886407, + "grad_norm": 8.78277587890625, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8549465537071228, + "num_tokens": 657959502.0, + "step": 17245 + }, + { + "epoch": 2.1938684645719375, + "ewc_loss": 0.07543487101793289, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003856963594444096, + "grad_norm": 8.643203735351562, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8686237931251526, + "num_tokens": 657999063.0, + "step": 17246 + }, + { + "epoch": 2.193995674850528, + "ewc_loss": 0.07605458050966263, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003894520632456988, + "grad_norm": 8.814327239990234, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8542094230651855, + "num_tokens": 658033016.0, + "step": 17247 + }, + { + "epoch": 2.1941228851291186, + "ewc_loss": 0.07538723945617676, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003852200461551547, + "grad_norm": 8.661295890808105, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8714531064033508, + "num_tokens": 658071259.0, + "step": 17248 + }, + { + "epoch": 2.194250095407709, + "ewc_loss": 0.07577427476644516, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038909041904844344, + "grad_norm": 8.759170532226562, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8674167394638062, + "num_tokens": 658103190.0, + "step": 17249 + }, + { + "epoch": 2.1943773056862996, + "ewc_loss": 0.07530762255191803, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003844238817691803, + "grad_norm": 8.634476661682129, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8575687408447266, + "num_tokens": 658143346.0, + "step": 17250 + }, + { + "epoch": 2.19450451596489, + "ewc_loss": 0.07594303786754608, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039077800465747714, + "grad_norm": 8.802067756652832, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8756234645843506, + "num_tokens": 658181747.0, + "step": 17251 + }, + { + "epoch": 2.1946317262434807, + "ewc_loss": 0.07514913380146027, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003828389453701675, + "grad_norm": 8.586386680603027, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8662146329879761, + "num_tokens": 658221770.0, + "step": 17252 + }, + { + "epoch": 2.1947589365220708, + "ewc_loss": 0.0759781002998352, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003911287058144808, + "grad_norm": 8.788942337036133, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8709163069725037, + "num_tokens": 658260157.0, + "step": 17253 + }, + { + "epoch": 2.1948861468006613, + "ewc_loss": 0.07523740082979202, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003837216645479202, + "grad_norm": 8.607110023498535, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8545993566513062, + "num_tokens": 658301815.0, + "step": 17254 + }, + { + "epoch": 2.195013357079252, + "ewc_loss": 0.07596704363822937, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039101808215491474, + "grad_norm": 8.807940483093262, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8680007457733154, + "num_tokens": 658340732.0, + "step": 17255 + }, + { + "epoch": 2.1951405673578424, + "ewc_loss": 0.07529697567224503, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038431741995736957, + "grad_norm": 8.601516723632812, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8593405485153198, + "num_tokens": 658380569.0, + "step": 17256 + }, + { + "epoch": 2.195267777636433, + "ewc_loss": 0.07600776851177216, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039142536115832627, + "grad_norm": 8.787787437438965, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.878372073173523, + "num_tokens": 658415842.0, + "step": 17257 + }, + { + "epoch": 2.1953949879150234, + "ewc_loss": 0.07513301819562912, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000382677826564759, + "grad_norm": 8.598388671875, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8690087199211121, + "num_tokens": 658452861.0, + "step": 17258 + }, + { + "epoch": 2.195522198193614, + "ewc_loss": 0.076353058218956, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003924368356820196, + "grad_norm": 8.86330795288086, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8687381744384766, + "num_tokens": 658487019.0, + "step": 17259 + }, + { + "epoch": 2.1956494084722045, + "ewc_loss": 0.07498802244663239, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003812278446275741, + "grad_norm": 8.576218605041504, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.865526556968689, + "num_tokens": 658529290.0, + "step": 17260 + }, + { + "epoch": 2.195776618750795, + "ewc_loss": 0.07602408528327942, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003915884590242058, + "grad_norm": 8.809977531433105, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8600536584854126, + "num_tokens": 658572118.0, + "step": 17261 + }, + { + "epoch": 2.1959038290293855, + "ewc_loss": 0.07516628503799438, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000383010454243049, + "grad_norm": 8.600802421569824, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.862567663192749, + "num_tokens": 658615972.0, + "step": 17262 + }, + { + "epoch": 2.196031039307976, + "ewc_loss": 0.07597868889570236, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003911345556844026, + "grad_norm": 8.795053482055664, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8734796047210693, + "num_tokens": 658653848.0, + "step": 17263 + }, + { + "epoch": 2.1961582495865666, + "ewc_loss": 0.07551184296607971, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038402469363063574, + "grad_norm": 8.68125057220459, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8668588399887085, + "num_tokens": 658686247.0, + "step": 17264 + }, + { + "epoch": 2.196285459865157, + "ewc_loss": 0.07575806230306625, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000388928281608969, + "grad_norm": 8.793204307556152, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8612964153289795, + "num_tokens": 658725787.0, + "step": 17265 + }, + { + "epoch": 2.1964126701437476, + "ewc_loss": 0.07522036135196686, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003835512907244265, + "grad_norm": 8.635476112365723, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8607499599456787, + "num_tokens": 658765957.0, + "step": 17266 + }, + { + "epoch": 2.196539880422338, + "ewc_loss": 0.07571903616189957, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003885379992425442, + "grad_norm": 8.749551773071289, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8770142197608948, + "num_tokens": 658800996.0, + "step": 17267 + }, + { + "epoch": 2.1966670907009287, + "ewc_loss": 0.07531566917896271, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038450429565273225, + "grad_norm": 8.71284294128418, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8622638583183289, + "num_tokens": 658833779.0, + "step": 17268 + }, + { + "epoch": 2.196794300979519, + "ewc_loss": 0.0755234807729721, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038658242556266487, + "grad_norm": 8.708427429199219, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8697740435600281, + "num_tokens": 658874425.0, + "step": 17269 + }, + { + "epoch": 2.1969215112581097, + "ewc_loss": 0.07548512518405914, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003861988661810756, + "grad_norm": 8.713557243347168, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8726744651794434, + "num_tokens": 658912007.0, + "step": 17270 + }, + { + "epoch": 2.1970487215367003, + "ewc_loss": 0.07568313181400299, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003857375995721668, + "grad_norm": 14.331305503845215, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8576720356941223, + "num_tokens": 658956775.0, + "step": 17271 + }, + { + "epoch": 2.197175931815291, + "ewc_loss": 0.08319151401519775, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00046326284063979983, + "grad_norm": 9.460619926452637, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8786472082138062, + "num_tokens": 658997132.0, + "step": 17272 + }, + { + "epoch": 2.1973031420938813, + "ewc_loss": 0.0770505741238594, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004018533800262958, + "grad_norm": 9.030972480773926, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8723804950714111, + "num_tokens": 659040437.0, + "step": 17273 + }, + { + "epoch": 2.197430352372472, + "ewc_loss": 0.07620924711227417, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039344007382169366, + "grad_norm": 8.890534400939941, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8813093900680542, + "num_tokens": 659074461.0, + "step": 17274 + }, + { + "epoch": 2.1975575626510624, + "ewc_loss": 0.07763082534074783, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00040765589801594615, + "grad_norm": 8.974082946777344, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8618272542953491, + "num_tokens": 659113818.0, + "step": 17275 + }, + { + "epoch": 2.197684772929653, + "ewc_loss": 0.07610689103603363, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039241660851985216, + "grad_norm": 8.88752269744873, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8511636257171631, + "num_tokens": 659149297.0, + "step": 17276 + }, + { + "epoch": 2.1978119832082434, + "ewc_loss": 0.07638894021511078, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003952370898332447, + "grad_norm": 8.889811515808105, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8689336776733398, + "num_tokens": 659187500.0, + "step": 17277 + }, + { + "epoch": 2.1979391934868335, + "ewc_loss": 0.076200470328331, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000393352413084358, + "grad_norm": 8.87289047241211, + "learning_rate": 1e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.8385319113731384, + "num_tokens": 659228624.0, + "step": 17278 + }, + { + "epoch": 2.198066403765424, + "ewc_loss": 0.07620684802532196, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003934160922653973, + "grad_norm": 8.85212230682373, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8723402619361877, + "num_tokens": 659270317.0, + "step": 17279 + }, + { + "epoch": 2.1981936140440146, + "ewc_loss": 0.07579104602336884, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003892580862157047, + "grad_norm": 8.842853546142578, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8643425703048706, + "num_tokens": 659302133.0, + "step": 17280 + }, + { + "epoch": 2.198320824322605, + "ewc_loss": 0.0758906751871109, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039025439764373004, + "grad_norm": 8.7970609664917, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8726761937141418, + "num_tokens": 659342469.0, + "step": 17281 + }, + { + "epoch": 2.1984480346011956, + "ewc_loss": 0.07585000991821289, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000389847788028419, + "grad_norm": 8.835979461669922, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8774181604385376, + "num_tokens": 659373786.0, + "step": 17282 + }, + { + "epoch": 2.198575244879786, + "ewc_loss": 0.07555224001407623, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003868700878228992, + "grad_norm": 8.808065414428711, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8586021065711975, + "num_tokens": 659412498.0, + "step": 17283 + }, + { + "epoch": 2.1987024551583767, + "ewc_loss": 0.07570838928222656, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038843153743073344, + "grad_norm": 8.78730583190918, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8498868942260742, + "num_tokens": 659446415.0, + "step": 17284 + }, + { + "epoch": 2.1988296654369672, + "ewc_loss": 0.0755976140499115, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038732384564355016, + "grad_norm": 8.787771224975586, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8572156429290771, + "num_tokens": 659488824.0, + "step": 17285 + }, + { + "epoch": 2.1989568757155578, + "ewc_loss": 0.07545895874500275, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003859372518491, + "grad_norm": 8.733309745788574, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8685299158096313, + "num_tokens": 659523036.0, + "step": 17286 + }, + { + "epoch": 2.1990840859941483, + "ewc_loss": 0.07554490864276886, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038679674617014825, + "grad_norm": 8.766189575195312, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8592919111251831, + "num_tokens": 659562780.0, + "step": 17287 + }, + { + "epoch": 2.199211296272739, + "ewc_loss": 0.07544314861297607, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003857791889458895, + "grad_norm": 8.713021278381348, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8712048530578613, + "num_tokens": 659599364.0, + "step": 17288 + }, + { + "epoch": 2.1993385065513293, + "ewc_loss": 0.07550729066133499, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038642057916149497, + "grad_norm": 8.74008560180664, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8593978881835938, + "num_tokens": 659636275.0, + "step": 17289 + }, + { + "epoch": 2.19946571682992, + "ewc_loss": 0.07542003691196442, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003855479881167412, + "grad_norm": 8.688447952270508, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8638481497764587, + "num_tokens": 659670315.0, + "step": 17290 + }, + { + "epoch": 2.1995929271085104, + "ewc_loss": 0.07557739317417145, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038712163222953677, + "grad_norm": 8.730152130126953, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8608039617538452, + "num_tokens": 659710968.0, + "step": 17291 + }, + { + "epoch": 2.199720137387101, + "ewc_loss": 0.07539413869380951, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038528902223333716, + "grad_norm": 8.681693077087402, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8737078905105591, + "num_tokens": 659751334.0, + "step": 17292 + }, + { + "epoch": 2.1998473476656915, + "ewc_loss": 0.07567739486694336, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003881216107401997, + "grad_norm": 8.74581527709961, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8673498034477234, + "num_tokens": 659793867.0, + "step": 17293 + }, + { + "epoch": 2.199974557944282, + "ewc_loss": 0.07524485886096954, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003837962285615504, + "grad_norm": 8.655354499816895, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8669435381889343, + "num_tokens": 659838724.0, + "step": 17294 + }, + { + "epoch": 2.2001017682228725, + "ewc_loss": 0.07571324706077576, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003884801408275962, + "grad_norm": 8.755284309387207, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8626362681388855, + "num_tokens": 659876589.0, + "step": 17295 + }, + { + "epoch": 2.200228978501463, + "ewc_loss": 0.07534158229827881, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003847634943667799, + "grad_norm": 8.670141220092773, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.857033371925354, + "num_tokens": 659914864.0, + "step": 17296 + }, + { + "epoch": 2.2003561887800536, + "ewc_loss": 0.07571642100811005, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038851186400279403, + "grad_norm": 8.749899864196777, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8743770718574524, + "num_tokens": 659952482.0, + "step": 17297 + }, + { + "epoch": 2.200483399058644, + "ewc_loss": 0.07534225285053253, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038477021735161543, + "grad_norm": 8.772650718688965, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8647151589393616, + "num_tokens": 659985192.0, + "step": 17298 + }, + { + "epoch": 2.2006106093372346, + "ewc_loss": 0.07532957196235657, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038464332465082407, + "grad_norm": 8.698301315307617, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8748269081115723, + "num_tokens": 660019505.0, + "step": 17299 + }, + { + "epoch": 2.200737819615825, + "ewc_loss": 0.07552168518304825, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000386564526706934, + "grad_norm": 8.720819473266602, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8578475713729858, + "num_tokens": 660054942.0, + "step": 17300 + }, + { + "epoch": 2.2008650298944152, + "ewc_loss": 0.07535368204116821, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038488447898998857, + "grad_norm": 8.678608894348145, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8638359308242798, + "num_tokens": 660092836.0, + "step": 17301 + }, + { + "epoch": 2.200992240173006, + "ewc_loss": 0.07559126615524292, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038726028287783265, + "grad_norm": 8.72581672668457, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8607527017593384, + "num_tokens": 660129419.0, + "step": 17302 + }, + { + "epoch": 2.2011194504515963, + "ewc_loss": 0.07531772553920746, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038452495937235653, + "grad_norm": 8.646306037902832, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8698710799217224, + "num_tokens": 660168116.0, + "step": 17303 + }, + { + "epoch": 2.201246660730187, + "ewc_loss": 0.07561235129833221, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003874711983371526, + "grad_norm": 8.734991073608398, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8597100973129272, + "num_tokens": 660208006.0, + "step": 17304 + }, + { + "epoch": 2.2013738710087773, + "ewc_loss": 0.07528035342693329, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003841511788778007, + "grad_norm": 8.640151977539062, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8652898073196411, + "num_tokens": 660254046.0, + "step": 17305 + }, + { + "epoch": 2.201501081287368, + "ewc_loss": 0.07563811540603638, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038772879634052515, + "grad_norm": 8.77305793762207, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8636300563812256, + "num_tokens": 660289321.0, + "step": 17306 + }, + { + "epoch": 2.2016282915659584, + "ewc_loss": 0.07547256350517273, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003836318792309612, + "grad_norm": 8.67065715789795, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8624905347824097, + "num_tokens": 660325052.0, + "step": 17307 + }, + { + "epoch": 2.201755501844549, + "ewc_loss": 0.07572668790817261, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038861457142047584, + "grad_norm": 8.767938613891602, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8557742238044739, + "num_tokens": 660357942.0, + "step": 17308 + }, + { + "epoch": 2.2018827121231395, + "ewc_loss": 0.07521072030067444, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038345486973412335, + "grad_norm": 8.664983749389648, + "learning_rate": 1e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8403586149215698, + "num_tokens": 660390654.0, + "step": 17309 + }, + { + "epoch": 2.20200992240173, + "ewc_loss": 0.07571592926979065, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003885069163516164, + "grad_norm": 8.790961265563965, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8648772239685059, + "num_tokens": 660432020.0, + "step": 17310 + }, + { + "epoch": 2.2021371326803205, + "ewc_loss": 0.07525080442428589, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003838557458948344, + "grad_norm": 8.699043273925781, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8652278184890747, + "num_tokens": 660465051.0, + "step": 17311 + }, + { + "epoch": 2.202264342958911, + "ewc_loss": 0.07593068480491638, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003882131422869861, + "grad_norm": 8.889796257019043, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.860618531703949, + "num_tokens": 660504967.0, + "step": 17312 + }, + { + "epoch": 2.2023915532375016, + "ewc_loss": 0.07506430149078369, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000381990714231506, + "grad_norm": 8.675214767456055, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8554226160049438, + "num_tokens": 660543826.0, + "step": 17313 + }, + { + "epoch": 2.202518763516092, + "ewc_loss": 0.07572217285633087, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003885693440679461, + "grad_norm": 8.744560241699219, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8599464893341064, + "num_tokens": 660583074.0, + "step": 17314 + }, + { + "epoch": 2.2026459737946826, + "ewc_loss": 0.07536591589450836, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003850068314932287, + "grad_norm": 8.751116752624512, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8825858235359192, + "num_tokens": 660619974.0, + "step": 17315 + }, + { + "epoch": 2.202773184073273, + "ewc_loss": 0.075445756316185, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003858052077703178, + "grad_norm": 8.757933616638184, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.870730996131897, + "num_tokens": 660654005.0, + "step": 17316 + }, + { + "epoch": 2.2029003943518637, + "ewc_loss": 0.07535214722156525, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003848691703751683, + "grad_norm": 8.7594575881958, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.853770911693573, + "num_tokens": 660689179.0, + "step": 17317 + }, + { + "epoch": 2.203027604630454, + "ewc_loss": 0.07533837854862213, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038473145104944706, + "grad_norm": 8.784187316894531, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8540357351303101, + "num_tokens": 660724923.0, + "step": 17318 + }, + { + "epoch": 2.2031548149090447, + "ewc_loss": 0.07535415887832642, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038488919381052256, + "grad_norm": 8.751958847045898, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8692464232444763, + "num_tokens": 660760763.0, + "step": 17319 + }, + { + "epoch": 2.2032820251876353, + "ewc_loss": 0.07520441710948944, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003833918599411845, + "grad_norm": 8.71269702911377, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8655015230178833, + "num_tokens": 660795520.0, + "step": 17320 + }, + { + "epoch": 2.203409235466226, + "ewc_loss": 0.07526397705078125, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003839874407276511, + "grad_norm": 8.743711471557617, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8740781545639038, + "num_tokens": 660834858.0, + "step": 17321 + }, + { + "epoch": 2.2035364457448163, + "ewc_loss": 0.07521859556436539, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003835335955955088, + "grad_norm": 8.67269515991211, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8734332323074341, + "num_tokens": 660870319.0, + "step": 17322 + }, + { + "epoch": 2.203663656023407, + "ewc_loss": 0.07544219493865967, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003857696137856692, + "grad_norm": 8.894847869873047, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.851211667060852, + "num_tokens": 660907313.0, + "step": 17323 + }, + { + "epoch": 2.2037908663019974, + "ewc_loss": 0.07489465922117233, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000380294251954183, + "grad_norm": 8.595142364501953, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8800135850906372, + "num_tokens": 660948087.0, + "step": 17324 + }, + { + "epoch": 2.203918076580588, + "ewc_loss": 0.07595175504684448, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039086525794118643, + "grad_norm": 9.177449226379395, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8694597482681274, + "num_tokens": 660984984.0, + "step": 17325 + }, + { + "epoch": 2.204045286859178, + "ewc_loss": 0.07449402660131454, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00037384650204330683, + "grad_norm": 8.484530448913574, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8615924119949341, + "num_tokens": 661022898.0, + "step": 17326 + }, + { + "epoch": 2.2041724971377685, + "ewc_loss": 0.07681849598884583, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003995325823780149, + "grad_norm": 9.448856353759766, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8737195730209351, + "num_tokens": 661058645.0, + "step": 17327 + }, + { + "epoch": 2.204299707416359, + "ewc_loss": 0.07414713501930237, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037281904951669276, + "grad_norm": 8.395177841186523, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8874872922897339, + "num_tokens": 661096954.0, + "step": 17328 + }, + { + "epoch": 2.2044269176949496, + "ewc_loss": 0.07771368324756622, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004084844549652189, + "grad_norm": 9.375152587890625, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8537155389785767, + "num_tokens": 661136947.0, + "step": 17329 + }, + { + "epoch": 2.20455412797354, + "ewc_loss": 0.07445838302373886, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00037593150045722723, + "grad_norm": 8.491700172424316, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8661293983459473, + "num_tokens": 661175376.0, + "step": 17330 + }, + { + "epoch": 2.2046813382521306, + "ewc_loss": 0.07757356017827988, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00040708325104787946, + "grad_norm": 9.214506149291992, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8581438064575195, + "num_tokens": 661213238.0, + "step": 17331 + }, + { + "epoch": 2.204808548530721, + "ewc_loss": 0.07507885992527008, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038213629159145057, + "grad_norm": 8.817540168762207, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8814530968666077, + "num_tokens": 661246512.0, + "step": 17332 + }, + { + "epoch": 2.2049357588093117, + "ewc_loss": 0.07657527923583984, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003946590004488826, + "grad_norm": 8.9720458984375, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8771447539329529, + "num_tokens": 661288272.0, + "step": 17333 + }, + { + "epoch": 2.2050629690879022, + "ewc_loss": 0.07543403655290604, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000385688035748899, + "grad_norm": 8.734076499938965, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8719140291213989, + "num_tokens": 661328705.0, + "step": 17334 + }, + { + "epoch": 2.2051901793664928, + "ewc_loss": 0.07596120238304138, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000390959728974849, + "grad_norm": 9.00271224975586, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8693059086799622, + "num_tokens": 661369877.0, + "step": 17335 + }, + { + "epoch": 2.2053173896450833, + "ewc_loss": 0.07507400214672089, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003820876881945878, + "grad_norm": 8.682037353515625, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8673899173736572, + "num_tokens": 661405209.0, + "step": 17336 + }, + { + "epoch": 2.205444599923674, + "ewc_loss": 0.07587914168834686, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039013908826746047, + "grad_norm": 8.89215087890625, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8699078559875488, + "num_tokens": 661441083.0, + "step": 17337 + }, + { + "epoch": 2.2055718102022643, + "ewc_loss": 0.07521486282348633, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003834962844848633, + "grad_norm": 8.762894630432129, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8677561283111572, + "num_tokens": 661476938.0, + "step": 17338 + }, + { + "epoch": 2.205699020480855, + "ewc_loss": 0.07564958184957504, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003878434654325247, + "grad_norm": 8.841270446777344, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8574631214141846, + "num_tokens": 661519281.0, + "step": 17339 + }, + { + "epoch": 2.2058262307594454, + "ewc_loss": 0.07523223757743835, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038367003435269, + "grad_norm": 8.699614524841309, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8808462619781494, + "num_tokens": 661555880.0, + "step": 17340 + }, + { + "epoch": 2.205953441038036, + "ewc_loss": 0.07559296488761902, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003872772795148194, + "grad_norm": 8.789966583251953, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.857864499092102, + "num_tokens": 661595937.0, + "step": 17341 + }, + { + "epoch": 2.2060806513166265, + "ewc_loss": 0.07527777552604675, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003841254219878465, + "grad_norm": 8.703460693359375, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8519316911697388, + "num_tokens": 661636868.0, + "step": 17342 + }, + { + "epoch": 2.206207861595217, + "ewc_loss": 0.07553566992282867, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038670439971610904, + "grad_norm": 8.727846145629883, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8577947616577148, + "num_tokens": 661681503.0, + "step": 17343 + }, + { + "epoch": 2.2063350718738075, + "ewc_loss": 0.07544752955436707, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038582299021072686, + "grad_norm": 8.74743938446045, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8681942224502563, + "num_tokens": 661718438.0, + "step": 17344 + }, + { + "epoch": 2.206462282152398, + "ewc_loss": 0.07542124390602112, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038556012441404164, + "grad_norm": 8.734772682189941, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8460736274719238, + "num_tokens": 661759048.0, + "step": 17345 + }, + { + "epoch": 2.2065894924309886, + "ewc_loss": 0.07545258849859238, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003858735435642302, + "grad_norm": 8.730141639709473, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8762180805206299, + "num_tokens": 661803131.0, + "step": 17346 + }, + { + "epoch": 2.206716702709579, + "ewc_loss": 0.07530222833156586, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003843699232675135, + "grad_norm": 8.687396049499512, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8659592270851135, + "num_tokens": 661838139.0, + "step": 17347 + }, + { + "epoch": 2.2068439129881696, + "ewc_loss": 0.07567247748374939, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003880724834743887, + "grad_norm": 8.824981689453125, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8584381341934204, + "num_tokens": 661878568.0, + "step": 17348 + }, + { + "epoch": 2.20697112326676, + "ewc_loss": 0.07514040172100067, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038275172119028866, + "grad_norm": 8.651032447814941, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8595127463340759, + "num_tokens": 661909110.0, + "step": 17349 + }, + { + "epoch": 2.2070983335453507, + "ewc_loss": 0.07593557238578796, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003907033824361861, + "grad_norm": 8.829367637634277, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8792390823364258, + "num_tokens": 661944472.0, + "step": 17350 + }, + { + "epoch": 2.2072255438239408, + "ewc_loss": 0.07512837648391724, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003826314059551805, + "grad_norm": 8.599630355834961, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8777387142181396, + "num_tokens": 661985865.0, + "step": 17351 + }, + { + "epoch": 2.2073527541025313, + "ewc_loss": 0.07617102563381195, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003930579114239663, + "grad_norm": 8.95417308807373, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8607321977615356, + "num_tokens": 662019460.0, + "step": 17352 + }, + { + "epoch": 2.207479964381122, + "ewc_loss": 0.07497040927410126, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038105176645331085, + "grad_norm": 8.572163581848145, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8732120394706726, + "num_tokens": 662059109.0, + "step": 17353 + }, + { + "epoch": 2.2076071746597123, + "ewc_loss": 0.07645557075738907, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003959033638238907, + "grad_norm": 8.885411262512207, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8584175109863281, + "num_tokens": 662096040.0, + "step": 17354 + }, + { + "epoch": 2.207734384938303, + "ewc_loss": 0.07498137652873993, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003811613714788109, + "grad_norm": 8.58769702911377, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8712944388389587, + "num_tokens": 662139351.0, + "step": 17355 + }, + { + "epoch": 2.2078615952168934, + "ewc_loss": 0.07654121518135071, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003967598604504019, + "grad_norm": 8.920669555664062, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.867541491985321, + "num_tokens": 662177353.0, + "step": 17356 + }, + { + "epoch": 2.207988805495484, + "ewc_loss": 0.07508596777915955, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000382207304937765, + "grad_norm": 8.619979858398438, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8551778197288513, + "num_tokens": 662209519.0, + "step": 17357 + }, + { + "epoch": 2.2081160157740745, + "ewc_loss": 0.07627183943986893, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039406606811098754, + "grad_norm": 8.945958137512207, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.846377432346344, + "num_tokens": 662251311.0, + "step": 17358 + }, + { + "epoch": 2.208243226052665, + "ewc_loss": 0.07518275082111359, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003831752110272646, + "grad_norm": 8.654149055480957, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8737397193908691, + "num_tokens": 662282770.0, + "step": 17359 + }, + { + "epoch": 2.2083704363312555, + "ewc_loss": 0.0763016864657402, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039436452789232135, + "grad_norm": 8.951468467712402, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8618515133857727, + "num_tokens": 662326770.0, + "step": 17360 + }, + { + "epoch": 2.208497646609846, + "ewc_loss": 0.07513541728258133, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003827018372248858, + "grad_norm": 8.598235130310059, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8644865155220032, + "num_tokens": 662366811.0, + "step": 17361 + }, + { + "epoch": 2.2086248568884366, + "ewc_loss": 0.07634101063013077, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039475777884945273, + "grad_norm": 8.895383834838867, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8625776767730713, + "num_tokens": 662404517.0, + "step": 17362 + }, + { + "epoch": 2.208752067167027, + "ewc_loss": 0.07512030750513077, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038255073013715446, + "grad_norm": 8.622962951660156, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8698563575744629, + "num_tokens": 662442091.0, + "step": 17363 + }, + { + "epoch": 2.2088792774456176, + "ewc_loss": 0.07605677098035812, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039191535324789584, + "grad_norm": 8.860106468200684, + "learning_rate": 1e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8409100770950317, + "num_tokens": 662483343.0, + "step": 17364 + }, + { + "epoch": 2.209006487724208, + "ewc_loss": 0.07538315653800964, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003851792716886848, + "grad_norm": 8.72492790222168, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8592709898948669, + "num_tokens": 662521994.0, + "step": 17365 + }, + { + "epoch": 2.2091336980027987, + "ewc_loss": 0.07602795958518982, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038918579230085015, + "grad_norm": 8.768828392028809, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8618570566177368, + "num_tokens": 662562737.0, + "step": 17366 + }, + { + "epoch": 2.209260908281389, + "ewc_loss": 0.07544977962970734, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000385845429264009, + "grad_norm": 8.680018424987793, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.861278235912323, + "num_tokens": 662596927.0, + "step": 17367 + }, + { + "epoch": 2.2093881185599797, + "ewc_loss": 0.07585643231868744, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003899119619745761, + "grad_norm": 8.794163703918457, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8744639754295349, + "num_tokens": 662632831.0, + "step": 17368 + }, + { + "epoch": 2.2095153288385703, + "ewc_loss": 0.07542617619037628, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038560942630283535, + "grad_norm": 8.684072494506836, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8617693185806274, + "num_tokens": 662675554.0, + "step": 17369 + }, + { + "epoch": 2.209642539117161, + "ewc_loss": 0.07586175203323364, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038996513467282057, + "grad_norm": 8.74731159210205, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8777852058410645, + "num_tokens": 662720248.0, + "step": 17370 + }, + { + "epoch": 2.2097697493957513, + "ewc_loss": 0.07571182399988174, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003884658799506724, + "grad_norm": 8.707669258117676, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8650649785995483, + "num_tokens": 662761388.0, + "step": 17371 + }, + { + "epoch": 2.209896959674342, + "ewc_loss": 0.07570485770702362, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003883962344843894, + "grad_norm": 8.758833885192871, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.86456298828125, + "num_tokens": 662798007.0, + "step": 17372 + }, + { + "epoch": 2.2100241699529324, + "ewc_loss": 0.07571415603160858, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038848925032652915, + "grad_norm": 8.71669864654541, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8619211912155151, + "num_tokens": 662836042.0, + "step": 17373 + }, + { + "epoch": 2.210151380231523, + "ewc_loss": 0.07578999549150467, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038924760883674026, + "grad_norm": 8.786335945129395, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.856109619140625, + "num_tokens": 662869465.0, + "step": 17374 + }, + { + "epoch": 2.2102785905101134, + "ewc_loss": 0.07570165395736694, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003883642493747175, + "grad_norm": 8.699594497680664, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8570376038551331, + "num_tokens": 662909440.0, + "step": 17375 + }, + { + "epoch": 2.2104058007887035, + "ewc_loss": 0.07589742541313171, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003903219476342201, + "grad_norm": 8.761638641357422, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8621758222579956, + "num_tokens": 662949637.0, + "step": 17376 + }, + { + "epoch": 2.210533011067294, + "ewc_loss": 0.07573039829730988, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003886516788043082, + "grad_norm": 8.718515396118164, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8618613481521606, + "num_tokens": 662987679.0, + "step": 17377 + }, + { + "epoch": 2.2106602213458846, + "ewc_loss": 0.07601489126682281, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039149660733528435, + "grad_norm": 8.813838958740234, + "learning_rate": 1e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8450493216514587, + "num_tokens": 663027651.0, + "step": 17378 + }, + { + "epoch": 2.210787431624475, + "ewc_loss": 0.07559805363416672, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038732821121811867, + "grad_norm": 8.753457069396973, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.879226803779602, + "num_tokens": 663063708.0, + "step": 17379 + }, + { + "epoch": 2.2109146419030656, + "ewc_loss": 0.07598772644996643, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003912249521818012, + "grad_norm": 8.765610694885254, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8572921752929688, + "num_tokens": 663097255.0, + "step": 17380 + }, + { + "epoch": 2.211041852181656, + "ewc_loss": 0.07572852075099945, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038863284862600267, + "grad_norm": 8.766136169433594, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8610240817070007, + "num_tokens": 663134496.0, + "step": 17381 + }, + { + "epoch": 2.2111690624602467, + "ewc_loss": 0.07557500898838043, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003870977379847318, + "grad_norm": 8.686097145080566, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8740705251693726, + "num_tokens": 663170278.0, + "step": 17382 + }, + { + "epoch": 2.211296272738837, + "ewc_loss": 0.0759836882352829, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039118455606512725, + "grad_norm": 8.820907592773438, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.862990140914917, + "num_tokens": 663205407.0, + "step": 17383 + }, + { + "epoch": 2.2114234830174277, + "ewc_loss": 0.07542642951011658, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038561190012842417, + "grad_norm": 8.699103355407715, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8698081374168396, + "num_tokens": 663243166.0, + "step": 17384 + }, + { + "epoch": 2.2115506932960183, + "ewc_loss": 0.07585963606834412, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000389944005291909, + "grad_norm": 8.75432014465332, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8867367506027222, + "num_tokens": 663282267.0, + "step": 17385 + }, + { + "epoch": 2.211677903574609, + "ewc_loss": 0.07569210231304169, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003882686432916671, + "grad_norm": 8.797088623046875, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8566381335258484, + "num_tokens": 663318620.0, + "step": 17386 + }, + { + "epoch": 2.2118051138531993, + "ewc_loss": 0.07556773722171783, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038702506572008133, + "grad_norm": 8.733858108520508, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8824142217636108, + "num_tokens": 663353256.0, + "step": 17387 + }, + { + "epoch": 2.21193232413179, + "ewc_loss": 0.07576470077037811, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038899469655007124, + "grad_norm": 8.80495548248291, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8680760860443115, + "num_tokens": 663386691.0, + "step": 17388 + }, + { + "epoch": 2.2120595344103804, + "ewc_loss": 0.07557065784931183, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003870542859658599, + "grad_norm": 8.691168785095215, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8621923923492432, + "num_tokens": 663416550.0, + "step": 17389 + }, + { + "epoch": 2.212186744688971, + "ewc_loss": 0.07568571716547012, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003882048185914755, + "grad_norm": 8.696800231933594, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8672626614570618, + "num_tokens": 663464912.0, + "step": 17390 + }, + { + "epoch": 2.2123139549675614, + "ewc_loss": 0.0758465826511383, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000389813503716141, + "grad_norm": 8.779436111450195, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8553613424301147, + "num_tokens": 663503786.0, + "step": 17391 + }, + { + "epoch": 2.212441165246152, + "ewc_loss": 0.0758405327796936, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038731153472326696, + "grad_norm": 8.69411563873291, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8722957372665405, + "num_tokens": 663540579.0, + "step": 17392 + }, + { + "epoch": 2.2125683755247425, + "ewc_loss": 0.07578125596046448, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038916018093004823, + "grad_norm": 8.765120506286621, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8662542104721069, + "num_tokens": 663577284.0, + "step": 17393 + }, + { + "epoch": 2.212695585803333, + "ewc_loss": 0.0756799727678299, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038814739673398435, + "grad_norm": 8.803265571594238, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8594464063644409, + "num_tokens": 663613118.0, + "step": 17394 + }, + { + "epoch": 2.2128227960819236, + "ewc_loss": 0.07571489363908768, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038849658449180424, + "grad_norm": 8.800298690795898, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8700476884841919, + "num_tokens": 663651548.0, + "step": 17395 + }, + { + "epoch": 2.212950006360514, + "ewc_loss": 0.0755162388086319, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038651004433631897, + "grad_norm": 8.737800598144531, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8459227681159973, + "num_tokens": 663694381.0, + "step": 17396 + }, + { + "epoch": 2.2130772166391046, + "ewc_loss": 0.0756993442773819, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038834105362184346, + "grad_norm": 8.702583312988281, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.868086576461792, + "num_tokens": 663734686.0, + "step": 17397 + }, + { + "epoch": 2.213204426917695, + "ewc_loss": 0.07569648325443268, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003883125027641654, + "grad_norm": 8.783669471740723, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8572211265563965, + "num_tokens": 663768764.0, + "step": 17398 + }, + { + "epoch": 2.2133316371962852, + "ewc_loss": 0.07542642951011658, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038561198743991554, + "grad_norm": 8.685922622680664, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8842092156410217, + "num_tokens": 663807650.0, + "step": 17399 + }, + { + "epoch": 2.213458847474876, + "ewc_loss": 0.07576215267181396, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038896914338693023, + "grad_norm": 8.734814643859863, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8640543222427368, + "num_tokens": 663849308.0, + "step": 17400 + }, + { + "epoch": 2.2135860577534663, + "ewc_loss": 0.07544401288032532, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003857878327835351, + "grad_norm": 8.714731216430664, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8503010869026184, + "num_tokens": 663888917.0, + "step": 17401 + }, + { + "epoch": 2.213713268032057, + "ewc_loss": 0.07568354159593582, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003881830780301243, + "grad_norm": 8.83570671081543, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8653560876846313, + "num_tokens": 663921048.0, + "step": 17402 + }, + { + "epoch": 2.2138404783106473, + "ewc_loss": 0.07535786926746368, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038492633029818535, + "grad_norm": 8.730257034301758, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8693468570709229, + "num_tokens": 663963433.0, + "step": 17403 + }, + { + "epoch": 2.213967688589238, + "ewc_loss": 0.07565009593963623, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000387848645914346, + "grad_norm": 8.80052375793457, + "learning_rate": 1e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8477921485900879, + "num_tokens": 664003294.0, + "step": 17404 + }, + { + "epoch": 2.2140948988678284, + "ewc_loss": 0.07530993223190308, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003844470193143934, + "grad_norm": 8.694657325744629, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8531243205070496, + "num_tokens": 664043161.0, + "step": 17405 + }, + { + "epoch": 2.214222109146419, + "ewc_loss": 0.07562295347452164, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000387577194487676, + "grad_norm": 8.763960838317871, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8756123781204224, + "num_tokens": 664082457.0, + "step": 17406 + }, + { + "epoch": 2.2143493194250095, + "ewc_loss": 0.07538717985153198, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038521949318237603, + "grad_norm": 8.69668197631836, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8758054971694946, + "num_tokens": 664125999.0, + "step": 17407 + }, + { + "epoch": 2.2144765297036, + "ewc_loss": 0.07575703412294388, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003889179788529873, + "grad_norm": 8.768117904663086, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8714910745620728, + "num_tokens": 664167256.0, + "step": 17408 + }, + { + "epoch": 2.2146037399821905, + "ewc_loss": 0.07528683543205261, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038421599310822785, + "grad_norm": 8.725249290466309, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8627749085426331, + "num_tokens": 664201887.0, + "step": 17409 + }, + { + "epoch": 2.214730950260781, + "ewc_loss": 0.07561653107404709, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000387512962333858, + "grad_norm": 8.741008758544922, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8703095316886902, + "num_tokens": 664238986.0, + "step": 17410 + }, + { + "epoch": 2.2148581605393716, + "ewc_loss": 0.07540377974510193, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038538541411980987, + "grad_norm": 8.715601921081543, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.861492931842804, + "num_tokens": 664283436.0, + "step": 17411 + }, + { + "epoch": 2.214985370817962, + "ewc_loss": 0.07554686814546585, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038681633304804564, + "grad_norm": 8.760645866394043, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8589799404144287, + "num_tokens": 664325196.0, + "step": 17412 + }, + { + "epoch": 2.2151125810965526, + "ewc_loss": 0.07553607225418091, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003867083869408816, + "grad_norm": 8.7285795211792, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8566482663154602, + "num_tokens": 664365722.0, + "step": 17413 + }, + { + "epoch": 2.215239791375143, + "ewc_loss": 0.07558131963014603, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038716086419299245, + "grad_norm": 8.74875259399414, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8556523323059082, + "num_tokens": 664411570.0, + "step": 17414 + }, + { + "epoch": 2.2153670016537337, + "ewc_loss": 0.07563753426074982, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038772300467826426, + "grad_norm": 8.755452156066895, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8677539825439453, + "num_tokens": 664453545.0, + "step": 17415 + }, + { + "epoch": 2.215494211932324, + "ewc_loss": 0.0756869986653328, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003882176533807069, + "grad_norm": 8.77933406829834, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8648433685302734, + "num_tokens": 664489882.0, + "step": 17416 + }, + { + "epoch": 2.2156214222109147, + "ewc_loss": 0.07547783851623535, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003861259901896119, + "grad_norm": 8.68678092956543, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8639257550239563, + "num_tokens": 664528720.0, + "step": 17417 + }, + { + "epoch": 2.2157486324895053, + "ewc_loss": 0.07581114768981934, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038945910637266934, + "grad_norm": 8.7987642288208, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.856011152267456, + "num_tokens": 664569645.0, + "step": 17418 + }, + { + "epoch": 2.215875842768096, + "ewc_loss": 0.07556377351284027, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038698536809533834, + "grad_norm": 8.799479484558105, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8601322174072266, + "num_tokens": 664612033.0, + "step": 17419 + }, + { + "epoch": 2.2160030530466863, + "ewc_loss": 0.07554410398006439, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003867887135129422, + "grad_norm": 8.767897605895996, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8711620569229126, + "num_tokens": 664651408.0, + "step": 17420 + }, + { + "epoch": 2.216130263325277, + "ewc_loss": 0.07560595870018005, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003874072863254696, + "grad_norm": 8.763401985168457, + "learning_rate": 1e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8507242202758789, + "num_tokens": 664695764.0, + "step": 17421 + }, + { + "epoch": 2.2162574736038674, + "ewc_loss": 0.07554401457309723, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038678781129419804, + "grad_norm": 8.710728645324707, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8670063614845276, + "num_tokens": 664735528.0, + "step": 17422 + }, + { + "epoch": 2.216384683882458, + "ewc_loss": 0.07576516270637512, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003889992367476225, + "grad_norm": 8.777812957763672, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8567203283309937, + "num_tokens": 664775612.0, + "step": 17423 + }, + { + "epoch": 2.216511894161048, + "ewc_loss": 0.07548972964286804, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003862449375446886, + "grad_norm": 8.741048812866211, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8614708185195923, + "num_tokens": 664812485.0, + "step": 17424 + }, + { + "epoch": 2.2166391044396385, + "ewc_loss": 0.0757143646478653, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003884913166984916, + "grad_norm": 8.775933265686035, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8497601747512817, + "num_tokens": 664853166.0, + "step": 17425 + }, + { + "epoch": 2.216766314718229, + "ewc_loss": 0.07544999569654465, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038584761205129325, + "grad_norm": 8.694490432739258, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8544632792472839, + "num_tokens": 664888778.0, + "step": 17426 + }, + { + "epoch": 2.2168935249968196, + "ewc_loss": 0.07584625482559204, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038981015677563846, + "grad_norm": 8.803120613098145, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8625980615615845, + "num_tokens": 664933783.0, + "step": 17427 + }, + { + "epoch": 2.21702073527541, + "ewc_loss": 0.07544811069965363, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038582878187298775, + "grad_norm": 8.704854011535645, + "learning_rate": 1e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8468421697616577, + "num_tokens": 664965228.0, + "step": 17428 + }, + { + "epoch": 2.2171479455540006, + "ewc_loss": 0.07580555975437164, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003894032270181924, + "grad_norm": 8.798770904541016, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8625332117080688, + "num_tokens": 664996691.0, + "step": 17429 + }, + { + "epoch": 2.217275155832591, + "ewc_loss": 0.07545243948698044, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003858720592688769, + "grad_norm": 8.64317798614502, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8749774694442749, + "num_tokens": 665032006.0, + "step": 17430 + }, + { + "epoch": 2.2174023661111817, + "ewc_loss": 0.0760919600725174, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003922672476619482, + "grad_norm": 8.81473445892334, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8542758226394653, + "num_tokens": 665071819.0, + "step": 17431 + }, + { + "epoch": 2.217529576389772, + "ewc_loss": 0.07549828290939331, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038633044459857047, + "grad_norm": 8.688729286193848, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8523353338241577, + "num_tokens": 665118883.0, + "step": 17432 + }, + { + "epoch": 2.2176567866683627, + "ewc_loss": 0.07595919072628021, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039093956002034247, + "grad_norm": 8.802494049072266, + "learning_rate": 1e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.8468905687332153, + "num_tokens": 665154926.0, + "step": 17433 + }, + { + "epoch": 2.2177839969469533, + "ewc_loss": 0.07560885697603226, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003874362155329436, + "grad_norm": 8.77764892578125, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8711382150650024, + "num_tokens": 665189910.0, + "step": 17434 + }, + { + "epoch": 2.217911207225544, + "ewc_loss": 0.07579821348190308, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038932982715778053, + "grad_norm": 8.723989486694336, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8716226816177368, + "num_tokens": 665235849.0, + "step": 17435 + }, + { + "epoch": 2.2180384175041343, + "ewc_loss": 0.07584652304649353, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038981286343187094, + "grad_norm": 8.840145111083984, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8553557395935059, + "num_tokens": 665274231.0, + "step": 17436 + }, + { + "epoch": 2.218165627782725, + "ewc_loss": 0.07581920921802521, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003870983491651714, + "grad_norm": 8.738934516906738, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8823486566543579, + "num_tokens": 665309057.0, + "step": 17437 + }, + { + "epoch": 2.2182928380613154, + "ewc_loss": 0.07582259923219681, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003895736299455166, + "grad_norm": 8.844367027282715, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8598983287811279, + "num_tokens": 665337031.0, + "step": 17438 + }, + { + "epoch": 2.218420048339906, + "ewc_loss": 0.0757104903459549, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003860111755784601, + "grad_norm": 14.2908935546875, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8629537224769592, + "num_tokens": 665376557.0, + "step": 17439 + }, + { + "epoch": 2.2185472586184964, + "ewc_loss": 0.08281077444553375, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000457013986306265, + "grad_norm": 9.359014511108398, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8620805740356445, + "num_tokens": 665419928.0, + "step": 17440 + }, + { + "epoch": 2.218674468897087, + "ewc_loss": 0.07817676663398743, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041067387792281806, + "grad_norm": 9.275642395019531, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.87094646692276, + "num_tokens": 665458827.0, + "step": 17441 + }, + { + "epoch": 2.2188016791756775, + "ewc_loss": 0.07647587358951569, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039366495911963284, + "grad_norm": 8.874839782714844, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8647114634513855, + "num_tokens": 665498623.0, + "step": 17442 + }, + { + "epoch": 2.218928889454268, + "ewc_loss": 0.07894905656576157, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000418396812165156, + "grad_norm": 9.25707721710205, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8581175208091736, + "num_tokens": 665536103.0, + "step": 17443 + }, + { + "epoch": 2.2190560997328586, + "ewc_loss": 0.07595789432525635, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003909265506081283, + "grad_norm": 8.839766502380371, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8611278533935547, + "num_tokens": 665572089.0, + "step": 17444 + }, + { + "epoch": 2.219183310011449, + "ewc_loss": 0.07740192860364914, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00040536693995818496, + "grad_norm": 9.096240997314453, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.855599045753479, + "num_tokens": 665608088.0, + "step": 17445 + }, + { + "epoch": 2.2193105202900396, + "ewc_loss": 0.0762745589017868, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039165184716694057, + "grad_norm": 8.90898609161377, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8723543286323547, + "num_tokens": 665641883.0, + "step": 17446 + }, + { + "epoch": 2.21943773056863, + "ewc_loss": 0.07678045332431793, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003991521953139454, + "grad_norm": 9.009303092956543, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8675487637519836, + "num_tokens": 665682211.0, + "step": 17447 + }, + { + "epoch": 2.2195649408472207, + "ewc_loss": 0.0757303237915039, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038865089300088584, + "grad_norm": 8.806503295898438, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8679572939872742, + "num_tokens": 665726182.0, + "step": 17448 + }, + { + "epoch": 2.2196921511258108, + "ewc_loss": 0.07660320401191711, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039493830990977585, + "grad_norm": 8.992440223693848, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8679592609405518, + "num_tokens": 665760404.0, + "step": 17449 + }, + { + "epoch": 2.2198193614044013, + "ewc_loss": 0.07564130425453186, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038776075234636664, + "grad_norm": 8.79665470123291, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8629677295684814, + "num_tokens": 665798235.0, + "step": 17450 + }, + { + "epoch": 2.219946571682992, + "ewc_loss": 0.07606241106987, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039197178557515144, + "grad_norm": 8.910836219787598, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8642944097518921, + "num_tokens": 665840118.0, + "step": 17451 + }, + { + "epoch": 2.2200737819615823, + "ewc_loss": 0.0754261463880539, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038560916436836123, + "grad_norm": 8.749110221862793, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8626900911331177, + "num_tokens": 665878498.0, + "step": 17452 + }, + { + "epoch": 2.220200992240173, + "ewc_loss": 0.07600900530815125, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003914377302862704, + "grad_norm": 8.852286338806152, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8601783514022827, + "num_tokens": 665924179.0, + "step": 17453 + }, + { + "epoch": 2.2203282025187634, + "ewc_loss": 0.07547858357429504, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003861334698740393, + "grad_norm": 8.760137557983398, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8794095516204834, + "num_tokens": 665963160.0, + "step": 17454 + }, + { + "epoch": 2.220455412797354, + "ewc_loss": 0.07588598132133484, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039020751137286425, + "grad_norm": 8.894879341125488, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.873254656791687, + "num_tokens": 665998854.0, + "step": 17455 + }, + { + "epoch": 2.2205826230759445, + "ewc_loss": 0.07561812549829483, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003875289112329483, + "grad_norm": 8.79391860961914, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8722851276397705, + "num_tokens": 666036420.0, + "step": 17456 + }, + { + "epoch": 2.220709833354535, + "ewc_loss": 0.07577764987945557, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038912417949177325, + "grad_norm": 8.900090217590332, + "learning_rate": 1e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8445996046066284, + "num_tokens": 666076692.0, + "step": 17457 + }, + { + "epoch": 2.2208370436331255, + "ewc_loss": 0.075437031686306, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003857179544866085, + "grad_norm": 8.78662395477295, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8504863977432251, + "num_tokens": 666118009.0, + "step": 17458 + }, + { + "epoch": 2.220964253911716, + "ewc_loss": 0.07574674487113953, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038881509681232274, + "grad_norm": 8.885679244995117, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8545801043510437, + "num_tokens": 666158132.0, + "step": 17459 + }, + { + "epoch": 2.2210914641903066, + "ewc_loss": 0.07519746571779251, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003833223308902234, + "grad_norm": 8.751466751098633, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8588987588882446, + "num_tokens": 666187195.0, + "step": 17460 + }, + { + "epoch": 2.221218674468897, + "ewc_loss": 0.07576148211956024, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038896247860975564, + "grad_norm": 8.834474563598633, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8687360286712646, + "num_tokens": 666227790.0, + "step": 17461 + }, + { + "epoch": 2.2213458847474876, + "ewc_loss": 0.07547561079263687, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038610375486314297, + "grad_norm": 8.820211410522461, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8574055433273315, + "num_tokens": 666268517.0, + "step": 17462 + }, + { + "epoch": 2.221473095026078, + "ewc_loss": 0.07566581666469574, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038556443178094923, + "grad_norm": 8.733531951904297, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8806890249252319, + "num_tokens": 666313727.0, + "step": 17463 + }, + { + "epoch": 2.2216003053046687, + "ewc_loss": 0.07563440501689911, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003876916889566928, + "grad_norm": 8.811348915100098, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8668420910835266, + "num_tokens": 666352713.0, + "step": 17464 + }, + { + "epoch": 2.221727515583259, + "ewc_loss": 0.07537239789962769, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038507161661982536, + "grad_norm": 8.68213176727295, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8723234534263611, + "num_tokens": 666395065.0, + "step": 17465 + }, + { + "epoch": 2.2218547258618497, + "ewc_loss": 0.07590125501155853, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003903602482751012, + "grad_norm": 8.845520973205566, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8655052185058594, + "num_tokens": 666428019.0, + "step": 17466 + }, + { + "epoch": 2.2219819361404403, + "ewc_loss": 0.07522239536046982, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003835716051980853, + "grad_norm": 8.674115180969238, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8636136054992676, + "num_tokens": 666466830.0, + "step": 17467 + }, + { + "epoch": 2.222109146419031, + "ewc_loss": 0.07602989673614502, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039164666668511927, + "grad_norm": 8.863346099853516, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8821794986724854, + "num_tokens": 666501028.0, + "step": 17468 + }, + { + "epoch": 2.2222363566976213, + "ewc_loss": 0.07528425008058548, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038419014890678227, + "grad_norm": 8.683988571166992, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8780213594436646, + "num_tokens": 666538554.0, + "step": 17469 + }, + { + "epoch": 2.222363566976212, + "ewc_loss": 0.07618427276611328, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039319039206020534, + "grad_norm": 8.934392929077148, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8637862205505371, + "num_tokens": 666577019.0, + "step": 17470 + }, + { + "epoch": 2.2224907772548024, + "ewc_loss": 0.07518255710601807, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003831732028629631, + "grad_norm": 8.603633880615234, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8780162334442139, + "num_tokens": 666616597.0, + "step": 17471 + }, + { + "epoch": 2.222617987533393, + "ewc_loss": 0.07649056613445282, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003962533373851329, + "grad_norm": 8.993206024169922, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.866867184638977, + "num_tokens": 666651523.0, + "step": 17472 + }, + { + "epoch": 2.2227451978119834, + "ewc_loss": 0.07514294981956482, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003827771870419383, + "grad_norm": 8.63431167602539, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8537994623184204, + "num_tokens": 666689270.0, + "step": 17473 + }, + { + "epoch": 2.2228724080905735, + "ewc_loss": 0.0766582190990448, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039792986353859305, + "grad_norm": 9.041430473327637, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8612488508224487, + "num_tokens": 666726836.0, + "step": 17474 + }, + { + "epoch": 2.222999618369164, + "ewc_loss": 0.07506190240383148, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038196667446754873, + "grad_norm": 8.667933464050293, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8608916401863098, + "num_tokens": 666761207.0, + "step": 17475 + }, + { + "epoch": 2.2231268286477546, + "ewc_loss": 0.07679080963134766, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039925571763888, + "grad_norm": 8.990507125854492, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8731213212013245, + "num_tokens": 666800293.0, + "step": 17476 + }, + { + "epoch": 2.223254038926345, + "ewc_loss": 0.07515623420476913, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003829099878203124, + "grad_norm": 8.617433547973633, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8739408254623413, + "num_tokens": 666843046.0, + "step": 17477 + }, + { + "epoch": 2.2233812492049356, + "ewc_loss": 0.07665388286113739, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003978864988312125, + "grad_norm": 8.97688102722168, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8790293335914612, + "num_tokens": 666885840.0, + "step": 17478 + }, + { + "epoch": 2.223508459483526, + "ewc_loss": 0.07539524137973785, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003853000816889107, + "grad_norm": 8.72602653503418, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8651245832443237, + "num_tokens": 666925654.0, + "step": 17479 + }, + { + "epoch": 2.2236356697621167, + "ewc_loss": 0.07631351053714752, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039448271854780614, + "grad_norm": 8.874174118041992, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8658406734466553, + "num_tokens": 666968098.0, + "step": 17480 + }, + { + "epoch": 2.223762880040707, + "ewc_loss": 0.07542769610881805, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003856246476061642, + "grad_norm": 8.711145401000977, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8751169443130493, + "num_tokens": 667010349.0, + "step": 17481 + }, + { + "epoch": 2.2238900903192977, + "ewc_loss": 0.07603617012500763, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039170938543975353, + "grad_norm": 8.881105422973633, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8561056852340698, + "num_tokens": 667047086.0, + "step": 17482 + }, + { + "epoch": 2.2240173005978883, + "ewc_loss": 0.07561357319355011, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003874833637382835, + "grad_norm": 8.815083503723145, + "learning_rate": 1e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8481235504150391, + "num_tokens": 667083955.0, + "step": 17483 + }, + { + "epoch": 2.224144510876479, + "ewc_loss": 0.07575660943984985, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003889137879014015, + "grad_norm": 8.811206817626953, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8701969385147095, + "num_tokens": 667119751.0, + "step": 17484 + }, + { + "epoch": 2.2242717211550693, + "ewc_loss": 0.0757928341627121, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003892759559676051, + "grad_norm": 8.904084205627441, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8585848212242126, + "num_tokens": 667150980.0, + "step": 17485 + }, + { + "epoch": 2.22439893143366, + "ewc_loss": 0.0754127949476242, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003854755777865648, + "grad_norm": 8.790855407714844, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8603500723838806, + "num_tokens": 667185198.0, + "step": 17486 + }, + { + "epoch": 2.2245261417122504, + "ewc_loss": 0.0757552981376648, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003889006038662046, + "grad_norm": 8.824872016906738, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8662846088409424, + "num_tokens": 667225286.0, + "step": 17487 + }, + { + "epoch": 2.224653351990841, + "ewc_loss": 0.07541951537132263, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003855428076349199, + "grad_norm": 8.760133743286133, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8591641187667847, + "num_tokens": 667266294.0, + "step": 17488 + }, + { + "epoch": 2.2247805622694314, + "ewc_loss": 0.07579895853996277, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003893371904268861, + "grad_norm": 8.784241676330566, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8634613752365112, + "num_tokens": 667302430.0, + "step": 17489 + }, + { + "epoch": 2.224907772548022, + "ewc_loss": 0.0755423754453659, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003867714258376509, + "grad_norm": 8.795013427734375, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8576786518096924, + "num_tokens": 667342881.0, + "step": 17490 + }, + { + "epoch": 2.2250349828266125, + "ewc_loss": 0.07543230801820755, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003856707480736077, + "grad_norm": 8.753424644470215, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8661378622055054, + "num_tokens": 667376122.0, + "step": 17491 + }, + { + "epoch": 2.225162193105203, + "ewc_loss": 0.07558656483888626, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003872133092954755, + "grad_norm": 8.786343574523926, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8566845655441284, + "num_tokens": 667415120.0, + "step": 17492 + }, + { + "epoch": 2.2252894033837936, + "ewc_loss": 0.07566754519939423, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003880231233779341, + "grad_norm": 8.763998985290527, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8629106283187866, + "num_tokens": 667451014.0, + "step": 17493 + }, + { + "epoch": 2.225416613662384, + "ewc_loss": 0.07563458383083344, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038769349339418113, + "grad_norm": 8.817275047302246, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8562929630279541, + "num_tokens": 667485600.0, + "step": 17494 + }, + { + "epoch": 2.2255438239409746, + "ewc_loss": 0.07544615119695663, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003858091658912599, + "grad_norm": 8.683865547180176, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8594100475311279, + "num_tokens": 667527561.0, + "step": 17495 + }, + { + "epoch": 2.225671034219565, + "ewc_loss": 0.07578977942466736, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038924539694562554, + "grad_norm": 8.861261367797852, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8714819550514221, + "num_tokens": 667569677.0, + "step": 17496 + }, + { + "epoch": 2.225798244498155, + "ewc_loss": 0.07535118609666824, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003848595079034567, + "grad_norm": 8.731573104858398, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8636307716369629, + "num_tokens": 667605954.0, + "step": 17497 + }, + { + "epoch": 2.225925454776746, + "ewc_loss": 0.07578790932893753, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003892267413903028, + "grad_norm": 8.776366233825684, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8675713539123535, + "num_tokens": 667645082.0, + "step": 17498 + }, + { + "epoch": 2.2260526650553363, + "ewc_loss": 0.07542195916175842, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000385567283956334, + "grad_norm": 8.744159698486328, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8613238334655762, + "num_tokens": 667681390.0, + "step": 17499 + }, + { + "epoch": 2.226179875333927, + "ewc_loss": 0.07579851150512695, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003893327375408262, + "grad_norm": 8.829450607299805, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8540127277374268, + "num_tokens": 667722772.0, + "step": 17500 + }, + { + "epoch": 2.2263070856125173, + "ewc_loss": 0.07584407925605774, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003873470122925937, + "grad_norm": 8.785340309143066, + "learning_rate": 1e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8404439687728882, + "num_tokens": 667758647.0, + "step": 17501 + }, + { + "epoch": 2.226434295891108, + "ewc_loss": 0.07586918771266937, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038759814924560487, + "grad_norm": 8.905351638793945, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8653134703636169, + "num_tokens": 667790625.0, + "step": 17502 + }, + { + "epoch": 2.2265615061696984, + "ewc_loss": 0.07566700875759125, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003855763061437756, + "grad_norm": 8.787822723388672, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.854168713092804, + "num_tokens": 667823724.0, + "step": 17503 + }, + { + "epoch": 2.226688716448289, + "ewc_loss": 0.07593049108982086, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003882111923303455, + "grad_norm": 8.78180980682373, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8772144317626953, + "num_tokens": 667862548.0, + "step": 17504 + }, + { + "epoch": 2.2268159267268794, + "ewc_loss": 0.07578275352716446, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038673379458487034, + "grad_norm": 8.74605941772461, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8576982617378235, + "num_tokens": 667905994.0, + "step": 17505 + }, + { + "epoch": 2.22694313700547, + "ewc_loss": 0.07588522136211395, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038775845314376056, + "grad_norm": 8.769670486450195, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8581960201263428, + "num_tokens": 667945087.0, + "step": 17506 + }, + { + "epoch": 2.2270703472840605, + "ewc_loss": 0.07586405426263809, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003875468100886792, + "grad_norm": 8.724447250366211, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8732380867004395, + "num_tokens": 667984736.0, + "step": 17507 + }, + { + "epoch": 2.227197557562651, + "ewc_loss": 0.07613059133291245, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003902121679857373, + "grad_norm": 8.891400337219238, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8575716614723206, + "num_tokens": 668024020.0, + "step": 17508 + }, + { + "epoch": 2.2273247678412416, + "ewc_loss": 0.07562334835529327, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038513969047926366, + "grad_norm": 8.71703052520752, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8596052527427673, + "num_tokens": 668059567.0, + "step": 17509 + }, + { + "epoch": 2.227451978119832, + "ewc_loss": 0.07630898058414459, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039199605816975236, + "grad_norm": 8.796845436096191, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8706109523773193, + "num_tokens": 668099901.0, + "step": 17510 + }, + { + "epoch": 2.2275791883984226, + "ewc_loss": 0.07555755972862244, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003869232314173132, + "grad_norm": 8.754371643066406, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8783261775970459, + "num_tokens": 668135974.0, + "step": 17511 + }, + { + "epoch": 2.227706398677013, + "ewc_loss": 0.07591855525970459, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000390533241443336, + "grad_norm": 8.788725852966309, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8677999973297119, + "num_tokens": 668179200.0, + "step": 17512 + }, + { + "epoch": 2.2278336089556037, + "ewc_loss": 0.07592839002609253, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003881901502609253, + "grad_norm": 8.798222541809082, + "learning_rate": 1e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8416450023651123, + "num_tokens": 668219683.0, + "step": 17513 + }, + { + "epoch": 2.227960819234194, + "ewc_loss": 0.0758778378367424, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038768461672589183, + "grad_norm": 8.733711242675781, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8660349249839783, + "num_tokens": 668253899.0, + "step": 17514 + }, + { + "epoch": 2.2280880295127847, + "ewc_loss": 0.0762285441160202, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003911916573997587, + "grad_norm": 8.92746353149414, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.857155442237854, + "num_tokens": 668294140.0, + "step": 17515 + }, + { + "epoch": 2.2282152397913753, + "ewc_loss": 0.07551763951778412, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000384082697564736, + "grad_norm": 8.701997756958008, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.868998646736145, + "num_tokens": 668328442.0, + "step": 17516 + }, + { + "epoch": 2.228342450069966, + "ewc_loss": 0.07642818242311478, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039318809285759926, + "grad_norm": 8.993605613708496, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8667744398117065, + "num_tokens": 668367903.0, + "step": 17517 + }, + { + "epoch": 2.2284696603485563, + "ewc_loss": 0.07521237432956696, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003810299967881292, + "grad_norm": 8.586962699890137, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8758468627929688, + "num_tokens": 668409889.0, + "step": 17518 + }, + { + "epoch": 2.228596870627147, + "ewc_loss": 0.0766061544418335, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039740916690789163, + "grad_norm": 8.99747085571289, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8692781329154968, + "num_tokens": 668451061.0, + "step": 17519 + }, + { + "epoch": 2.2287240809057374, + "ewc_loss": 0.07499248534440994, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038127251900732517, + "grad_norm": 8.583264350891113, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8424922227859497, + "num_tokens": 668493038.0, + "step": 17520 + }, + { + "epoch": 2.228851291184328, + "ewc_loss": 0.07690797746181488, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039798597572371364, + "grad_norm": 9.033824920654297, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8711215853691101, + "num_tokens": 668528087.0, + "step": 17521 + }, + { + "epoch": 2.228978501462918, + "ewc_loss": 0.07500043511390686, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038135197246447206, + "grad_norm": 8.63644027709961, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8659668564796448, + "num_tokens": 668562325.0, + "step": 17522 + }, + { + "epoch": 2.2291057117415085, + "ewc_loss": 0.0768037885427475, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039694414590485394, + "grad_norm": 8.872556686401367, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8656097054481506, + "num_tokens": 668601336.0, + "step": 17523 + }, + { + "epoch": 2.229232922020099, + "ewc_loss": 0.07554313540458679, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003843375889118761, + "grad_norm": 8.678552627563477, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8831003308296204, + "num_tokens": 668637113.0, + "step": 17524 + }, + { + "epoch": 2.2293601322986896, + "ewc_loss": 0.07643476873636246, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003956953587476164, + "grad_norm": 8.832151412963867, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8601559996604919, + "num_tokens": 668675645.0, + "step": 17525 + }, + { + "epoch": 2.22948734257728, + "ewc_loss": 0.07549765706062317, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003863242163788527, + "grad_norm": 8.654423713684082, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8654137253761292, + "num_tokens": 668706480.0, + "step": 17526 + }, + { + "epoch": 2.2296145528558706, + "ewc_loss": 0.07633104920387268, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003946580982301384, + "grad_norm": 8.910348892211914, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8592873215675354, + "num_tokens": 668743956.0, + "step": 17527 + }, + { + "epoch": 2.229741763134461, + "ewc_loss": 0.0754178985953331, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003855266550090164, + "grad_norm": 8.626254081726074, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.872556746006012, + "num_tokens": 668779134.0, + "step": 17528 + }, + { + "epoch": 2.2298689734130517, + "ewc_loss": 0.0764940157532692, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003962877963203937, + "grad_norm": 8.892633438110352, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8658910989761353, + "num_tokens": 668814978.0, + "step": 17529 + }, + { + "epoch": 2.229996183691642, + "ewc_loss": 0.07542581856250763, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038560578832402825, + "grad_norm": 8.6559419631958, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8474475145339966, + "num_tokens": 668854194.0, + "step": 17530 + }, + { + "epoch": 2.2301233939702327, + "ewc_loss": 0.07643449306488037, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003956926229875535, + "grad_norm": 8.840423583984375, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8559877872467041, + "num_tokens": 668893095.0, + "step": 17531 + }, + { + "epoch": 2.2302506042488233, + "ewc_loss": 0.07569612562656403, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038830889388918877, + "grad_norm": 8.6939058303833, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8724925518035889, + "num_tokens": 668930626.0, + "step": 17532 + }, + { + "epoch": 2.230377814527414, + "ewc_loss": 0.07623127102851868, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003936603607144207, + "grad_norm": 8.849682807922363, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8727868795394897, + "num_tokens": 668960941.0, + "step": 17533 + }, + { + "epoch": 2.2305050248060043, + "ewc_loss": 0.07563409954309464, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038768863305449486, + "grad_norm": 8.711925506591797, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8662477731704712, + "num_tokens": 668997771.0, + "step": 17534 + }, + { + "epoch": 2.230632235084595, + "ewc_loss": 0.07600373029708862, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003913849068339914, + "grad_norm": 8.751309394836426, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8781431913375854, + "num_tokens": 669038714.0, + "step": 17535 + }, + { + "epoch": 2.2307594453631854, + "ewc_loss": 0.07572218775749207, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038856948958709836, + "grad_norm": 8.726287841796875, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8723351359367371, + "num_tokens": 669072336.0, + "step": 17536 + }, + { + "epoch": 2.230886655641776, + "ewc_loss": 0.07615526020526886, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003904588520526886, + "grad_norm": 8.732366561889648, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8733019232749939, + "num_tokens": 669110303.0, + "step": 17537 + }, + { + "epoch": 2.2310138659203664, + "ewc_loss": 0.07584380358457565, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038978568045422435, + "grad_norm": 8.737692832946777, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8566151857376099, + "num_tokens": 669146874.0, + "step": 17538 + }, + { + "epoch": 2.231141076198957, + "ewc_loss": 0.07572983205318451, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003886459453497082, + "grad_norm": 8.7825927734375, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8768103122711182, + "num_tokens": 669181682.0, + "step": 17539 + }, + { + "epoch": 2.2312682864775475, + "ewc_loss": 0.07591795921325684, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003905272751580924, + "grad_norm": 8.703147888183594, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8658238649368286, + "num_tokens": 669227201.0, + "step": 17540 + }, + { + "epoch": 2.231395496756138, + "ewc_loss": 0.07601362466812134, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039148388896137476, + "grad_norm": 8.748958587646484, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8657242059707642, + "num_tokens": 669265915.0, + "step": 17541 + }, + { + "epoch": 2.2315227070347285, + "ewc_loss": 0.07578636705875397, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003892113745678216, + "grad_norm": 8.759218215942383, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8690831661224365, + "num_tokens": 669304130.0, + "step": 17542 + }, + { + "epoch": 2.231649917313319, + "ewc_loss": 0.07609786093235016, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038988483720459044, + "grad_norm": 8.756858825683594, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8619882464408875, + "num_tokens": 669344225.0, + "step": 17543 + }, + { + "epoch": 2.2317771275919096, + "ewc_loss": 0.07589548081159592, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003903024480678141, + "grad_norm": 8.741680145263672, + "learning_rate": 1e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.846312940120697, + "num_tokens": 669385126.0, + "step": 17544 + }, + { + "epoch": 2.2319043378705, + "ewc_loss": 0.07608501613140106, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003897564020007849, + "grad_norm": 8.770990371704102, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.85335373878479, + "num_tokens": 669425774.0, + "step": 17545 + }, + { + "epoch": 2.2320315481490907, + "ewc_loss": 0.07602117955684662, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003891180385835469, + "grad_norm": 8.770216941833496, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8544502258300781, + "num_tokens": 669459789.0, + "step": 17546 + }, + { + "epoch": 2.2321587584276807, + "ewc_loss": 0.07601045072078705, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003914521948900074, + "grad_norm": 8.79658317565918, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8799865245819092, + "num_tokens": 669498709.0, + "step": 17547 + }, + { + "epoch": 2.2322859687062713, + "ewc_loss": 0.0760805681347847, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000389711931347847, + "grad_norm": 8.785680770874023, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8690531253814697, + "num_tokens": 669536021.0, + "step": 17548 + }, + { + "epoch": 2.232413178984862, + "ewc_loss": 0.07608064264059067, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038971268804743886, + "grad_norm": 8.716094970703125, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8510860800743103, + "num_tokens": 669574316.0, + "step": 17549 + }, + { + "epoch": 2.2325403892634523, + "ewc_loss": 0.07603002339601517, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039164788904599845, + "grad_norm": 8.878877639770508, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8557024002075195, + "num_tokens": 669611826.0, + "step": 17550 + }, + { + "epoch": 2.232667599542043, + "ewc_loss": 0.07579196989536285, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038682599551975727, + "grad_norm": 8.704143524169922, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8616811037063599, + "num_tokens": 669655277.0, + "step": 17551 + }, + { + "epoch": 2.2327948098206334, + "ewc_loss": 0.07627583295106888, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003916645946446806, + "grad_norm": 8.776891708374023, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8578750491142273, + "num_tokens": 669693481.0, + "step": 17552 + }, + { + "epoch": 2.232922020099224, + "ewc_loss": 0.07579852640628815, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003868915082421154, + "grad_norm": 8.82181453704834, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8628326654434204, + "num_tokens": 669724104.0, + "step": 17553 + }, + { + "epoch": 2.2330492303778144, + "ewc_loss": 0.07589989900588989, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003879052528645843, + "grad_norm": 8.691642761230469, + "learning_rate": 1e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8479401469230652, + "num_tokens": 669768725.0, + "step": 17554 + }, + { + "epoch": 2.233176440656405, + "ewc_loss": 0.0762292891740799, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003911991079803556, + "grad_norm": 8.849173545837402, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8738424777984619, + "num_tokens": 669809508.0, + "step": 17555 + }, + { + "epoch": 2.2333036509349955, + "ewc_loss": 0.07581710815429688, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003870773653034121, + "grad_norm": 8.680828094482422, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8657448887825012, + "num_tokens": 669848047.0, + "step": 17556 + }, + { + "epoch": 2.233430861213586, + "ewc_loss": 0.07632136344909668, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039211989496834576, + "grad_norm": 8.794281959533691, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8812047243118286, + "num_tokens": 669881274.0, + "step": 17557 + }, + { + "epoch": 2.2335580714921766, + "ewc_loss": 0.07584617286920547, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038736796705052257, + "grad_norm": 8.752742767333984, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8590526580810547, + "num_tokens": 669918517.0, + "step": 17558 + }, + { + "epoch": 2.233685281770767, + "ewc_loss": 0.07628649473190308, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003917711728718132, + "grad_norm": 8.837776184082031, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8571538925170898, + "num_tokens": 669960990.0, + "step": 17559 + }, + { + "epoch": 2.2338124920493576, + "ewc_loss": 0.07584267854690552, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038733298424631357, + "grad_norm": 8.756195068359375, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8673712611198425, + "num_tokens": 669995542.0, + "step": 17560 + }, + { + "epoch": 2.233939702327948, + "ewc_loss": 0.07617034018039703, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003906096681021154, + "grad_norm": 8.845447540283203, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8652191758155823, + "num_tokens": 670036798.0, + "step": 17561 + }, + { + "epoch": 2.2340669126065387, + "ewc_loss": 0.07574053108692169, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038631161442026496, + "grad_norm": 8.737807273864746, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8634915351867676, + "num_tokens": 670071763.0, + "step": 17562 + }, + { + "epoch": 2.234194122885129, + "ewc_loss": 0.0762682855129242, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039158915751613677, + "grad_norm": 8.91463565826416, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8622074723243713, + "num_tokens": 670100864.0, + "step": 17563 + }, + { + "epoch": 2.2343213331637197, + "ewc_loss": 0.07556891441345215, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038459539064206183, + "grad_norm": 8.64273452758789, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8527920246124268, + "num_tokens": 670144114.0, + "step": 17564 + }, + { + "epoch": 2.2344485434423103, + "ewc_loss": 0.07640700042247772, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039297627517953515, + "grad_norm": 8.828239440917969, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.868094801902771, + "num_tokens": 670182719.0, + "step": 17565 + }, + { + "epoch": 2.234575753720901, + "ewc_loss": 0.0755634754896164, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003845409955829382, + "grad_norm": 8.728412628173828, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8677045106887817, + "num_tokens": 670218059.0, + "step": 17566 + }, + { + "epoch": 2.2347029639994913, + "ewc_loss": 0.07628436386585236, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003917498397640884, + "grad_norm": 8.770440101623535, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8604435920715332, + "num_tokens": 670259589.0, + "step": 17567 + }, + { + "epoch": 2.234830174278082, + "ewc_loss": 0.07585809379816055, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003874872054439038, + "grad_norm": 8.71345329284668, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8558579087257385, + "num_tokens": 670301755.0, + "step": 17568 + }, + { + "epoch": 2.2349573845566724, + "ewc_loss": 0.07603400945663452, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038924638647586107, + "grad_norm": 8.776968955993652, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.868418276309967, + "num_tokens": 670339205.0, + "step": 17569 + }, + { + "epoch": 2.235084594835263, + "ewc_loss": 0.07601287961006165, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038903506356291473, + "grad_norm": 8.842491149902344, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8674548268318176, + "num_tokens": 670382961.0, + "step": 17570 + }, + { + "epoch": 2.2352118051138534, + "ewc_loss": 0.0758301317691803, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038720754673704505, + "grad_norm": 8.760214805603027, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8530919551849365, + "num_tokens": 670420747.0, + "step": 17571 + }, + { + "epoch": 2.2353390153924435, + "ewc_loss": 0.07610572129487991, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038996347575448453, + "grad_norm": 8.742104530334473, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8804886341094971, + "num_tokens": 670457906.0, + "step": 17572 + }, + { + "epoch": 2.235466225671034, + "ewc_loss": 0.07596196234226227, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038852592115290463, + "grad_norm": 8.793171882629395, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8535141944885254, + "num_tokens": 670493187.0, + "step": 17573 + }, + { + "epoch": 2.2355934359496246, + "ewc_loss": 0.0759519636631012, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003884259203914553, + "grad_norm": 8.71998119354248, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8773870468139648, + "num_tokens": 670529750.0, + "step": 17574 + }, + { + "epoch": 2.235720646228215, + "ewc_loss": 0.07598106563091278, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003887169295921922, + "grad_norm": 8.777040481567383, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8614758253097534, + "num_tokens": 670570147.0, + "step": 17575 + }, + { + "epoch": 2.2358478565068056, + "ewc_loss": 0.0758751705288887, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038765795761719346, + "grad_norm": 8.78559398651123, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8663462996482849, + "num_tokens": 670604113.0, + "step": 17576 + }, + { + "epoch": 2.235975066785396, + "ewc_loss": 0.07597216963768005, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000388627900974825, + "grad_norm": 8.751338958740234, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8645486831665039, + "num_tokens": 670645451.0, + "step": 17577 + }, + { + "epoch": 2.2361022770639867, + "ewc_loss": 0.07599252462387085, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003888314822688699, + "grad_norm": 8.810564041137695, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8694772720336914, + "num_tokens": 670684055.0, + "step": 17578 + }, + { + "epoch": 2.236229487342577, + "ewc_loss": 0.07591623812913895, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038806864176876843, + "grad_norm": 8.79879093170166, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8561455011367798, + "num_tokens": 670724397.0, + "step": 17579 + }, + { + "epoch": 2.2363566976211677, + "ewc_loss": 0.07612025737762451, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039010882028378546, + "grad_norm": 8.81350326538086, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8758417367935181, + "num_tokens": 670761282.0, + "step": 17580 + }, + { + "epoch": 2.2364839078997583, + "ewc_loss": 0.0755510926246643, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038685862091369927, + "grad_norm": 8.74382495880127, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8689907193183899, + "num_tokens": 670800007.0, + "step": 17581 + }, + { + "epoch": 2.236611118178349, + "ewc_loss": 0.07624663412570953, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039137256680987775, + "grad_norm": 8.846587181091309, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8612945079803467, + "num_tokens": 670840040.0, + "step": 17582 + }, + { + "epoch": 2.2367383284569393, + "ewc_loss": 0.0757598876953125, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038650515489280224, + "grad_norm": 8.74144458770752, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8577000498771667, + "num_tokens": 670881193.0, + "step": 17583 + }, + { + "epoch": 2.23686553873553, + "ewc_loss": 0.07600116729736328, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003913593536708504, + "grad_norm": 8.824711799621582, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8687412738800049, + "num_tokens": 670927275.0, + "step": 17584 + }, + { + "epoch": 2.2369927490141204, + "ewc_loss": 0.07577072083950043, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038661350845359266, + "grad_norm": 8.725081443786621, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8675853610038757, + "num_tokens": 670967327.0, + "step": 17585 + }, + { + "epoch": 2.237119959292711, + "ewc_loss": 0.07599450647830963, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039129273500293493, + "grad_norm": 8.857667922973633, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8677811622619629, + "num_tokens": 671009250.0, + "step": 17586 + }, + { + "epoch": 2.2372471695713014, + "ewc_loss": 0.07558806240558624, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003872282395604998, + "grad_norm": 8.775406837463379, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8671355247497559, + "num_tokens": 671045068.0, + "step": 17587 + }, + { + "epoch": 2.237374379849892, + "ewc_loss": 0.07581701129674911, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003895177796948701, + "grad_norm": 8.785913467407227, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8527206778526306, + "num_tokens": 671084786.0, + "step": 17588 + }, + { + "epoch": 2.2375015901284825, + "ewc_loss": 0.07569484412670135, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003882960882037878, + "grad_norm": 8.768403053283691, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8575297594070435, + "num_tokens": 671123235.0, + "step": 17589 + }, + { + "epoch": 2.237628800407073, + "ewc_loss": 0.07587941735982895, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003901418240275234, + "grad_norm": 8.813674926757812, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8797582387924194, + "num_tokens": 671161702.0, + "step": 17590 + }, + { + "epoch": 2.2377560106856635, + "ewc_loss": 0.0757339745759964, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038868741830810905, + "grad_norm": 8.780277252197266, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8763153553009033, + "num_tokens": 671195487.0, + "step": 17591 + }, + { + "epoch": 2.237883220964254, + "ewc_loss": 0.07621866464614868, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039109293720684946, + "grad_norm": 8.905229568481445, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8629928827285767, + "num_tokens": 671232288.0, + "step": 17592 + }, + { + "epoch": 2.2380104312428446, + "ewc_loss": 0.07548069208860397, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003861545701511204, + "grad_norm": 8.683521270751953, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8479923009872437, + "num_tokens": 671275154.0, + "step": 17593 + }, + { + "epoch": 2.238137641521435, + "ewc_loss": 0.0761907622218132, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003932552644982934, + "grad_norm": 8.860562324523926, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8544375896453857, + "num_tokens": 671312266.0, + "step": 17594 + }, + { + "epoch": 2.238264851800025, + "ewc_loss": 0.07546970248222351, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038604464498348534, + "grad_norm": 8.661924362182617, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8639369010925293, + "num_tokens": 671349690.0, + "step": 17595 + }, + { + "epoch": 2.238392062078616, + "ewc_loss": 0.07638105750083923, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003951582475565374, + "grad_norm": 8.882075309753418, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8669290542602539, + "num_tokens": 671387898.0, + "step": 17596 + }, + { + "epoch": 2.2385192723572063, + "ewc_loss": 0.07531218230724335, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000384469487471506, + "grad_norm": 8.675130844116211, + "learning_rate": 1e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8419716358184814, + "num_tokens": 671428054.0, + "step": 17597 + }, + { + "epoch": 2.238646482635797, + "ewc_loss": 0.07645341753959656, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003958818269893527, + "grad_norm": 8.879557609558105, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.870525598526001, + "num_tokens": 671467096.0, + "step": 17598 + }, + { + "epoch": 2.2387736929143873, + "ewc_loss": 0.07554732263088226, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038682084414176643, + "grad_norm": 8.68045711517334, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8645046949386597, + "num_tokens": 671506752.0, + "step": 17599 + }, + { + "epoch": 2.238900903192978, + "ewc_loss": 0.07626096904277802, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039395736530423164, + "grad_norm": 8.841643333435059, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8568702936172485, + "num_tokens": 671542403.0, + "step": 17600 + }, + { + "epoch": 2.2390281134715684, + "ewc_loss": 0.0757284089922905, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003886317426804453, + "grad_norm": 8.696334838867188, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8753387928009033, + "num_tokens": 671578115.0, + "step": 17601 + }, + { + "epoch": 2.239155323750159, + "ewc_loss": 0.07619626820087433, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000393310358049348, + "grad_norm": 8.837684631347656, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8764988780021667, + "num_tokens": 671619854.0, + "step": 17602 + }, + { + "epoch": 2.2392825340287494, + "ewc_loss": 0.07606109976768494, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038951722672209144, + "grad_norm": 8.709306716918945, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8776934742927551, + "num_tokens": 671661617.0, + "step": 17603 + }, + { + "epoch": 2.23940974430734, + "ewc_loss": 0.07632559537887573, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003946036158595234, + "grad_norm": 8.862527847290039, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8527798652648926, + "num_tokens": 671702231.0, + "step": 17604 + }, + { + "epoch": 2.2395369545859305, + "ewc_loss": 0.0755905956029892, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003872536472044885, + "grad_norm": 8.703194618225098, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8646238446235657, + "num_tokens": 671741483.0, + "step": 17605 + }, + { + "epoch": 2.239664164864521, + "ewc_loss": 0.07622885704040527, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039363623363897204, + "grad_norm": 8.854776382446289, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8584400415420532, + "num_tokens": 671780168.0, + "step": 17606 + }, + { + "epoch": 2.2397913751431116, + "ewc_loss": 0.07567112892866135, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038805895019322634, + "grad_norm": 8.710729598999023, + "learning_rate": 1e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8413495421409607, + "num_tokens": 671818980.0, + "step": 17607 + }, + { + "epoch": 2.239918585421702, + "ewc_loss": 0.07612344622612, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003925821220036596, + "grad_norm": 8.842144966125488, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8633416295051575, + "num_tokens": 671850010.0, + "step": 17608 + }, + { + "epoch": 2.2400457957002926, + "ewc_loss": 0.07602483779191971, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000389154622098431, + "grad_norm": 8.768377304077148, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8567430377006531, + "num_tokens": 671888506.0, + "step": 17609 + }, + { + "epoch": 2.240173005978883, + "ewc_loss": 0.07623608410358429, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003912670654244721, + "grad_norm": 8.778382301330566, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8731527328491211, + "num_tokens": 671930965.0, + "step": 17610 + }, + { + "epoch": 2.2403002162574737, + "ewc_loss": 0.07572885602712631, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038863622467033565, + "grad_norm": 8.717090606689453, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8726382255554199, + "num_tokens": 671967472.0, + "step": 17611 + }, + { + "epoch": 2.240427426536064, + "ewc_loss": 0.07612092792987823, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003925568889826536, + "grad_norm": 8.84676742553711, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8871370553970337, + "num_tokens": 672005986.0, + "step": 17612 + }, + { + "epoch": 2.2405546368146547, + "ewc_loss": 0.07556471228599548, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003869947977364063, + "grad_norm": 8.664836883544922, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8636640310287476, + "num_tokens": 672044729.0, + "step": 17613 + }, + { + "epoch": 2.2406818470932452, + "ewc_loss": 0.07649995386600494, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003939057933166623, + "grad_norm": 8.854350090026855, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8644788861274719, + "num_tokens": 672080141.0, + "step": 17614 + }, + { + "epoch": 2.2408090573718358, + "ewc_loss": 0.0755687728524208, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000387035368476063, + "grad_norm": 8.725584030151367, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.872154176235199, + "num_tokens": 672119277.0, + "step": 17615 + }, + { + "epoch": 2.2409362676504263, + "ewc_loss": 0.07601422071456909, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003914899134542793, + "grad_norm": 8.794122695922852, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8617368936538696, + "num_tokens": 672154205.0, + "step": 17616 + }, + { + "epoch": 2.241063477929017, + "ewc_loss": 0.07576292753219604, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003889769141096622, + "grad_norm": 8.792682647705078, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8642889857292175, + "num_tokens": 672193667.0, + "step": 17617 + }, + { + "epoch": 2.2411906882076074, + "ewc_loss": 0.0757204070687294, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038855173625051975, + "grad_norm": 8.787797927856445, + "learning_rate": 1e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.834393322467804, + "num_tokens": 672232360.0, + "step": 17618 + }, + { + "epoch": 2.241317898486198, + "ewc_loss": 0.0758344978094101, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038969263550825417, + "grad_norm": 8.790553092956543, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.851388156414032, + "num_tokens": 672264127.0, + "step": 17619 + }, + { + "epoch": 2.241445108764788, + "ewc_loss": 0.07576719671487808, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003890196094289422, + "grad_norm": 8.784799575805664, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8724068403244019, + "num_tokens": 672300719.0, + "step": 17620 + }, + { + "epoch": 2.2415723190433785, + "ewc_loss": 0.07583264261484146, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003896740672644228, + "grad_norm": 8.755839347839355, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.875340461730957, + "num_tokens": 672342796.0, + "step": 17621 + }, + { + "epoch": 2.241699529321969, + "ewc_loss": 0.07571622729301453, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038850988494232297, + "grad_norm": 8.801411628723145, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8664900064468384, + "num_tokens": 672382597.0, + "step": 17622 + }, + { + "epoch": 2.2418267396005596, + "ewc_loss": 0.0757269412279129, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003886171034537256, + "grad_norm": 8.93620491027832, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8517748713493347, + "num_tokens": 672427350.0, + "step": 17623 + }, + { + "epoch": 2.24195394987915, + "ewc_loss": 0.07553395628929138, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038668717024847865, + "grad_norm": 8.72813606262207, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8565140962600708, + "num_tokens": 672468997.0, + "step": 17624 + }, + { + "epoch": 2.2420811601577406, + "ewc_loss": 0.07601867616176605, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039153447141870856, + "grad_norm": 8.847304344177246, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8697083592414856, + "num_tokens": 672505061.0, + "step": 17625 + }, + { + "epoch": 2.242208370436331, + "ewc_loss": 0.07546990364789963, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038604671135544777, + "grad_norm": 8.708873748779297, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8640840649604797, + "num_tokens": 672544241.0, + "step": 17626 + }, + { + "epoch": 2.2423355807149217, + "ewc_loss": 0.07612010091543198, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039254865259863436, + "grad_norm": 8.853732109069824, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.864160418510437, + "num_tokens": 672586334.0, + "step": 17627 + }, + { + "epoch": 2.242462790993512, + "ewc_loss": 0.07539787888526917, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000385326478863135, + "grad_norm": 8.689743995666504, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8600686192512512, + "num_tokens": 672628627.0, + "step": 17628 + }, + { + "epoch": 2.2425900012721027, + "ewc_loss": 0.07623963057994843, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039374400512315333, + "grad_norm": 8.876665115356445, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.854392945766449, + "num_tokens": 672663046.0, + "step": 17629 + }, + { + "epoch": 2.2427172115506933, + "ewc_loss": 0.07551677525043488, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003865154576487839, + "grad_norm": 8.653339385986328, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8704748153686523, + "num_tokens": 672703394.0, + "step": 17630 + }, + { + "epoch": 2.242844421829284, + "ewc_loss": 0.07659201323986053, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003948263474740088, + "grad_norm": 8.894865989685059, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8549492359161377, + "num_tokens": 672743262.0, + "step": 17631 + }, + { + "epoch": 2.2429716321078743, + "ewc_loss": 0.07556073367595673, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038695501280017197, + "grad_norm": 8.73764705657959, + "learning_rate": 1e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8291170597076416, + "num_tokens": 672782662.0, + "step": 17632 + }, + { + "epoch": 2.243098842386465, + "ewc_loss": 0.07628733664751053, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003942210169043392, + "grad_norm": 8.859702110290527, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.861640214920044, + "num_tokens": 672818400.0, + "step": 17633 + }, + { + "epoch": 2.2432260526650554, + "ewc_loss": 0.07557238638401031, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038707154453732073, + "grad_norm": 8.727863311767578, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8764564990997314, + "num_tokens": 672858843.0, + "step": 17634 + }, + { + "epoch": 2.243353262943646, + "ewc_loss": 0.07623356580734253, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003936832654289901, + "grad_norm": 8.894048690795898, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.866905689239502, + "num_tokens": 672900397.0, + "step": 17635 + }, + { + "epoch": 2.2434804732222364, + "ewc_loss": 0.07560722529888153, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038741991738788784, + "grad_norm": 8.745379447937012, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.862581729888916, + "num_tokens": 672936025.0, + "step": 17636 + }, + { + "epoch": 2.243607683500827, + "ewc_loss": 0.07595252990722656, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003908729995600879, + "grad_norm": 8.853280067443848, + "learning_rate": 1e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.844779908657074, + "num_tokens": 672979180.0, + "step": 17637 + }, + { + "epoch": 2.2437348937794175, + "ewc_loss": 0.07576079666614532, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003889556392095983, + "grad_norm": 8.871110916137695, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8750922679901123, + "num_tokens": 673018053.0, + "step": 17638 + }, + { + "epoch": 2.243862104058008, + "ewc_loss": 0.07566165924072266, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003879642754327506, + "grad_norm": 8.751669883728027, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8436276912689209, + "num_tokens": 673057596.0, + "step": 17639 + }, + { + "epoch": 2.2439893143365985, + "ewc_loss": 0.07582034170627594, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003895510744769126, + "grad_norm": 8.825919151306152, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8756088614463806, + "num_tokens": 673097740.0, + "step": 17640 + }, + { + "epoch": 2.244116524615189, + "ewc_loss": 0.07562512159347534, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038759884773753583, + "grad_norm": 8.682276725769043, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8691365122795105, + "num_tokens": 673141821.0, + "step": 17641 + }, + { + "epoch": 2.2442437348937796, + "ewc_loss": 0.07605922967195511, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003919399459846318, + "grad_norm": 8.873193740844727, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.873102068901062, + "num_tokens": 673184279.0, + "step": 17642 + }, + { + "epoch": 2.24437094517237, + "ewc_loss": 0.07535696774721146, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003849173372145742, + "grad_norm": 8.697299003601074, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.873005211353302, + "num_tokens": 673224328.0, + "step": 17643 + }, + { + "epoch": 2.2444981554509607, + "ewc_loss": 0.0762348398566246, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039369604201056063, + "grad_norm": 8.893316268920898, + "learning_rate": 1e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8504087924957275, + "num_tokens": 673263328.0, + "step": 17644 + }, + { + "epoch": 2.2446253657295507, + "ewc_loss": 0.07549905776977539, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038633827352896333, + "grad_norm": 8.816378593444824, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8684359788894653, + "num_tokens": 673301663.0, + "step": 17645 + }, + { + "epoch": 2.2447525760081413, + "ewc_loss": 0.07593744248151779, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039072209619916975, + "grad_norm": 8.808446884155273, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.861638605594635, + "num_tokens": 673340606.0, + "step": 17646 + }, + { + "epoch": 2.244879786286732, + "ewc_loss": 0.07588252425193787, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003901729069184512, + "grad_norm": 8.788963317871094, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8603227734565735, + "num_tokens": 673382201.0, + "step": 17647 + }, + { + "epoch": 2.2450069965653223, + "ewc_loss": 0.0758785754442215, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003901334130205214, + "grad_norm": 8.79238510131836, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8742926120758057, + "num_tokens": 673423330.0, + "step": 17648 + }, + { + "epoch": 2.245134206843913, + "ewc_loss": 0.0758175402879715, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003895231056958437, + "grad_norm": 8.832246780395508, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8800780177116394, + "num_tokens": 673459690.0, + "step": 17649 + }, + { + "epoch": 2.2452614171225034, + "ewc_loss": 0.07572227716445923, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000388570420909673, + "grad_norm": 8.797520637512207, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8608381152153015, + "num_tokens": 673494827.0, + "step": 17650 + }, + { + "epoch": 2.245388627401094, + "ewc_loss": 0.07590419054031372, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003903895558323711, + "grad_norm": 8.850105285644531, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8529490232467651, + "num_tokens": 673540568.0, + "step": 17651 + }, + { + "epoch": 2.2455158376796844, + "ewc_loss": 0.07569591701030731, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003883068566210568, + "grad_norm": 8.743017196655273, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8798139691352844, + "num_tokens": 673574022.0, + "step": 17652 + }, + { + "epoch": 2.245643047958275, + "ewc_loss": 0.07607641816139221, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003921118041034788, + "grad_norm": 8.854636192321777, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8602908849716187, + "num_tokens": 673612493.0, + "step": 17653 + }, + { + "epoch": 2.2457702582368655, + "ewc_loss": 0.0755818784236908, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003871664812322706, + "grad_norm": 8.718106269836426, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8607953786849976, + "num_tokens": 673653729.0, + "step": 17654 + }, + { + "epoch": 2.245897468515456, + "ewc_loss": 0.0761052668094635, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003924003685824573, + "grad_norm": 8.834859848022461, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8755784034729004, + "num_tokens": 673695626.0, + "step": 17655 + }, + { + "epoch": 2.2460246787940465, + "ewc_loss": 0.07589288055896759, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003878350544255227, + "grad_norm": 8.7557373046875, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8704124689102173, + "num_tokens": 673732188.0, + "step": 17656 + }, + { + "epoch": 2.246151889072637, + "ewc_loss": 0.0762864500284195, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039421216933988035, + "grad_norm": 8.86662483215332, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.856047511100769, + "num_tokens": 673770259.0, + "step": 17657 + }, + { + "epoch": 2.2462790993512276, + "ewc_loss": 0.07579731941223145, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003893208922818303, + "grad_norm": 8.797921180725098, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8609320521354675, + "num_tokens": 673805018.0, + "step": 17658 + }, + { + "epoch": 2.246406309629818, + "ewc_loss": 0.07599945366382599, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039134224061854184, + "grad_norm": 8.786709785461426, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8567385077476501, + "num_tokens": 673846058.0, + "step": 17659 + }, + { + "epoch": 2.2465335199084087, + "ewc_loss": 0.0760761946439743, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003921095922123641, + "grad_norm": 8.814901351928711, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8516906499862671, + "num_tokens": 673886437.0, + "step": 17660 + }, + { + "epoch": 2.246660730186999, + "ewc_loss": 0.07580053806304932, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038935302291065454, + "grad_norm": 8.78012466430664, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.86705482006073, + "num_tokens": 673925701.0, + "step": 17661 + }, + { + "epoch": 2.2467879404655897, + "ewc_loss": 0.07609959691762924, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003923436161130667, + "grad_norm": 8.795645713806152, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8677352666854858, + "num_tokens": 673969854.0, + "step": 17662 + }, + { + "epoch": 2.2469151507441802, + "ewc_loss": 0.07592548429965973, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003906024503521621, + "grad_norm": 8.811556816101074, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.865729570388794, + "num_tokens": 674005048.0, + "step": 17663 + }, + { + "epoch": 2.2470423610227708, + "ewc_loss": 0.07586159557104111, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003899636212736368, + "grad_norm": 8.732595443725586, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8587849140167236, + "num_tokens": 674046548.0, + "step": 17664 + }, + { + "epoch": 2.2471695713013613, + "ewc_loss": 0.07608199864625931, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039216765435412526, + "grad_norm": 8.829612731933594, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8584325909614563, + "num_tokens": 674086905.0, + "step": 17665 + }, + { + "epoch": 2.247296781579952, + "ewc_loss": 0.07573771476745605, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038872481673024595, + "grad_norm": 8.746053695678711, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8741283416748047, + "num_tokens": 674124589.0, + "step": 17666 + }, + { + "epoch": 2.2474239918585424, + "ewc_loss": 0.07620849460363388, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003934325941372663, + "grad_norm": 8.80506706237793, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8762109279632568, + "num_tokens": 674165135.0, + "step": 17667 + }, + { + "epoch": 2.247551202137133, + "ewc_loss": 0.07592939585447311, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003906416241079569, + "grad_norm": 8.728981971740723, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8595902919769287, + "num_tokens": 674207269.0, + "step": 17668 + }, + { + "epoch": 2.2476784124157234, + "ewc_loss": 0.07616159319877625, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000392963585909456, + "grad_norm": 8.852006912231445, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.880595862865448, + "num_tokens": 674240806.0, + "step": 17669 + }, + { + "epoch": 2.2478056226943135, + "ewc_loss": 0.07592128962278366, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003905605699401349, + "grad_norm": 8.773197174072266, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8547918796539307, + "num_tokens": 674274697.0, + "step": 17670 + }, + { + "epoch": 2.247932832972904, + "ewc_loss": 0.0766129270195961, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039503551670350134, + "grad_norm": 8.9114408493042, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8565359115600586, + "num_tokens": 674311362.0, + "step": 17671 + }, + { + "epoch": 2.2480600432514946, + "ewc_loss": 0.07599292695522308, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039127690251916647, + "grad_norm": 8.819403648376465, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8702595233917236, + "num_tokens": 674352505.0, + "step": 17672 + }, + { + "epoch": 2.248187253530085, + "ewc_loss": 0.07629229128360748, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000394270580727607, + "grad_norm": 8.99041748046875, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8681825399398804, + "num_tokens": 674383938.0, + "step": 17673 + }, + { + "epoch": 2.2483144638086756, + "ewc_loss": 0.07583188265562057, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000389666500268504, + "grad_norm": 8.823668479919434, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8620722889900208, + "num_tokens": 674421665.0, + "step": 17674 + }, + { + "epoch": 2.248441674087266, + "ewc_loss": 0.07618191093206406, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039316675974987447, + "grad_norm": 8.859753608703613, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8669149875640869, + "num_tokens": 674466065.0, + "step": 17675 + }, + { + "epoch": 2.2485688843658567, + "ewc_loss": 0.07567839324474335, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038813153514638543, + "grad_norm": 8.743436813354492, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8654515147209167, + "num_tokens": 674506868.0, + "step": 17676 + }, + { + "epoch": 2.248696094644447, + "ewc_loss": 0.07632626593112946, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003946103388443589, + "grad_norm": 8.878150939941406, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8582795858383179, + "num_tokens": 674545756.0, + "step": 17677 + }, + { + "epoch": 2.2488233049230377, + "ewc_loss": 0.07568950206041336, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003882426826748997, + "grad_norm": 8.713457107543945, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8596990704536438, + "num_tokens": 674590680.0, + "step": 17678 + }, + { + "epoch": 2.2489505152016283, + "ewc_loss": 0.07646848261356354, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039603252662345767, + "grad_norm": 8.946883201599121, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8699555993080139, + "num_tokens": 674631624.0, + "step": 17679 + }, + { + "epoch": 2.249077725480219, + "ewc_loss": 0.07574661076068878, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038881372893229127, + "grad_norm": 8.789846420288086, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8586167097091675, + "num_tokens": 674667365.0, + "step": 17680 + }, + { + "epoch": 2.2492049357588093, + "ewc_loss": 0.07638759911060333, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003952236147597432, + "grad_norm": 8.928399085998535, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8630450963973999, + "num_tokens": 674706587.0, + "step": 17681 + }, + { + "epoch": 2.2493321460374, + "ewc_loss": 0.07578165084123611, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003891641681548208, + "grad_norm": 8.799347877502441, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8624709844589233, + "num_tokens": 674744480.0, + "step": 17682 + }, + { + "epoch": 2.2494593563159904, + "ewc_loss": 0.07631170004606247, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039446467417292297, + "grad_norm": 8.87510871887207, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8641067743301392, + "num_tokens": 674783104.0, + "step": 17683 + }, + { + "epoch": 2.249586566594581, + "ewc_loss": 0.07589297741651535, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003902774187736213, + "grad_norm": 8.794071197509766, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8564729690551758, + "num_tokens": 674823488.0, + "step": 17684 + }, + { + "epoch": 2.2497137768731714, + "ewc_loss": 0.07623685896396637, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003937162400688976, + "grad_norm": 8.995830535888672, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.869428277015686, + "num_tokens": 674857322.0, + "step": 17685 + }, + { + "epoch": 2.249840987151762, + "ewc_loss": 0.07563348114490509, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038768252125009894, + "grad_norm": 8.699090003967285, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8539534211158752, + "num_tokens": 674901566.0, + "step": 17686 + }, + { + "epoch": 2.2499681974303525, + "ewc_loss": 0.07682961225509644, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003972023550886661, + "grad_norm": 8.941973686218262, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8691607713699341, + "num_tokens": 674942803.0, + "step": 17687 + }, + { + "epoch": 2.250095407708943, + "ewc_loss": 0.07601018249988556, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003890080261044204, + "grad_norm": 8.860716819763184, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8549368381500244, + "num_tokens": 674983160.0, + "step": 17688 + }, + { + "epoch": 2.2502226179875335, + "ewc_loss": 0.07623498141765594, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039369743899442255, + "grad_norm": 8.831621170043945, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8640949726104736, + "num_tokens": 675021243.0, + "step": 17689 + }, + { + "epoch": 2.250349828266124, + "ewc_loss": 0.07625207304954529, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039142693276517093, + "grad_norm": 8.860284805297852, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8612020015716553, + "num_tokens": 675059435.0, + "step": 17690 + }, + { + "epoch": 2.2504770385447146, + "ewc_loss": 0.07623212784528732, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003912275133188814, + "grad_norm": 8.870719909667969, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8678765892982483, + "num_tokens": 675091689.0, + "step": 17691 + }, + { + "epoch": 2.250604248823305, + "ewc_loss": 0.07609019428491592, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039224958163686097, + "grad_norm": 8.803752899169922, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.858609139919281, + "num_tokens": 675130368.0, + "step": 17692 + }, + { + "epoch": 2.250731459101895, + "ewc_loss": 0.07637202739715576, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039262656355276704, + "grad_norm": 8.813117980957031, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8677907586097717, + "num_tokens": 675171745.0, + "step": 17693 + }, + { + "epoch": 2.250858669380486, + "ewc_loss": 0.07603680342435837, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003917156718671322, + "grad_norm": 8.837888717651367, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8629540801048279, + "num_tokens": 675206052.0, + "step": 17694 + }, + { + "epoch": 2.2509858796590763, + "ewc_loss": 0.0763213187456131, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039211942930705845, + "grad_norm": 8.844792366027832, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.858251690864563, + "num_tokens": 675243231.0, + "step": 17695 + }, + { + "epoch": 2.251113089937667, + "ewc_loss": 0.07611007988452911, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003924484772142023, + "grad_norm": 8.901464462280273, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8566116094589233, + "num_tokens": 675273582.0, + "step": 17696 + }, + { + "epoch": 2.2512403002162573, + "ewc_loss": 0.07583872973918915, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003897349233739078, + "grad_norm": 8.753890991210938, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8609561920166016, + "num_tokens": 675314247.0, + "step": 17697 + }, + { + "epoch": 2.251367510494848, + "ewc_loss": 0.07647453248500824, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039365157135762274, + "grad_norm": 8.83569049835205, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8648347854614258, + "num_tokens": 675355675.0, + "step": 17698 + }, + { + "epoch": 2.2514947207734384, + "ewc_loss": 0.07581015676259995, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038944921107031405, + "grad_norm": 8.781394004821777, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8613263368606567, + "num_tokens": 675394290.0, + "step": 17699 + }, + { + "epoch": 2.251621931052029, + "ewc_loss": 0.07615361362695694, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039288378320634365, + "grad_norm": 8.832439422607422, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8667505979537964, + "num_tokens": 675434251.0, + "step": 17700 + }, + { + "epoch": 2.2517491413306194, + "ewc_loss": 0.0759701058268547, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003910487284883857, + "grad_norm": 8.72292709350586, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8701352477073669, + "num_tokens": 675474731.0, + "step": 17701 + }, + { + "epoch": 2.25187635160921, + "ewc_loss": 0.0763104110956192, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003944518102798611, + "grad_norm": 8.8140869140625, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8596038222312927, + "num_tokens": 675515180.0, + "step": 17702 + }, + { + "epoch": 2.2520035618878005, + "ewc_loss": 0.07592514157295227, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003905990452039987, + "grad_norm": 8.792000770568848, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8514267206192017, + "num_tokens": 675553429.0, + "step": 17703 + }, + { + "epoch": 2.252130772166391, + "ewc_loss": 0.0763331726193428, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003946793731302023, + "grad_norm": 8.858373641967773, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8671494126319885, + "num_tokens": 675592919.0, + "step": 17704 + }, + { + "epoch": 2.2522579824449815, + "ewc_loss": 0.07600213587284088, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039136901614256203, + "grad_norm": 8.823688507080078, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8720579147338867, + "num_tokens": 675627528.0, + "step": 17705 + }, + { + "epoch": 2.252385192723572, + "ewc_loss": 0.07623542845249176, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003937019209843129, + "grad_norm": 8.858824729919434, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8735335469245911, + "num_tokens": 675663721.0, + "step": 17706 + }, + { + "epoch": 2.2525124030021626, + "ewc_loss": 0.07604895532131195, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003918372094631195, + "grad_norm": 8.81017780303955, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.858063817024231, + "num_tokens": 675705187.0, + "step": 17707 + }, + { + "epoch": 2.252639613280753, + "ewc_loss": 0.07607264071702957, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003920740564353764, + "grad_norm": 8.830394744873047, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8747733235359192, + "num_tokens": 675752713.0, + "step": 17708 + }, + { + "epoch": 2.2527668235593437, + "ewc_loss": 0.0761677622795105, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039302531513385475, + "grad_norm": 8.879898071289062, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8640916347503662, + "num_tokens": 675791009.0, + "step": 17709 + }, + { + "epoch": 2.252894033837934, + "ewc_loss": 0.07587013393640518, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003900490119121969, + "grad_norm": 8.77358627319336, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8675591945648193, + "num_tokens": 675828890.0, + "step": 17710 + }, + { + "epoch": 2.2530212441165247, + "ewc_loss": 0.07615680992603302, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039291579741984606, + "grad_norm": 8.865532875061035, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8653508424758911, + "num_tokens": 675862855.0, + "step": 17711 + }, + { + "epoch": 2.2531484543951152, + "ewc_loss": 0.07601775228977203, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039152512908913195, + "grad_norm": 8.836872100830078, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8621107339859009, + "num_tokens": 675903032.0, + "step": 17712 + }, + { + "epoch": 2.2532756646737058, + "ewc_loss": 0.07612958550453186, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039264350198209286, + "grad_norm": 8.854372024536133, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8590413928031921, + "num_tokens": 675943035.0, + "step": 17713 + }, + { + "epoch": 2.2534028749522963, + "ewc_loss": 0.07580846548080444, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003894323599524796, + "grad_norm": 8.781689643859863, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8666664361953735, + "num_tokens": 675983073.0, + "step": 17714 + }, + { + "epoch": 2.253530085230887, + "ewc_loss": 0.07623539865016937, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039370160084217787, + "grad_norm": 8.844198226928711, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8702369928359985, + "num_tokens": 676022341.0, + "step": 17715 + }, + { + "epoch": 2.2536572955094774, + "ewc_loss": 0.07576341181993484, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038898177444934845, + "grad_norm": 8.728818893432617, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8616192936897278, + "num_tokens": 676060907.0, + "step": 17716 + }, + { + "epoch": 2.253784505788068, + "ewc_loss": 0.07638300955295563, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039517777622677386, + "grad_norm": 8.914388656616211, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8619188666343689, + "num_tokens": 676100693.0, + "step": 17717 + }, + { + "epoch": 2.253911716066658, + "ewc_loss": 0.07577896118164062, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003891372762154788, + "grad_norm": 8.742064476013184, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8789644241333008, + "num_tokens": 676137526.0, + "step": 17718 + }, + { + "epoch": 2.254038926345249, + "ewc_loss": 0.0764903724193573, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039625135832466185, + "grad_norm": 8.945063591003418, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8666760325431824, + "num_tokens": 676177245.0, + "step": 17719 + }, + { + "epoch": 2.254166136623839, + "ewc_loss": 0.07564816623926163, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003878293209709227, + "grad_norm": 8.719975471496582, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8704712390899658, + "num_tokens": 676214568.0, + "step": 17720 + }, + { + "epoch": 2.2542933469024296, + "ewc_loss": 0.07663150131702423, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039766269037500024, + "grad_norm": 8.91819953918457, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8652445077896118, + "num_tokens": 676252552.0, + "step": 17721 + }, + { + "epoch": 2.25442055718102, + "ewc_loss": 0.07597041130065918, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038861040957272053, + "grad_norm": 8.71442699432373, + "learning_rate": 1e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8457853198051453, + "num_tokens": 676290480.0, + "step": 17722 + }, + { + "epoch": 2.2545477674596106, + "ewc_loss": 0.07670523971319199, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039840006502345204, + "grad_norm": 8.996373176574707, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8762401938438416, + "num_tokens": 676324950.0, + "step": 17723 + }, + { + "epoch": 2.254674977738201, + "ewc_loss": 0.07573096454143524, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003886572958435863, + "grad_norm": 8.726972579956055, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8568142056465149, + "num_tokens": 676363851.0, + "step": 17724 + }, + { + "epoch": 2.2548021880167917, + "ewc_loss": 0.07672486454248428, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039859628304839134, + "grad_norm": 8.945137023925781, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8733103275299072, + "num_tokens": 676399920.0, + "step": 17725 + }, + { + "epoch": 2.254929398295382, + "ewc_loss": 0.07578781247138977, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038922575186006725, + "grad_norm": 8.690248489379883, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8616249561309814, + "num_tokens": 676443243.0, + "step": 17726 + }, + { + "epoch": 2.2550566085739727, + "ewc_loss": 0.07670244574546814, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003983721253462136, + "grad_norm": 8.976471900939941, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8503220677375793, + "num_tokens": 676480272.0, + "step": 17727 + }, + { + "epoch": 2.2551838188525632, + "ewc_loss": 0.07575017213821411, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038884932291693985, + "grad_norm": 8.747703552246094, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.860251784324646, + "num_tokens": 676516630.0, + "step": 17728 + }, + { + "epoch": 2.2553110291311538, + "ewc_loss": 0.07677623629570007, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003991100238636136, + "grad_norm": 8.933364868164062, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.868079662322998, + "num_tokens": 676557842.0, + "step": 17729 + }, + { + "epoch": 2.2554382394097443, + "ewc_loss": 0.07584648579359055, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038981251418590546, + "grad_norm": 8.695120811462402, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8740009665489197, + "num_tokens": 676599413.0, + "step": 17730 + }, + { + "epoch": 2.255565449688335, + "ewc_loss": 0.07682327926158905, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039958039997145534, + "grad_norm": 8.99271011352539, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8551419973373413, + "num_tokens": 676637791.0, + "step": 17731 + }, + { + "epoch": 2.2556926599669254, + "ewc_loss": 0.07579541206359863, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038930177106522024, + "grad_norm": 8.708783149719238, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8593182563781738, + "num_tokens": 676677108.0, + "step": 17732 + }, + { + "epoch": 2.255819870245516, + "ewc_loss": 0.07692281901836395, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00040057586738839746, + "grad_norm": 8.990989685058594, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.867667555809021, + "num_tokens": 676720698.0, + "step": 17733 + }, + { + "epoch": 2.2559470805241064, + "ewc_loss": 0.07581919431686401, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038953960756771266, + "grad_norm": 8.739706039428711, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8521220684051514, + "num_tokens": 676763719.0, + "step": 17734 + }, + { + "epoch": 2.256074290802697, + "ewc_loss": 0.07690100371837616, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00040035773417912424, + "grad_norm": 9.056136131286621, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.865082859992981, + "num_tokens": 676801083.0, + "step": 17735 + }, + { + "epoch": 2.2562015010812875, + "ewc_loss": 0.07575780153274536, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003889256331603974, + "grad_norm": 8.750454902648926, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8558102250099182, + "num_tokens": 676840861.0, + "step": 17736 + }, + { + "epoch": 2.256328711359878, + "ewc_loss": 0.07687995582818985, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004001472261734307, + "grad_norm": 8.984638214111328, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8764346241950989, + "num_tokens": 676876864.0, + "step": 17737 + }, + { + "epoch": 2.2564559216384685, + "ewc_loss": 0.0757230669260025, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038857830804772675, + "grad_norm": 8.772462844848633, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.883707582950592, + "num_tokens": 676920983.0, + "step": 17738 + }, + { + "epoch": 2.256583131917059, + "ewc_loss": 0.07689765095710754, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003978828026447445, + "grad_norm": 8.926526069641113, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8742274045944214, + "num_tokens": 676961294.0, + "step": 17739 + }, + { + "epoch": 2.2567103421956496, + "ewc_loss": 0.07592488825321198, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003905964840669185, + "grad_norm": 8.80121898651123, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8571041822433472, + "num_tokens": 676996318.0, + "step": 17740 + }, + { + "epoch": 2.2568375524742397, + "ewc_loss": 0.07632476836442947, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003945953503716737, + "grad_norm": 8.887009620666504, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8665058016777039, + "num_tokens": 677034083.0, + "step": 17741 + }, + { + "epoch": 2.2569647627528306, + "ewc_loss": 0.07602405548095703, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003915881970897317, + "grad_norm": 8.810197830200195, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.865315318107605, + "num_tokens": 677071206.0, + "step": 17742 + }, + { + "epoch": 2.2570919730314207, + "ewc_loss": 0.07631473243236542, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003944950003642589, + "grad_norm": 8.906294822692871, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8691245913505554, + "num_tokens": 677103573.0, + "step": 17743 + }, + { + "epoch": 2.2572191833100113, + "ewc_loss": 0.0759487897157669, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000390835601137951, + "grad_norm": 8.77759075164795, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.873440146446228, + "num_tokens": 677141406.0, + "step": 17744 + }, + { + "epoch": 2.257346393588602, + "ewc_loss": 0.07631611824035645, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003945087955798954, + "grad_norm": 8.846524238586426, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8540236949920654, + "num_tokens": 677185968.0, + "step": 17745 + }, + { + "epoch": 2.2574736038671923, + "ewc_loss": 0.07598094642162323, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000391157140256837, + "grad_norm": 8.806134223937988, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8566863536834717, + "num_tokens": 677229004.0, + "step": 17746 + }, + { + "epoch": 2.257600814145783, + "ewc_loss": 0.07629499584436417, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003942976181861013, + "grad_norm": 8.88326358795166, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8755546808242798, + "num_tokens": 677266317.0, + "step": 17747 + }, + { + "epoch": 2.2577280244243734, + "ewc_loss": 0.07620476931333542, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003909539373125881, + "grad_norm": 8.762619972229004, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8740842938423157, + "num_tokens": 677302344.0, + "step": 17748 + }, + { + "epoch": 2.257855234702964, + "ewc_loss": 0.07653670758008957, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003967147204093635, + "grad_norm": 8.937714576721191, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8558462858200073, + "num_tokens": 677342103.0, + "step": 17749 + }, + { + "epoch": 2.2579824449815544, + "ewc_loss": 0.07594108581542969, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003907585341949016, + "grad_norm": 8.804566383361816, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8509727120399475, + "num_tokens": 677378602.0, + "step": 17750 + }, + { + "epoch": 2.258109655260145, + "ewc_loss": 0.07667189091444016, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039806656423024833, + "grad_norm": 8.932570457458496, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.863359272480011, + "num_tokens": 677415005.0, + "step": 17751 + }, + { + "epoch": 2.2582368655387355, + "ewc_loss": 0.07599692791700363, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039131694938987494, + "grad_norm": 8.794746398925781, + "learning_rate": 1e-06, + "loss": 0.542, + "mean_token_accuracy": 0.845895528793335, + "num_tokens": 677457908.0, + "step": 17752 + }, + { + "epoch": 2.258364075817326, + "ewc_loss": 0.07654084265232086, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000396756106056273, + "grad_norm": 8.89450454711914, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8540560603141785, + "num_tokens": 677492759.0, + "step": 17753 + }, + { + "epoch": 2.2584912860959165, + "ewc_loss": 0.07602986693382263, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003916463756468147, + "grad_norm": 8.760358810424805, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8621326684951782, + "num_tokens": 677535771.0, + "step": 17754 + }, + { + "epoch": 2.258618496374507, + "ewc_loss": 0.07654689252376556, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003968165838159621, + "grad_norm": 8.877456665039062, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8772240877151489, + "num_tokens": 677575325.0, + "step": 17755 + }, + { + "epoch": 2.2587457066530976, + "ewc_loss": 0.07604604214429855, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003918080765288323, + "grad_norm": 8.830434799194336, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8651944398880005, + "num_tokens": 677614132.0, + "step": 17756 + }, + { + "epoch": 2.258872916931688, + "ewc_loss": 0.07633137702941895, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039466138696298003, + "grad_norm": 8.896580696105957, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8578824996948242, + "num_tokens": 677645657.0, + "step": 17757 + }, + { + "epoch": 2.2590001272102787, + "ewc_loss": 0.07613375782966614, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039268526597879827, + "grad_norm": 8.837553024291992, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8726716637611389, + "num_tokens": 677686238.0, + "step": 17758 + }, + { + "epoch": 2.259127337488869, + "ewc_loss": 0.07619144022464752, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039326201658695936, + "grad_norm": 8.854951858520508, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8674145340919495, + "num_tokens": 677724080.0, + "step": 17759 + }, + { + "epoch": 2.2592545477674597, + "ewc_loss": 0.07638417184352875, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003927479556296021, + "grad_norm": 15.901201248168945, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8633628487586975, + "num_tokens": 677765768.0, + "step": 17760 + }, + { + "epoch": 2.2593817580460502, + "ewc_loss": 0.08700427412986755, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0005013904301449656, + "grad_norm": 9.953700065612793, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.864041268825531, + "num_tokens": 677803937.0, + "step": 17761 + }, + { + "epoch": 2.2595089683246408, + "ewc_loss": 0.07690326869487762, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000397938973037526, + "grad_norm": 8.887052536010742, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8760513663291931, + "num_tokens": 677839685.0, + "step": 17762 + }, + { + "epoch": 2.2596361786032313, + "ewc_loss": 0.07792238891124725, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040813011582940817, + "grad_norm": 9.198080062866211, + "learning_rate": 1e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8499243259429932, + "num_tokens": 677880411.0, + "step": 17763 + }, + { + "epoch": 2.259763388881822, + "ewc_loss": 0.07779436558485031, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004092913295608014, + "grad_norm": 8.998777389526367, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8726567625999451, + "num_tokens": 677921177.0, + "step": 17764 + }, + { + "epoch": 2.2598905991604123, + "ewc_loss": 0.07714197039604187, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004027673858217895, + "grad_norm": 9.026955604553223, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8729898929595947, + "num_tokens": 677956393.0, + "step": 17765 + }, + { + "epoch": 2.2600178094390024, + "ewc_loss": 0.076811783015728, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039946549804881215, + "grad_norm": 8.904563903808594, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8638534545898438, + "num_tokens": 677988364.0, + "step": 17766 + }, + { + "epoch": 2.2601450197175934, + "ewc_loss": 0.07708453387022018, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0004021929926238954, + "grad_norm": 9.009892463684082, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8716869354248047, + "num_tokens": 678028093.0, + "step": 17767 + }, + { + "epoch": 2.2602722299961835, + "ewc_loss": 0.07671816647052765, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039852934423834085, + "grad_norm": 8.882540702819824, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8654778599739075, + "num_tokens": 678068174.0, + "step": 17768 + }, + { + "epoch": 2.260399440274774, + "ewc_loss": 0.07668428122997284, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039819045923650265, + "grad_norm": 8.867537498474121, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8673169612884521, + "num_tokens": 678105784.0, + "step": 17769 + }, + { + "epoch": 2.2605266505533645, + "ewc_loss": 0.07659520208835602, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039729970740154386, + "grad_norm": 8.855182647705078, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8755160570144653, + "num_tokens": 678142873.0, + "step": 17770 + }, + { + "epoch": 2.260653860831955, + "ewc_loss": 0.07656550407409668, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039700267370790243, + "grad_norm": 8.901602745056152, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8611806035041809, + "num_tokens": 678181527.0, + "step": 17771 + }, + { + "epoch": 2.2607810711105456, + "ewc_loss": 0.07653026282787323, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039665025542490184, + "grad_norm": 8.828259468078613, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8658575415611267, + "num_tokens": 678221225.0, + "step": 17772 + }, + { + "epoch": 2.260908281389136, + "ewc_loss": 0.07668688893318176, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003982165944762528, + "grad_norm": 8.949575424194336, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.873192310333252, + "num_tokens": 678254767.0, + "step": 17773 + }, + { + "epoch": 2.2610354916677267, + "ewc_loss": 0.07617025822401047, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039305025711655617, + "grad_norm": 8.834047317504883, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.866100549697876, + "num_tokens": 678290830.0, + "step": 17774 + }, + { + "epoch": 2.261162701946317, + "ewc_loss": 0.07652994990348816, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003966471122112125, + "grad_norm": 8.836503982543945, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8733452558517456, + "num_tokens": 678327102.0, + "step": 17775 + }, + { + "epoch": 2.2612899122249077, + "ewc_loss": 0.0762920156121254, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003942678158637136, + "grad_norm": 8.969115257263184, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8771787285804749, + "num_tokens": 678364872.0, + "step": 17776 + }, + { + "epoch": 2.2614171225034982, + "ewc_loss": 0.07603858411312103, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003917335416190326, + "grad_norm": 8.820234298706055, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8577664494514465, + "num_tokens": 678399233.0, + "step": 17777 + }, + { + "epoch": 2.2615443327820888, + "ewc_loss": 0.07672886550426483, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039863635902293026, + "grad_norm": 8.960317611694336, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8733251094818115, + "num_tokens": 678434577.0, + "step": 17778 + }, + { + "epoch": 2.2616715430606793, + "ewc_loss": 0.07579991966485977, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003893468528985977, + "grad_norm": 8.811956405639648, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8707798719406128, + "num_tokens": 678466835.0, + "step": 17779 + }, + { + "epoch": 2.26179875333927, + "ewc_loss": 0.07652851939201355, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039663282223045826, + "grad_norm": 8.867300987243652, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.880683183670044, + "num_tokens": 678506817.0, + "step": 17780 + }, + { + "epoch": 2.2619259636178604, + "ewc_loss": 0.07588987052440643, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039024639409035444, + "grad_norm": 8.765810012817383, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8653622269630432, + "num_tokens": 678543734.0, + "step": 17781 + }, + { + "epoch": 2.262053173896451, + "ewc_loss": 0.07639047503471375, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039525236934423447, + "grad_norm": 8.868237495422363, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8601956963539124, + "num_tokens": 678585066.0, + "step": 17782 + }, + { + "epoch": 2.2621803841750414, + "ewc_loss": 0.07603812217712402, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039172888500615954, + "grad_norm": 8.869081497192383, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8796549439430237, + "num_tokens": 678614951.0, + "step": 17783 + }, + { + "epoch": 2.262307594453632, + "ewc_loss": 0.07617570459842682, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003931046521756798, + "grad_norm": 8.842854499816895, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8673256635665894, + "num_tokens": 678649671.0, + "step": 17784 + }, + { + "epoch": 2.2624348047322225, + "ewc_loss": 0.07610705494880676, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003924181801266968, + "grad_norm": 8.775763511657715, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8821671605110168, + "num_tokens": 678693418.0, + "step": 17785 + }, + { + "epoch": 2.262562015010813, + "ewc_loss": 0.0761733204126358, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003930808452423662, + "grad_norm": 8.822803497314453, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8747873306274414, + "num_tokens": 678730891.0, + "step": 17786 + }, + { + "epoch": 2.2626892252894035, + "ewc_loss": 0.07622550427913666, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003936027060262859, + "grad_norm": 8.820377349853516, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8732452392578125, + "num_tokens": 678766369.0, + "step": 17787 + }, + { + "epoch": 2.262816435567994, + "ewc_loss": 0.07629072666168213, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039425495197065175, + "grad_norm": 8.832832336425781, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8554677963256836, + "num_tokens": 678809118.0, + "step": 17788 + }, + { + "epoch": 2.2629436458465846, + "ewc_loss": 0.07618328928947449, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039318055496551096, + "grad_norm": 8.844054222106934, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8636652827262878, + "num_tokens": 678850057.0, + "step": 17789 + }, + { + "epoch": 2.263070856125175, + "ewc_loss": 0.07620955258607864, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039344318793155253, + "grad_norm": 8.834894180297852, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8651967644691467, + "num_tokens": 678890253.0, + "step": 17790 + }, + { + "epoch": 2.263198066403765, + "ewc_loss": 0.0762055516242981, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003934031701646745, + "grad_norm": 8.837077140808105, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.879569411277771, + "num_tokens": 678922535.0, + "step": 17791 + }, + { + "epoch": 2.263325276682356, + "ewc_loss": 0.07616958022117615, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039304347592405975, + "grad_norm": 8.828889846801758, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8639765977859497, + "num_tokens": 678956136.0, + "step": 17792 + }, + { + "epoch": 2.2634524869609463, + "ewc_loss": 0.07629454880952835, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000394293136196211, + "grad_norm": 8.923359870910645, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8677893280982971, + "num_tokens": 678990864.0, + "step": 17793 + }, + { + "epoch": 2.263579697239537, + "ewc_loss": 0.07597823441028595, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039113001548685133, + "grad_norm": 8.776466369628906, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.877790629863739, + "num_tokens": 679028345.0, + "step": 17794 + }, + { + "epoch": 2.2637069075181273, + "ewc_loss": 0.07628194987773895, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003941671457141638, + "grad_norm": 8.917047500610352, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.851959764957428, + "num_tokens": 679073679.0, + "step": 17795 + }, + { + "epoch": 2.263834117796718, + "ewc_loss": 0.0758056789636612, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003894043911714107, + "grad_norm": 8.778961181640625, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8641060590744019, + "num_tokens": 679109274.0, + "step": 17796 + }, + { + "epoch": 2.2639613280753084, + "ewc_loss": 0.07630032300949097, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003943508490920067, + "grad_norm": 8.828990936279297, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8664714694023132, + "num_tokens": 679141644.0, + "step": 17797 + }, + { + "epoch": 2.264088538353899, + "ewc_loss": 0.07605484873056412, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003918961447197944, + "grad_norm": 8.820646286010742, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8494121432304382, + "num_tokens": 679184351.0, + "step": 17798 + }, + { + "epoch": 2.2642157486324894, + "ewc_loss": 0.07620112597942352, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039335887413471937, + "grad_norm": 8.867400169372559, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8623719215393066, + "num_tokens": 679222509.0, + "step": 17799 + }, + { + "epoch": 2.26434295891108, + "ewc_loss": 0.07610787451267242, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003924264165107161, + "grad_norm": 8.873616218566895, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8486731052398682, + "num_tokens": 679263362.0, + "step": 17800 + }, + { + "epoch": 2.2644701691896705, + "ewc_loss": 0.07607665657997131, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039211424882523715, + "grad_norm": 8.828346252441406, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8520985841751099, + "num_tokens": 679307859.0, + "step": 17801 + }, + { + "epoch": 2.264597379468261, + "ewc_loss": 0.07621407508850098, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039348838618025184, + "grad_norm": 8.860443115234375, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8747084140777588, + "num_tokens": 679344801.0, + "step": 17802 + }, + { + "epoch": 2.2647245897468515, + "ewc_loss": 0.07621626555919647, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003935102722607553, + "grad_norm": 8.936408996582031, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8572949171066284, + "num_tokens": 679382795.0, + "step": 17803 + }, + { + "epoch": 2.264851800025442, + "ewc_loss": 0.07607100903987885, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003920577000826597, + "grad_norm": 8.854690551757812, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8630342483520508, + "num_tokens": 679425282.0, + "step": 17804 + }, + { + "epoch": 2.2649790103040326, + "ewc_loss": 0.0762404128909111, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003937517758458853, + "grad_norm": 8.93591022491455, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8519657850265503, + "num_tokens": 679466881.0, + "step": 17805 + }, + { + "epoch": 2.265106220582623, + "ewc_loss": 0.07575513422489166, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003888989449478686, + "grad_norm": 8.668070793151855, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8773516416549683, + "num_tokens": 679509268.0, + "step": 17806 + }, + { + "epoch": 2.2652334308612136, + "ewc_loss": 0.07659503817558289, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003972980484832078, + "grad_norm": 8.984360694885254, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8617754578590393, + "num_tokens": 679545082.0, + "step": 17807 + }, + { + "epoch": 2.265360641139804, + "ewc_loss": 0.07561692595481873, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003875168622471392, + "grad_norm": 8.777731895446777, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8654850721359253, + "num_tokens": 679581795.0, + "step": 17808 + }, + { + "epoch": 2.2654878514183947, + "ewc_loss": 0.07687760889530182, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039768239366821945, + "grad_norm": 8.961207389831543, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8629660606384277, + "num_tokens": 679618978.0, + "step": 17809 + }, + { + "epoch": 2.2656150616969852, + "ewc_loss": 0.07565528154373169, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003879004216287285, + "grad_norm": 8.70451545715332, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8706356883049011, + "num_tokens": 679655002.0, + "step": 17810 + }, + { + "epoch": 2.2657422719755758, + "ewc_loss": 0.07678210735321045, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003991687553934753, + "grad_norm": 8.91469669342041, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.88087397813797, + "num_tokens": 679691143.0, + "step": 17811 + }, + { + "epoch": 2.2658694822541663, + "ewc_loss": 0.07578646391630173, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003892122767865658, + "grad_norm": 8.726167678833008, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8667818307876587, + "num_tokens": 679729168.0, + "step": 17812 + }, + { + "epoch": 2.265996692532757, + "ewc_loss": 0.07655000686645508, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039684775401838124, + "grad_norm": 8.909153938293457, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8638662099838257, + "num_tokens": 679773385.0, + "step": 17813 + }, + { + "epoch": 2.2661239028113473, + "ewc_loss": 0.07581252604722977, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003894729306921363, + "grad_norm": 8.746061325073242, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8769318461418152, + "num_tokens": 679813884.0, + "step": 17814 + }, + { + "epoch": 2.266251113089938, + "ewc_loss": 0.0766199380159378, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039754706085659564, + "grad_norm": 8.903436660766602, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8661489486694336, + "num_tokens": 679850834.0, + "step": 17815 + }, + { + "epoch": 2.266378323368528, + "ewc_loss": 0.07593653351068497, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003907129867002368, + "grad_norm": 8.730865478515625, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8705828189849854, + "num_tokens": 679890210.0, + "step": 17816 + }, + { + "epoch": 2.266505533647119, + "ewc_loss": 0.07679930329322815, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039934064261615276, + "grad_norm": 8.991631507873535, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8746817111968994, + "num_tokens": 679926252.0, + "step": 17817 + }, + { + "epoch": 2.266632743925709, + "ewc_loss": 0.07585736364126205, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038992127520032227, + "grad_norm": 8.765853881835938, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8627658486366272, + "num_tokens": 679959626.0, + "step": 17818 + }, + { + "epoch": 2.2667599542042995, + "ewc_loss": 0.07681144773960114, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003994621511083096, + "grad_norm": 8.928168296813965, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8607887625694275, + "num_tokens": 679999973.0, + "step": 17819 + }, + { + "epoch": 2.26688716448289, + "ewc_loss": 0.07599504292011261, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039129803190007806, + "grad_norm": 8.745299339294434, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8594004511833191, + "num_tokens": 680041081.0, + "step": 17820 + }, + { + "epoch": 2.2670143747614806, + "ewc_loss": 0.07663527131080627, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039770035073161125, + "grad_norm": 8.911776542663574, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.869049072265625, + "num_tokens": 680078953.0, + "step": 17821 + }, + { + "epoch": 2.267141585040071, + "ewc_loss": 0.07608014345169067, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039214908611029387, + "grad_norm": 8.733108520507812, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8744345903396606, + "num_tokens": 680112309.0, + "step": 17822 + }, + { + "epoch": 2.2672687953186617, + "ewc_loss": 0.07684116065502167, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003997592721134424, + "grad_norm": 8.930039405822754, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8607505559921265, + "num_tokens": 680148607.0, + "step": 17823 + }, + { + "epoch": 2.267396005597252, + "ewc_loss": 0.07598689198493958, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039121657027862966, + "grad_norm": 8.763480186462402, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8838127851486206, + "num_tokens": 680189617.0, + "step": 17824 + }, + { + "epoch": 2.2675232158758427, + "ewc_loss": 0.07679559290409088, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003993035643361509, + "grad_norm": 8.923861503601074, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8827164769172668, + "num_tokens": 680227940.0, + "step": 17825 + }, + { + "epoch": 2.2676504261544332, + "ewc_loss": 0.07611168920993805, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003924645425286144, + "grad_norm": 8.793313980102539, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8785929679870605, + "num_tokens": 680263473.0, + "step": 17826 + }, + { + "epoch": 2.2677776364330238, + "ewc_loss": 0.07648929208517075, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039624058990739286, + "grad_norm": 8.9269380569458, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8577146530151367, + "num_tokens": 680297895.0, + "step": 17827 + }, + { + "epoch": 2.2679048467116143, + "ewc_loss": 0.07617668807506561, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039311457658186555, + "grad_norm": 8.82207202911377, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.866297721862793, + "num_tokens": 680337658.0, + "step": 17828 + }, + { + "epoch": 2.268032056990205, + "ewc_loss": 0.07640320807695389, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039537972770631313, + "grad_norm": 8.932211875915527, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8610858917236328, + "num_tokens": 680375912.0, + "step": 17829 + }, + { + "epoch": 2.2681592672687954, + "ewc_loss": 0.076398104429245, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003928873047698289, + "grad_norm": 8.84811782836914, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8763341903686523, + "num_tokens": 680415231.0, + "step": 17830 + }, + { + "epoch": 2.268286477547386, + "ewc_loss": 0.07637592405080795, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039510687929578125, + "grad_norm": 8.915736198425293, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8620510101318359, + "num_tokens": 680460037.0, + "step": 17831 + }, + { + "epoch": 2.2684136878259764, + "ewc_loss": 0.07635165005922318, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039242274942807853, + "grad_norm": 8.841479301452637, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8422324657440186, + "num_tokens": 680505521.0, + "step": 17832 + }, + { + "epoch": 2.268540898104567, + "ewc_loss": 0.07657252252101898, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039463143912144005, + "grad_norm": 8.900365829467773, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8795733451843262, + "num_tokens": 680547682.0, + "step": 17833 + }, + { + "epoch": 2.2686681083831575, + "ewc_loss": 0.07619501650333405, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003932978434022516, + "grad_norm": 8.881935119628906, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8709877729415894, + "num_tokens": 680587510.0, + "step": 17834 + }, + { + "epoch": 2.268795318661748, + "ewc_loss": 0.07626669108867645, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003940145543310791, + "grad_norm": 8.819369316101074, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8680101633071899, + "num_tokens": 680624233.0, + "step": 17835 + }, + { + "epoch": 2.2689225289403385, + "ewc_loss": 0.07630191743373871, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000394366797991097, + "grad_norm": 8.903644561767578, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8606742024421692, + "num_tokens": 680666091.0, + "step": 17836 + }, + { + "epoch": 2.269049739218929, + "ewc_loss": 0.0761677473783493, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003930250823032111, + "grad_norm": 8.850194931030273, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8737225532531738, + "num_tokens": 680703062.0, + "step": 17837 + }, + { + "epoch": 2.2691769494975196, + "ewc_loss": 0.07676287740468979, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039409362943843007, + "grad_norm": 8.893317222595215, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8790326714515686, + "num_tokens": 680735966.0, + "step": 17838 + }, + { + "epoch": 2.2693041597761097, + "ewc_loss": 0.07638691365718842, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039277534233406186, + "grad_norm": 8.852569580078125, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8661564588546753, + "num_tokens": 680780010.0, + "step": 17839 + }, + { + "epoch": 2.2694313700547006, + "ewc_loss": 0.07656127214431763, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003945190110243857, + "grad_norm": 8.898370742797852, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.86590576171875, + "num_tokens": 680819516.0, + "step": 17840 + }, + { + "epoch": 2.2695585803332907, + "ewc_loss": 0.0763523131608963, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039242932689376175, + "grad_norm": 8.8143949508667, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8646558523178101, + "num_tokens": 680861374.0, + "step": 17841 + }, + { + "epoch": 2.2696857906118812, + "ewc_loss": 0.07673129439353943, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039621919859200716, + "grad_norm": 8.916444778442383, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8574727773666382, + "num_tokens": 680902447.0, + "step": 17842 + }, + { + "epoch": 2.2698130008904718, + "ewc_loss": 0.07599252462387085, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000391272857086733, + "grad_norm": 8.763224601745605, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8830471038818359, + "num_tokens": 680942376.0, + "step": 17843 + }, + { + "epoch": 2.2699402111690623, + "ewc_loss": 0.07685898244380951, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003974961000494659, + "grad_norm": 8.904510498046875, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8663583993911743, + "num_tokens": 680977060.0, + "step": 17844 + }, + { + "epoch": 2.270067421447653, + "ewc_loss": 0.07608279585838318, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003921756288036704, + "grad_norm": 8.886786460876465, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8493014574050903, + "num_tokens": 681014504.0, + "step": 17845 + }, + { + "epoch": 2.2701946317262434, + "ewc_loss": 0.07647231221199036, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003936293360311538, + "grad_norm": 8.8506498336792, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8489437103271484, + "num_tokens": 681053919.0, + "step": 17846 + }, + { + "epoch": 2.270321842004834, + "ewc_loss": 0.07661838829517365, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039509014459326863, + "grad_norm": 8.91402816772461, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8655096292495728, + "num_tokens": 681090883.0, + "step": 17847 + }, + { + "epoch": 2.2704490522834244, + "ewc_loss": 0.07625387609004974, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039388646837323904, + "grad_norm": 8.812716484069824, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8810771703720093, + "num_tokens": 681128819.0, + "step": 17848 + }, + { + "epoch": 2.270576262562015, + "ewc_loss": 0.07679340243339539, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003968402452301234, + "grad_norm": 8.895599365234375, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8560656309127808, + "num_tokens": 681171434.0, + "step": 17849 + }, + { + "epoch": 2.2707034728406055, + "ewc_loss": 0.07652374356985092, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039414368802681565, + "grad_norm": 8.824403762817383, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8758747577667236, + "num_tokens": 681206297.0, + "step": 17850 + }, + { + "epoch": 2.270830683119196, + "ewc_loss": 0.07684947550296783, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039740095962770283, + "grad_norm": 8.898391723632812, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8559560179710388, + "num_tokens": 681242521.0, + "step": 17851 + }, + { + "epoch": 2.2709578933977865, + "ewc_loss": 0.0766172781586647, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039507902693003416, + "grad_norm": 8.884136199951172, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.875617265701294, + "num_tokens": 681282474.0, + "step": 17852 + }, + { + "epoch": 2.271085103676377, + "ewc_loss": 0.07667071372270584, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003980548062827438, + "grad_norm": 8.931449890136719, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8620449304580688, + "num_tokens": 681317280.0, + "step": 17853 + }, + { + "epoch": 2.2712123139549676, + "ewc_loss": 0.07622885704040527, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003936362627428025, + "grad_norm": 8.842204093933105, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8633582592010498, + "num_tokens": 681349155.0, + "step": 17854 + }, + { + "epoch": 2.271339524233558, + "ewc_loss": 0.07661782205104828, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039752593147568405, + "grad_norm": 8.955912590026855, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8649859428405762, + "num_tokens": 681380863.0, + "step": 17855 + }, + { + "epoch": 2.2714667345121486, + "ewc_loss": 0.07614514976739883, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039279914926737547, + "grad_norm": 8.78234577178955, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8697598576545715, + "num_tokens": 681421970.0, + "step": 17856 + }, + { + "epoch": 2.271593944790739, + "ewc_loss": 0.07678984105587006, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039924608427099884, + "grad_norm": 8.917634963989258, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.879581093788147, + "num_tokens": 681453511.0, + "step": 17857 + }, + { + "epoch": 2.2717211550693297, + "ewc_loss": 0.07609784603118896, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003923260956071317, + "grad_norm": 8.81728744506836, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.853322446346283, + "num_tokens": 681496293.0, + "step": 17858 + }, + { + "epoch": 2.2718483653479202, + "ewc_loss": 0.07684627920389175, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003973690327256918, + "grad_norm": 8.920236587524414, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8578695058822632, + "num_tokens": 681537074.0, + "step": 17859 + }, + { + "epoch": 2.2719755756265108, + "ewc_loss": 0.07646533101797104, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039355954504571855, + "grad_norm": 8.824885368347168, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8587971925735474, + "num_tokens": 681578911.0, + "step": 17860 + }, + { + "epoch": 2.2721027859051013, + "ewc_loss": 0.07682906091213226, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039719685446470976, + "grad_norm": 8.980911254882812, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8643479943275452, + "num_tokens": 681622134.0, + "step": 17861 + }, + { + "epoch": 2.272229996183692, + "ewc_loss": 0.07636421918869019, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039254839066416025, + "grad_norm": 8.820653915405273, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8666166067123413, + "num_tokens": 681655919.0, + "step": 17862 + }, + { + "epoch": 2.2723572064622823, + "ewc_loss": 0.07663550972938538, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003977027372457087, + "grad_norm": 8.922224044799805, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8563517332077026, + "num_tokens": 681696539.0, + "step": 17863 + }, + { + "epoch": 2.2724844167408724, + "ewc_loss": 0.07621635496616364, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039351123268716037, + "grad_norm": 8.838637351989746, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8630447387695312, + "num_tokens": 681734630.0, + "step": 17864 + }, + { + "epoch": 2.2726116270194634, + "ewc_loss": 0.07653048634529114, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003966525546275079, + "grad_norm": 8.865079879760742, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8539496064186096, + "num_tokens": 681779669.0, + "step": 17865 + }, + { + "epoch": 2.2727388372980535, + "ewc_loss": 0.07648995518684387, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003962471673730761, + "grad_norm": 8.863578796386719, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8632862567901611, + "num_tokens": 681821920.0, + "step": 17866 + }, + { + "epoch": 2.272866047576644, + "ewc_loss": 0.07648062705993652, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003961539769079536, + "grad_norm": 8.895236015319824, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8639349937438965, + "num_tokens": 681859239.0, + "step": 17867 + }, + { + "epoch": 2.2729932578552345, + "ewc_loss": 0.0765829011797905, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039717668551020324, + "grad_norm": 8.862944602966309, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8525879979133606, + "num_tokens": 681896569.0, + "step": 17868 + }, + { + "epoch": 2.273120468133825, + "ewc_loss": 0.07659991830587387, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003973468265030533, + "grad_norm": 8.938262939453125, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8799391984939575, + "num_tokens": 681932549.0, + "step": 17869 + }, + { + "epoch": 2.2732476784124156, + "ewc_loss": 0.07635296136140823, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003948772791773081, + "grad_norm": 8.878425598144531, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8609867691993713, + "num_tokens": 681973981.0, + "step": 17870 + }, + { + "epoch": 2.273374888691006, + "ewc_loss": 0.07659561932086945, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039730386924929917, + "grad_norm": 8.912991523742676, + "learning_rate": 1e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8416026830673218, + "num_tokens": 682009512.0, + "step": 17871 + }, + { + "epoch": 2.2735020989695967, + "ewc_loss": 0.07623648643493652, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039371257298626006, + "grad_norm": 8.874411582946777, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8622956871986389, + "num_tokens": 682048058.0, + "step": 17872 + }, + { + "epoch": 2.273629309248187, + "ewc_loss": 0.07647092640399933, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039605694473721087, + "grad_norm": 8.871994972229004, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8579199314117432, + "num_tokens": 682084181.0, + "step": 17873 + }, + { + "epoch": 2.2737565195267777, + "ewc_loss": 0.076361283659935, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039496045792475343, + "grad_norm": 8.880802154541016, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8858163356781006, + "num_tokens": 682119613.0, + "step": 17874 + }, + { + "epoch": 2.2738837298053682, + "ewc_loss": 0.07636164873838425, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039496415411122143, + "grad_norm": 8.881414413452148, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8686155080795288, + "num_tokens": 682159501.0, + "step": 17875 + }, + { + "epoch": 2.2740109400839588, + "ewc_loss": 0.07642245292663574, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039557222044095397, + "grad_norm": 8.876200675964355, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8739046454429626, + "num_tokens": 682197323.0, + "step": 17876 + }, + { + "epoch": 2.2741381503625493, + "ewc_loss": 0.0762614905834198, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039396260399371386, + "grad_norm": 8.908021926879883, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8668068647384644, + "num_tokens": 682231150.0, + "step": 17877 + }, + { + "epoch": 2.27426536064114, + "ewc_loss": 0.0762997567653656, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000394345261156559, + "grad_norm": 8.897119522094727, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8654406070709229, + "num_tokens": 682269351.0, + "step": 17878 + }, + { + "epoch": 2.2743925709197303, + "ewc_loss": 0.07632525265216827, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039460021071136, + "grad_norm": 8.893052101135254, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.875145435333252, + "num_tokens": 682303633.0, + "step": 17879 + }, + { + "epoch": 2.274519781198321, + "ewc_loss": 0.07647676765918732, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039367395220324397, + "grad_norm": 8.890584945678711, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8641246557235718, + "num_tokens": 682340890.0, + "step": 17880 + }, + { + "epoch": 2.2746469914769114, + "ewc_loss": 0.07626546919345856, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039400230161845684, + "grad_norm": 8.906451225280762, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8810104131698608, + "num_tokens": 682374253.0, + "step": 17881 + }, + { + "epoch": 2.274774201755502, + "ewc_loss": 0.07612919807434082, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039263966027647257, + "grad_norm": 8.782927513122559, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8665485382080078, + "num_tokens": 682412675.0, + "step": 17882 + }, + { + "epoch": 2.2749014120340925, + "ewc_loss": 0.07643264532089233, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039567414205521345, + "grad_norm": 8.962084770202637, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8761664032936096, + "num_tokens": 682447864.0, + "step": 17883 + }, + { + "epoch": 2.275028622312683, + "ewc_loss": 0.07597889006137848, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003911365056410432, + "grad_norm": 8.817227363586426, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8669592142105103, + "num_tokens": 682486445.0, + "step": 17884 + }, + { + "epoch": 2.2751558325912735, + "ewc_loss": 0.07675912976264954, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003964975767303258, + "grad_norm": 8.938815116882324, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8647059202194214, + "num_tokens": 682518173.0, + "step": 17885 + }, + { + "epoch": 2.275283042869864, + "ewc_loss": 0.07597658038139343, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003911134845111519, + "grad_norm": 8.824944496154785, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8650758266448975, + "num_tokens": 682558627.0, + "step": 17886 + }, + { + "epoch": 2.2754102531484546, + "ewc_loss": 0.07673903554677963, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039629661478102207, + "grad_norm": 8.951972007751465, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8717081546783447, + "num_tokens": 682600821.0, + "step": 17887 + }, + { + "epoch": 2.275537463427045, + "ewc_loss": 0.07603326439857483, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039168031071312726, + "grad_norm": 8.791098594665527, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8544597625732422, + "num_tokens": 682643947.0, + "step": 17888 + }, + { + "epoch": 2.275664673705635, + "ewc_loss": 0.0766255185008049, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039760282379575074, + "grad_norm": 8.946219444274902, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8607296943664551, + "num_tokens": 682683045.0, + "step": 17889 + }, + { + "epoch": 2.275791883984226, + "ewc_loss": 0.07594390213489532, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003907867067027837, + "grad_norm": 8.787684440612793, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8657401204109192, + "num_tokens": 682721617.0, + "step": 17890 + }, + { + "epoch": 2.2759190942628162, + "ewc_loss": 0.07656430453062057, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039699068292975426, + "grad_norm": 8.941527366638184, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.858283519744873, + "num_tokens": 682758339.0, + "step": 17891 + }, + { + "epoch": 2.2760463045414068, + "ewc_loss": 0.07592692971229553, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003906169440597296, + "grad_norm": 8.768675804138184, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8707051277160645, + "num_tokens": 682800501.0, + "step": 17892 + }, + { + "epoch": 2.2761735148199973, + "ewc_loss": 0.07669854909181595, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.000398333155317232, + "grad_norm": 8.992868423461914, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8645408153533936, + "num_tokens": 682833231.0, + "step": 17893 + }, + { + "epoch": 2.276300725098588, + "ewc_loss": 0.07589573413133621, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003903050092048943, + "grad_norm": 8.781912803649902, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8712766170501709, + "num_tokens": 682873945.0, + "step": 17894 + }, + { + "epoch": 2.2764279353771784, + "ewc_loss": 0.07678425312042236, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039919017581269145, + "grad_norm": 9.010641098022461, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8638148903846741, + "num_tokens": 682918329.0, + "step": 17895 + }, + { + "epoch": 2.276555145655769, + "ewc_loss": 0.07594954967498779, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003908431972377002, + "grad_norm": 8.79455280303955, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8728609681129456, + "num_tokens": 682956277.0, + "step": 17896 + }, + { + "epoch": 2.2766823559343594, + "ewc_loss": 0.07688993215560913, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00040024699410423636, + "grad_norm": 8.95530891418457, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.868709146976471, + "num_tokens": 682990648.0, + "step": 17897 + }, + { + "epoch": 2.27680956621295, + "ewc_loss": 0.07614125311374664, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039276015013456345, + "grad_norm": 8.854879379272461, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8789052367210388, + "num_tokens": 683029046.0, + "step": 17898 + }, + { + "epoch": 2.2769367764915405, + "ewc_loss": 0.0766342282295227, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003976899024564773, + "grad_norm": 8.950014114379883, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8655345439910889, + "num_tokens": 683064541.0, + "step": 17899 + }, + { + "epoch": 2.277063986770131, + "ewc_loss": 0.07644911110401154, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003933973493985832, + "grad_norm": 8.82552719116211, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8649992942810059, + "num_tokens": 683105015.0, + "step": 17900 + }, + { + "epoch": 2.2771911970487215, + "ewc_loss": 0.07661399245262146, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003974875435233116, + "grad_norm": 8.946940422058105, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8582761287689209, + "num_tokens": 683142938.0, + "step": 17901 + }, + { + "epoch": 2.277318407327312, + "ewc_loss": 0.07612693309783936, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003926169592887163, + "grad_norm": 8.873433113098145, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8794876337051392, + "num_tokens": 683181593.0, + "step": 17902 + }, + { + "epoch": 2.2774456176059026, + "ewc_loss": 0.07702130079269409, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039667790406383574, + "grad_norm": 8.931453704833984, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8545594215393066, + "num_tokens": 683218641.0, + "step": 17903 + }, + { + "epoch": 2.277572827884493, + "ewc_loss": 0.07615883648395538, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003929359663743526, + "grad_norm": 8.866692543029785, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8487077355384827, + "num_tokens": 683261479.0, + "step": 17904 + }, + { + "epoch": 2.2777000381630836, + "ewc_loss": 0.07663613557815552, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003952676197513938, + "grad_norm": 8.924569129943848, + "learning_rate": 1e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8397864699363708, + "num_tokens": 683300602.0, + "step": 17905 + }, + { + "epoch": 2.277827248441674, + "ewc_loss": 0.07624485343694687, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039379618829116225, + "grad_norm": 8.856124877929688, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8638420701026917, + "num_tokens": 683337012.0, + "step": 17906 + }, + { + "epoch": 2.2779544587202647, + "ewc_loss": 0.07652861624956131, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003966338117606938, + "grad_norm": 8.982267379760742, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8732507228851318, + "num_tokens": 683368949.0, + "step": 17907 + }, + { + "epoch": 2.2780816689988552, + "ewc_loss": 0.07596950232982635, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003910426748916507, + "grad_norm": 8.750927925109863, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8630741238594055, + "num_tokens": 683408741.0, + "step": 17908 + }, + { + "epoch": 2.2782088792774458, + "ewc_loss": 0.0768008828163147, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003993564750999212, + "grad_norm": 8.960875511169434, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8558214902877808, + "num_tokens": 683447843.0, + "step": 17909 + }, + { + "epoch": 2.2783360895560363, + "ewc_loss": 0.07639408111572266, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003904057084582746, + "grad_norm": 8.756427764892578, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8621898293495178, + "num_tokens": 683489695.0, + "step": 17910 + }, + { + "epoch": 2.278463299834627, + "ewc_loss": 0.07679684460163116, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003993160789832473, + "grad_norm": 8.93547534942627, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8628237247467041, + "num_tokens": 683528381.0, + "step": 17911 + }, + { + "epoch": 2.2785905101132173, + "ewc_loss": 0.07607617229223251, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003921093884855509, + "grad_norm": 8.745643615722656, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8680433034896851, + "num_tokens": 683567261.0, + "step": 17912 + }, + { + "epoch": 2.278717720391808, + "ewc_loss": 0.07682372629642487, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003995849401690066, + "grad_norm": 8.995288848876953, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8503037691116333, + "num_tokens": 683601083.0, + "step": 17913 + }, + { + "epoch": 2.278844930670398, + "ewc_loss": 0.07616046816110611, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039295232272706926, + "grad_norm": 8.823955535888672, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8583777546882629, + "num_tokens": 683643494.0, + "step": 17914 + }, + { + "epoch": 2.278972140948989, + "ewc_loss": 0.07680931687355042, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003994407888967544, + "grad_norm": 8.993341445922852, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8608574867248535, + "num_tokens": 683684267.0, + "step": 17915 + }, + { + "epoch": 2.279099351227579, + "ewc_loss": 0.07607341557741165, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039208182715810835, + "grad_norm": 8.809791564941406, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8538127541542053, + "num_tokens": 683723292.0, + "step": 17916 + }, + { + "epoch": 2.2792265615061695, + "ewc_loss": 0.07668235898017883, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003981712507084012, + "grad_norm": 8.950162887573242, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8772329092025757, + "num_tokens": 683761378.0, + "step": 17917 + }, + { + "epoch": 2.27935377178476, + "ewc_loss": 0.07616189867258072, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039296664181165397, + "grad_norm": 8.842504501342773, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8570598363876343, + "num_tokens": 683794838.0, + "step": 17918 + }, + { + "epoch": 2.2794809820633506, + "ewc_loss": 0.07651416957378387, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003964893112424761, + "grad_norm": 8.94536018371582, + "learning_rate": 1e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8480356931686401, + "num_tokens": 683832307.0, + "step": 17919 + }, + { + "epoch": 2.279608192341941, + "ewc_loss": 0.0761297270655632, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039264492806978524, + "grad_norm": 8.850773811340332, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8581281304359436, + "num_tokens": 683871848.0, + "step": 17920 + }, + { + "epoch": 2.2797354026205316, + "ewc_loss": 0.07647338509559631, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003960814792662859, + "grad_norm": 8.854995727539062, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8683383464813232, + "num_tokens": 683915086.0, + "step": 17921 + }, + { + "epoch": 2.279862612899122, + "ewc_loss": 0.07631760835647583, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003945237258449197, + "grad_norm": 8.878000259399414, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8578115701675415, + "num_tokens": 683945411.0, + "step": 17922 + }, + { + "epoch": 2.2799898231777127, + "ewc_loss": 0.0763331726193428, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003946793731302023, + "grad_norm": 8.848631858825684, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8648483753204346, + "num_tokens": 683979664.0, + "step": 17923 + }, + { + "epoch": 2.2801170334563032, + "ewc_loss": 0.07628470659255981, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039419473614543676, + "grad_norm": 8.859382629394531, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8555359840393066, + "num_tokens": 684025526.0, + "step": 17924 + }, + { + "epoch": 2.2802442437348938, + "ewc_loss": 0.07622914761304855, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039363911491818726, + "grad_norm": 8.889154434204102, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8584937453269958, + "num_tokens": 684056992.0, + "step": 17925 + }, + { + "epoch": 2.2803714540134843, + "ewc_loss": 0.07627981156110764, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039414578350260854, + "grad_norm": 8.860984802246094, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8696966171264648, + "num_tokens": 684094515.0, + "step": 17926 + }, + { + "epoch": 2.280498664292075, + "ewc_loss": 0.07640762627124786, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003954239364247769, + "grad_norm": 8.922407150268555, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8711613416671753, + "num_tokens": 684132294.0, + "step": 17927 + }, + { + "epoch": 2.2806258745706653, + "ewc_loss": 0.07615591585636139, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039290677523240447, + "grad_norm": 8.874979019165039, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8679574728012085, + "num_tokens": 684164010.0, + "step": 17928 + }, + { + "epoch": 2.280753084849256, + "ewc_loss": 0.07663589715957642, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003952652041334659, + "grad_norm": 8.893054962158203, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8744735717773438, + "num_tokens": 684202077.0, + "step": 17929 + }, + { + "epoch": 2.2808802951278464, + "ewc_loss": 0.07645656168460846, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039347182610072196, + "grad_norm": 8.873128890991211, + "learning_rate": 1e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8389892578125, + "num_tokens": 684240799.0, + "step": 17930 + }, + { + "epoch": 2.281007505406437, + "ewc_loss": 0.07631093263626099, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003920155286323279, + "grad_norm": 8.786735534667969, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8628501892089844, + "num_tokens": 684284080.0, + "step": 17931 + }, + { + "epoch": 2.2811347156850275, + "ewc_loss": 0.07687440514564514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039520885911770165, + "grad_norm": 8.932031631469727, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8482125997543335, + "num_tokens": 684317915.0, + "step": 17932 + }, + { + "epoch": 2.281261925963618, + "ewc_loss": 0.0760122537612915, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038902880623936653, + "grad_norm": 8.720290184020996, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.9034513831138611, + "num_tokens": 684354665.0, + "step": 17933 + }, + { + "epoch": 2.2813891362422085, + "ewc_loss": 0.07671460509300232, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003960523463319987, + "grad_norm": 8.900033950805664, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8641723990440369, + "num_tokens": 684392866.0, + "step": 17934 + }, + { + "epoch": 2.281516346520799, + "ewc_loss": 0.07637517899274826, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003926580538973212, + "grad_norm": 8.83159065246582, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.867647647857666, + "num_tokens": 684428422.0, + "step": 17935 + }, + { + "epoch": 2.2816435567993896, + "ewc_loss": 0.0766143649816513, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039504989399574697, + "grad_norm": 8.90363597869873, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8515025973320007, + "num_tokens": 684467129.0, + "step": 17936 + }, + { + "epoch": 2.2817707670779797, + "ewc_loss": 0.07623085379600525, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039121482404880226, + "grad_norm": 8.788817405700684, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8566442728042603, + "num_tokens": 684500093.0, + "step": 17937 + }, + { + "epoch": 2.2818979773565706, + "ewc_loss": 0.0766681656241417, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039558790740557015, + "grad_norm": 8.918885231018066, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8563435077667236, + "num_tokens": 684541422.0, + "step": 17938 + }, + { + "epoch": 2.2820251876351607, + "ewc_loss": 0.07641540467739105, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039061892312020063, + "grad_norm": 8.774971961975098, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8602367639541626, + "num_tokens": 684579164.0, + "step": 17939 + }, + { + "epoch": 2.2821523979137512, + "ewc_loss": 0.07670179754495621, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039592423127032816, + "grad_norm": 8.934842109680176, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8485427498817444, + "num_tokens": 684618617.0, + "step": 17940 + }, + { + "epoch": 2.2822796081923418, + "ewc_loss": 0.07607899606227875, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038969621527940035, + "grad_norm": 8.831960678100586, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.849605917930603, + "num_tokens": 684655525.0, + "step": 17941 + }, + { + "epoch": 2.2824068184709323, + "ewc_loss": 0.07642345130443573, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003931407700292766, + "grad_norm": 8.901103019714355, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8644411563873291, + "num_tokens": 684694525.0, + "step": 17942 + }, + { + "epoch": 2.282534028749523, + "ewc_loss": 0.07611881196498871, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003900943847838789, + "grad_norm": 8.791692733764648, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.864565372467041, + "num_tokens": 684733247.0, + "step": 17943 + }, + { + "epoch": 2.2826612390281134, + "ewc_loss": 0.07663455605506897, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003952517581637949, + "grad_norm": 8.931604385375977, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8760136365890503, + "num_tokens": 684763642.0, + "step": 17944 + }, + { + "epoch": 2.282788449306704, + "ewc_loss": 0.07609215378761292, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038982773548923433, + "grad_norm": 8.750537872314453, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.870083212852478, + "num_tokens": 684806164.0, + "step": 17945 + }, + { + "epoch": 2.2829156595852944, + "ewc_loss": 0.07660871744155884, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039499346166849136, + "grad_norm": 8.85137939453125, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8650249242782593, + "num_tokens": 684844193.0, + "step": 17946 + }, + { + "epoch": 2.283042869863885, + "ewc_loss": 0.07626006007194519, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003915068518836051, + "grad_norm": 8.822413444519043, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8703197240829468, + "num_tokens": 684882103.0, + "step": 17947 + }, + { + "epoch": 2.2831700801424755, + "ewc_loss": 0.07651178538799286, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003940241294912994, + "grad_norm": 8.835143089294434, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8791100382804871, + "num_tokens": 684917667.0, + "step": 17948 + }, + { + "epoch": 2.283297290421066, + "ewc_loss": 0.07643677294254303, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039327400736510754, + "grad_norm": 8.870992660522461, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.868356466293335, + "num_tokens": 684954007.0, + "step": 17949 + }, + { + "epoch": 2.2834245006996565, + "ewc_loss": 0.07635801285505295, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039248637040145695, + "grad_norm": 8.831287384033203, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8595346808433533, + "num_tokens": 684994693.0, + "step": 17950 + }, + { + "epoch": 2.283551710978247, + "ewc_loss": 0.07652480900287628, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003941543400287628, + "grad_norm": 8.79451847076416, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8526150584220886, + "num_tokens": 685032713.0, + "step": 17951 + }, + { + "epoch": 2.2836789212568376, + "ewc_loss": 0.07649531960487366, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039385943091474473, + "grad_norm": 8.913783073425293, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8580905199050903, + "num_tokens": 685066597.0, + "step": 17952 + }, + { + "epoch": 2.283806131535428, + "ewc_loss": 0.07669718563556671, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003934366977773607, + "grad_norm": 8.78433895111084, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8588869571685791, + "num_tokens": 685106243.0, + "step": 17953 + }, + { + "epoch": 2.2839333418140186, + "ewc_loss": 0.07692188024520874, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039568368811160326, + "grad_norm": 8.96120548248291, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8562593460083008, + "num_tokens": 685145177.0, + "step": 17954 + }, + { + "epoch": 2.284060552092609, + "ewc_loss": 0.07642035186290741, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003906683996319771, + "grad_norm": 8.72536563873291, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8714352250099182, + "num_tokens": 685189130.0, + "step": 17955 + }, + { + "epoch": 2.2841877623711997, + "ewc_loss": 0.07727993279695511, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003992641868535429, + "grad_norm": 8.947726249694824, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8676273822784424, + "num_tokens": 685227536.0, + "step": 17956 + }, + { + "epoch": 2.28431497264979, + "ewc_loss": 0.07596834003925323, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00038858968764543533, + "grad_norm": 8.677140235900879, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8698575496673584, + "num_tokens": 685267213.0, + "step": 17957 + }, + { + "epoch": 2.2844421829283807, + "ewc_loss": 0.0772370919585228, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040127718239091337, + "grad_norm": 9.002351760864258, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.862873911857605, + "num_tokens": 685301115.0, + "step": 17958 + }, + { + "epoch": 2.2845693932069713, + "ewc_loss": 0.07591776549816132, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003880838630720973, + "grad_norm": 8.719316482543945, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8701765537261963, + "num_tokens": 685338497.0, + "step": 17959 + }, + { + "epoch": 2.284696603485562, + "ewc_loss": 0.07720797508955002, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040098599856719375, + "grad_norm": 8.971460342407227, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8525378704071045, + "num_tokens": 685374983.0, + "step": 17960 + }, + { + "epoch": 2.2848238137641523, + "ewc_loss": 0.07602374255657196, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003891437081620097, + "grad_norm": 8.773652076721191, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8711870908737183, + "num_tokens": 685408859.0, + "step": 17961 + }, + { + "epoch": 2.2849510240427424, + "ewc_loss": 0.0771363377571106, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040026966598816216, + "grad_norm": 9.011992454528809, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8617289066314697, + "num_tokens": 685445103.0, + "step": 17962 + }, + { + "epoch": 2.2850782343213334, + "ewc_loss": 0.07627429068088531, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038920779479667544, + "grad_norm": 8.760111808776855, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8597865104675293, + "num_tokens": 685484657.0, + "step": 17963 + }, + { + "epoch": 2.2852054445999235, + "ewc_loss": 0.077247254550457, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039893737994134426, + "grad_norm": 9.016502380371094, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8730678558349609, + "num_tokens": 685524034.0, + "step": 17964 + }, + { + "epoch": 2.285332654878514, + "ewc_loss": 0.07630729675292969, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038953780313022435, + "grad_norm": 8.830516815185547, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8709225654602051, + "num_tokens": 685560368.0, + "step": 17965 + }, + { + "epoch": 2.2854598651571045, + "ewc_loss": 0.0769830048084259, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039629486855119467, + "grad_norm": 8.913304328918457, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8610795736312866, + "num_tokens": 685598192.0, + "step": 17966 + }, + { + "epoch": 2.285587075435695, + "ewc_loss": 0.07646693289279938, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003911342064384371, + "grad_norm": 8.823802947998047, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8670842051506042, + "num_tokens": 685636300.0, + "step": 17967 + }, + { + "epoch": 2.2857142857142856, + "ewc_loss": 0.07688955962657928, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003953604318667203, + "grad_norm": 8.989546775817871, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8626785278320312, + "num_tokens": 685671457.0, + "step": 17968 + }, + { + "epoch": 2.285841495992876, + "ewc_loss": 0.07631401717662811, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038960506208240986, + "grad_norm": 8.788026809692383, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8535589575767517, + "num_tokens": 685709687.0, + "step": 17969 + }, + { + "epoch": 2.2859687062714666, + "ewc_loss": 0.07701851427555084, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003966499643865973, + "grad_norm": 9.006221771240234, + "learning_rate": 1e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8430323600769043, + "num_tokens": 685747854.0, + "step": 17970 + }, + { + "epoch": 2.286095916550057, + "ewc_loss": 0.07605427503585815, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003870075452141464, + "grad_norm": 8.693964004516602, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.884048581123352, + "num_tokens": 685785215.0, + "step": 17971 + }, + { + "epoch": 2.2862231268286477, + "ewc_loss": 0.07724137604236603, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039887864841148257, + "grad_norm": 9.02404499053955, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8687006235122681, + "num_tokens": 685818871.0, + "step": 17972 + }, + { + "epoch": 2.2863503371072382, + "ewc_loss": 0.07603226602077484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038678752025589347, + "grad_norm": 8.77954387664795, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8577406406402588, + "num_tokens": 685859089.0, + "step": 17973 + }, + { + "epoch": 2.2864775473858288, + "ewc_loss": 0.07704894989728928, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003969543322455138, + "grad_norm": 8.939977645874023, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8558639883995056, + "num_tokens": 685900346.0, + "step": 17974 + }, + { + "epoch": 2.2866047576644193, + "ewc_loss": 0.07623206079006195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038878541090525687, + "grad_norm": 8.808414459228516, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8631415367126465, + "num_tokens": 685938186.0, + "step": 17975 + }, + { + "epoch": 2.28673196794301, + "ewc_loss": 0.07690267264842987, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003954916028305888, + "grad_norm": 8.922623634338379, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.871641993522644, + "num_tokens": 685974911.0, + "step": 17976 + }, + { + "epoch": 2.2868591782216003, + "ewc_loss": 0.07635056972503662, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038997051888145506, + "grad_norm": 8.815385818481445, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8669275045394897, + "num_tokens": 686010430.0, + "step": 17977 + }, + { + "epoch": 2.286986388500191, + "ewc_loss": 0.07665180414915085, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039298288174904883, + "grad_norm": 8.950077056884766, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.874599277973175, + "num_tokens": 686042727.0, + "step": 17978 + }, + { + "epoch": 2.2871135987787814, + "ewc_loss": 0.07622160017490387, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038868089905008674, + "grad_norm": 8.732193946838379, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8664056062698364, + "num_tokens": 686078819.0, + "step": 17979 + }, + { + "epoch": 2.287240809057372, + "ewc_loss": 0.0764968991279602, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003938752051908523, + "grad_norm": 8.865728378295898, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8657957315444946, + "num_tokens": 686117760.0, + "step": 17980 + }, + { + "epoch": 2.2873680193359625, + "ewc_loss": 0.07617729902267456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038823788054287434, + "grad_norm": 8.841119766235352, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8564636707305908, + "num_tokens": 686152992.0, + "step": 17981 + }, + { + "epoch": 2.287495229614553, + "ewc_loss": 0.07659398019313812, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039240464684553444, + "grad_norm": 8.854351997375488, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8589097261428833, + "num_tokens": 686195353.0, + "step": 17982 + }, + { + "epoch": 2.2876224398931435, + "ewc_loss": 0.07648395001888275, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003913043183274567, + "grad_norm": 8.84455394744873, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8604320883750916, + "num_tokens": 686232990.0, + "step": 17983 + }, + { + "epoch": 2.287749650171734, + "ewc_loss": 0.0764228031039238, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003906928759533912, + "grad_norm": 8.863213539123535, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8695651292800903, + "num_tokens": 686265933.0, + "step": 17984 + }, + { + "epoch": 2.2878768604503246, + "ewc_loss": 0.07652704417705536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003917353169526905, + "grad_norm": 8.800553321838379, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8705565333366394, + "num_tokens": 686307111.0, + "step": 17985 + }, + { + "epoch": 2.288004070728915, + "ewc_loss": 0.07653321325778961, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003917969879694283, + "grad_norm": 8.905972480773926, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8661302328109741, + "num_tokens": 686343363.0, + "step": 17986 + }, + { + "epoch": 2.288131281007505, + "ewc_loss": 0.0763215571641922, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003896804410032928, + "grad_norm": 8.821084976196289, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8633542060852051, + "num_tokens": 686380888.0, + "step": 17987 + }, + { + "epoch": 2.288258491286096, + "ewc_loss": 0.0765998363494873, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039246317464858294, + "grad_norm": 8.870244979858398, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8566321134567261, + "num_tokens": 686417350.0, + "step": 17988 + }, + { + "epoch": 2.2883857015646862, + "ewc_loss": 0.07629746198654175, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003894394321832806, + "grad_norm": 8.844900131225586, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8724994659423828, + "num_tokens": 686458658.0, + "step": 17989 + }, + { + "epoch": 2.2885129118432768, + "ewc_loss": 0.07666091620922089, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039307400584220886, + "grad_norm": 8.94731616973877, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8673378229141235, + "num_tokens": 686493945.0, + "step": 17990 + }, + { + "epoch": 2.2886401221218673, + "ewc_loss": 0.07615766674280167, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038804151699878275, + "grad_norm": 8.885086059570312, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8630672097206116, + "num_tokens": 686527608.0, + "step": 17991 + }, + { + "epoch": 2.288767332400458, + "ewc_loss": 0.0759606882929802, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.0003909545484930277, + "grad_norm": 8.906118392944336, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8748753070831299, + "num_tokens": 686562272.0, + "step": 17992 + }, + { + "epoch": 2.2888945426790483, + "ewc_loss": 0.07563463598489761, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00038769401726312935, + "grad_norm": 8.826428413391113, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8753129839897156, + "num_tokens": 686599937.0, + "step": 17993 + }, + { + "epoch": 2.289021752957639, + "ewc_loss": 0.07654286921024323, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003918935253750533, + "grad_norm": 8.975014686584473, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8654427528381348, + "num_tokens": 686632103.0, + "step": 17994 + }, + { + "epoch": 2.2891489632362294, + "ewc_loss": 0.07603040337562561, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038676883559674025, + "grad_norm": 8.820631980895996, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8722901344299316, + "num_tokens": 686662802.0, + "step": 17995 + }, + { + "epoch": 2.28927617351482, + "ewc_loss": 0.07662364095449448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000392701243981719, + "grad_norm": 8.988008499145508, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8852453827857971, + "num_tokens": 686699386.0, + "step": 17996 + }, + { + "epoch": 2.2894033837934105, + "ewc_loss": 0.07600134611129761, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038647832116112113, + "grad_norm": 8.807001113891602, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.868044376373291, + "num_tokens": 686738369.0, + "step": 17997 + }, + { + "epoch": 2.289530594072001, + "ewc_loss": 0.07656504213809967, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039211526745930314, + "grad_norm": 9.001445770263672, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8580441474914551, + "num_tokens": 686774543.0, + "step": 17998 + }, + { + "epoch": 2.2896578043505915, + "ewc_loss": 0.07585133612155914, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003849782224278897, + "grad_norm": 8.74393081665039, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8554172515869141, + "num_tokens": 686807048.0, + "step": 17999 + }, + { + "epoch": 2.289785014629182, + "ewc_loss": 0.07681906223297119, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003946554788853973, + "grad_norm": 9.041712760925293, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8664716482162476, + "num_tokens": 686844678.0, + "step": 18000 + }, + { + "epoch": 2.2899122249077726, + "ewc_loss": 0.0758880227804184, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038534507621079683, + "grad_norm": 8.752866744995117, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8656367063522339, + "num_tokens": 686877721.0, + "step": 18001 + }, + { + "epoch": 2.290039435186363, + "ewc_loss": 0.0770178958773613, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039664379437454045, + "grad_norm": 9.071710586547852, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8556424379348755, + "num_tokens": 686918329.0, + "step": 18002 + }, + { + "epoch": 2.2901666454649536, + "ewc_loss": 0.07584668695926666, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038493171450681984, + "grad_norm": 8.748929023742676, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8602147102355957, + "num_tokens": 686960050.0, + "step": 18003 + }, + { + "epoch": 2.290293855743544, + "ewc_loss": 0.07702010124921799, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039666585507802665, + "grad_norm": 9.030959129333496, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8747599124908447, + "num_tokens": 686997849.0, + "step": 18004 + }, + { + "epoch": 2.2904210660221347, + "ewc_loss": 0.07604033499956131, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038686819607391953, + "grad_norm": 8.81270694732666, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8680152893066406, + "num_tokens": 687032363.0, + "step": 18005 + }, + { + "epoch": 2.290548276300725, + "ewc_loss": 0.07680382579565048, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003945031203329563, + "grad_norm": 9.006865501403809, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8496596813201904, + "num_tokens": 687069606.0, + "step": 18006 + }, + { + "epoch": 2.2906754865793157, + "ewc_loss": 0.07610423862934113, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038750722887925804, + "grad_norm": 8.774260520935059, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.85517418384552, + "num_tokens": 687112857.0, + "step": 18007 + }, + { + "epoch": 2.2908026968579063, + "ewc_loss": 0.07690402865409851, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003955051361117512, + "grad_norm": 8.970647811889648, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8626493215560913, + "num_tokens": 687152245.0, + "step": 18008 + }, + { + "epoch": 2.290929907136497, + "ewc_loss": 0.07607035338878632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003871683729812503, + "grad_norm": 8.766709327697754, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8644807934761047, + "num_tokens": 687193988.0, + "step": 18009 + }, + { + "epoch": 2.2910571174150873, + "ewc_loss": 0.07692991942167282, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003957640437874943, + "grad_norm": 9.074359893798828, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8591967821121216, + "num_tokens": 687232375.0, + "step": 18010 + }, + { + "epoch": 2.291184327693678, + "ewc_loss": 0.07590292394161224, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038549411692656577, + "grad_norm": 8.702786445617676, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8686977624893188, + "num_tokens": 687274566.0, + "step": 18011 + }, + { + "epoch": 2.291311537972268, + "ewc_loss": 0.07719700038433075, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039843484410084784, + "grad_norm": 9.109984397888184, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8645274639129639, + "num_tokens": 687313231.0, + "step": 18012 + }, + { + "epoch": 2.291438748250859, + "ewc_loss": 0.07581859081983566, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038465074612759054, + "grad_norm": 8.630387306213379, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8702560663223267, + "num_tokens": 687354502.0, + "step": 18013 + }, + { + "epoch": 2.291565958529449, + "ewc_loss": 0.07767221331596375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004031869466416538, + "grad_norm": 9.159300804138184, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8617556095123291, + "num_tokens": 687399160.0, + "step": 18014 + }, + { + "epoch": 2.2916931688080395, + "ewc_loss": 0.07584016770124435, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038486652192659676, + "grad_norm": 8.696369171142578, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8548960089683533, + "num_tokens": 687438775.0, + "step": 18015 + }, + { + "epoch": 2.29182037908663, + "ewc_loss": 0.07773829996585846, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040384780731983483, + "grad_norm": 9.179003715515137, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8705919981002808, + "num_tokens": 687471435.0, + "step": 18016 + }, + { + "epoch": 2.2919475893652206, + "ewc_loss": 0.07599752396345139, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000386440078727901, + "grad_norm": 8.74281120300293, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.865841269493103, + "num_tokens": 687508657.0, + "step": 18017 + }, + { + "epoch": 2.292074799643811, + "ewc_loss": 0.07759921252727509, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004024570225737989, + "grad_norm": 9.183076858520508, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8741194009780884, + "num_tokens": 687545024.0, + "step": 18018 + }, + { + "epoch": 2.2922020099224016, + "ewc_loss": 0.0760350152850151, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003868149942718446, + "grad_norm": 8.79837417602539, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8683005571365356, + "num_tokens": 687579432.0, + "step": 18019 + }, + { + "epoch": 2.292329220200992, + "ewc_loss": 0.07754075527191162, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040187244303524494, + "grad_norm": 9.186610221862793, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8547185659408569, + "num_tokens": 687618378.0, + "step": 18020 + }, + { + "epoch": 2.2924564304795827, + "ewc_loss": 0.0761067271232605, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038753211265429854, + "grad_norm": 8.796092987060547, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8616364002227783, + "num_tokens": 687657119.0, + "step": 18021 + }, + { + "epoch": 2.2925836407581732, + "ewc_loss": 0.07738947868347168, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040035968413576484, + "grad_norm": 9.188775062561035, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8549824357032776, + "num_tokens": 687691924.0, + "step": 18022 + }, + { + "epoch": 2.2927108510367638, + "ewc_loss": 0.07579927146434784, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038445761310867965, + "grad_norm": 8.789992332458496, + "learning_rate": 1e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8403815031051636, + "num_tokens": 687724221.0, + "step": 18023 + }, + { + "epoch": 2.2928380613153543, + "ewc_loss": 0.07744088768959045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008737450931221, + "grad_norm": 9.127908706665039, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8656080961227417, + "num_tokens": 687765049.0, + "step": 18024 + }, + { + "epoch": 2.292965271593945, + "ewc_loss": 0.07591718435287476, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003856366383843124, + "grad_norm": 8.771031379699707, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.853343665599823, + "num_tokens": 687810222.0, + "step": 18025 + }, + { + "epoch": 2.2930924818725353, + "ewc_loss": 0.07735587656497955, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040002362220548093, + "grad_norm": 9.210332870483398, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8584028482437134, + "num_tokens": 687851961.0, + "step": 18026 + }, + { + "epoch": 2.293219692151126, + "ewc_loss": 0.07580730319023132, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003845378232654184, + "grad_norm": 8.71864128112793, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8535927534103394, + "num_tokens": 687890448.0, + "step": 18027 + }, + { + "epoch": 2.2933469024297164, + "ewc_loss": 0.07758350670337677, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004022999492008239, + "grad_norm": 9.189270973205566, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8730490207672119, + "num_tokens": 687929509.0, + "step": 18028 + }, + { + "epoch": 2.293474112708307, + "ewc_loss": 0.07598558068275452, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038632063660770655, + "grad_norm": 8.759778022766113, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8545993566513062, + "num_tokens": 687966814.0, + "step": 18029 + }, + { + "epoch": 2.2936013229868975, + "ewc_loss": 0.07742677628993988, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004007326497230679, + "grad_norm": 9.143022537231445, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8712944984436035, + "num_tokens": 688006398.0, + "step": 18030 + }, + { + "epoch": 2.293728533265488, + "ewc_loss": 0.07612654566764832, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003877303097397089, + "grad_norm": 8.839673042297363, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8616370558738708, + "num_tokens": 688044604.0, + "step": 18031 + }, + { + "epoch": 2.2938557435440785, + "ewc_loss": 0.0772138386964798, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039860320976004004, + "grad_norm": 9.0882568359375, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.860779881477356, + "num_tokens": 688082969.0, + "step": 18032 + }, + { + "epoch": 2.293982953822669, + "ewc_loss": 0.0762329027056694, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003887938801199198, + "grad_norm": 8.850455284118652, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8716461062431335, + "num_tokens": 688121442.0, + "step": 18033 + }, + { + "epoch": 2.2941101641012596, + "ewc_loss": 0.07715190201997757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039798388024792075, + "grad_norm": 9.091358184814453, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8646857142448425, + "num_tokens": 688161096.0, + "step": 18034 + }, + { + "epoch": 2.2942373743798496, + "ewc_loss": 0.07629486918449402, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038941350067034364, + "grad_norm": 8.874929428100586, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8657761812210083, + "num_tokens": 688200782.0, + "step": 18035 + }, + { + "epoch": 2.2943645846584406, + "ewc_loss": 0.07691498845815659, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003956147120334208, + "grad_norm": 9.010881423950195, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8728341460227966, + "num_tokens": 688241333.0, + "step": 18036 + }, + { + "epoch": 2.2944917949370307, + "ewc_loss": 0.07658771425485611, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003923419862985611, + "grad_norm": 8.987177848815918, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8621898889541626, + "num_tokens": 688278297.0, + "step": 18037 + }, + { + "epoch": 2.2946190052156212, + "ewc_loss": 0.07664281129837036, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039289292180910707, + "grad_norm": 8.873868942260742, + "learning_rate": 1e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.8403641581535339, + "num_tokens": 688318835.0, + "step": 18038 + }, + { + "epoch": 2.2947462154942118, + "ewc_loss": 0.07674656808376312, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003939304733648896, + "grad_norm": 8.97021770477295, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8672736883163452, + "num_tokens": 688354814.0, + "step": 18039 + }, + { + "epoch": 2.2948734257728023, + "ewc_loss": 0.07636360079050064, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003901008458342403, + "grad_norm": 8.894651412963867, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8680852651596069, + "num_tokens": 688390008.0, + "step": 18040 + }, + { + "epoch": 2.295000636051393, + "ewc_loss": 0.0768372118473053, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039483694126829505, + "grad_norm": 8.943946838378906, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.865282416343689, + "num_tokens": 688430938.0, + "step": 18041 + }, + { + "epoch": 2.2951278463299833, + "ewc_loss": 0.07638698816299438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039033469511196017, + "grad_norm": 8.92569351196289, + "learning_rate": 1e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8335016369819641, + "num_tokens": 688466730.0, + "step": 18042 + }, + { + "epoch": 2.295255056608574, + "ewc_loss": 0.07662752270698547, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039274003938771784, + "grad_norm": 8.901100158691406, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8630414009094238, + "num_tokens": 688506666.0, + "step": 18043 + }, + { + "epoch": 2.2953822668871644, + "ewc_loss": 0.07653877139091492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039185257628560066, + "grad_norm": 8.927339553833008, + "learning_rate": 1e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8497908711433411, + "num_tokens": 688547726.0, + "step": 18044 + }, + { + "epoch": 2.295509477165755, + "ewc_loss": 0.07650569826364517, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039152184035629034, + "grad_norm": 8.916787147521973, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8608234524726868, + "num_tokens": 688587487.0, + "step": 18045 + }, + { + "epoch": 2.2956366874443455, + "ewc_loss": 0.07659581303596497, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039242292405106127, + "grad_norm": 8.869943618774414, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8580094575881958, + "num_tokens": 688632454.0, + "step": 18046 + }, + { + "epoch": 2.295763897722936, + "ewc_loss": 0.07663729786872864, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039283777005039155, + "grad_norm": 8.94153118133545, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8713993430137634, + "num_tokens": 688668489.0, + "step": 18047 + }, + { + "epoch": 2.2958911080015265, + "ewc_loss": 0.07645723223686218, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003910371451638639, + "grad_norm": 8.888638496398926, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8670175671577454, + "num_tokens": 688707899.0, + "step": 18048 + }, + { + "epoch": 2.296018318280117, + "ewc_loss": 0.07701106369495392, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003965754294767976, + "grad_norm": 8.94765567779541, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.869472861289978, + "num_tokens": 688747171.0, + "step": 18049 + }, + { + "epoch": 2.2961455285587076, + "ewc_loss": 0.07648350298404694, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039129986544139683, + "grad_norm": 8.872360229492188, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8627057075500488, + "num_tokens": 688789553.0, + "step": 18050 + }, + { + "epoch": 2.296272738837298, + "ewc_loss": 0.07671892642974854, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000393654074287042, + "grad_norm": 8.91573429107666, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8674603700637817, + "num_tokens": 688827632.0, + "step": 18051 + }, + { + "epoch": 2.2963999491158886, + "ewc_loss": 0.07661937177181244, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000392658548662439, + "grad_norm": 8.863252639770508, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8742517232894897, + "num_tokens": 688861505.0, + "step": 18052 + }, + { + "epoch": 2.296527159394479, + "ewc_loss": 0.07666870951652527, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039315191679634154, + "grad_norm": 8.945051193237305, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8562474250793457, + "num_tokens": 688898270.0, + "step": 18053 + }, + { + "epoch": 2.2966543696730697, + "ewc_loss": 0.07669267058372498, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003933915577363223, + "grad_norm": 8.809541702270508, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.864600419998169, + "num_tokens": 688937420.0, + "step": 18054 + }, + { + "epoch": 2.29678157995166, + "ewc_loss": 0.07700787484645844, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003965435898862779, + "grad_norm": 8.959623336791992, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8706351518630981, + "num_tokens": 688972426.0, + "step": 18055 + }, + { + "epoch": 2.2969087902302507, + "ewc_loss": 0.07653623819351196, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003918272559531033, + "grad_norm": 8.90479850769043, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8632713556289673, + "num_tokens": 689005801.0, + "step": 18056 + }, + { + "epoch": 2.2970360005088413, + "ewc_loss": 0.07696299254894257, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003960947797168046, + "grad_norm": 8.95380973815918, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8591779470443726, + "num_tokens": 689039999.0, + "step": 18057 + }, + { + "epoch": 2.297163210787432, + "ewc_loss": 0.07654912024736404, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039195604040287435, + "grad_norm": 8.82900333404541, + "learning_rate": 1e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8483361005783081, + "num_tokens": 689079404.0, + "step": 18058 + }, + { + "epoch": 2.2972904210660223, + "ewc_loss": 0.07692141085863113, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003956789441872388, + "grad_norm": 8.93802261352539, + "learning_rate": 1e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8414341807365417, + "num_tokens": 689116197.0, + "step": 18059 + }, + { + "epoch": 2.2974176313446124, + "ewc_loss": 0.07667452841997147, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000393210124457255, + "grad_norm": 8.862383842468262, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8642850518226624, + "num_tokens": 689153446.0, + "step": 18060 + }, + { + "epoch": 2.2975448416232034, + "ewc_loss": 0.07686229050159454, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003950877289753407, + "grad_norm": 8.91240406036377, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8474704623222351, + "num_tokens": 689185310.0, + "step": 18061 + }, + { + "epoch": 2.2976720519017935, + "ewc_loss": 0.07662954926490784, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039276035386137664, + "grad_norm": 8.806689262390137, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8735852241516113, + "num_tokens": 689224150.0, + "step": 18062 + }, + { + "epoch": 2.297799262180384, + "ewc_loss": 0.07706237584352493, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003970885882154107, + "grad_norm": 8.918169975280762, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8555852770805359, + "num_tokens": 689261206.0, + "step": 18063 + }, + { + "epoch": 2.2979264724589745, + "ewc_loss": 0.07651379704475403, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003916027781087905, + "grad_norm": 8.937573432922363, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8696524500846863, + "num_tokens": 689299129.0, + "step": 18064 + }, + { + "epoch": 2.298053682737565, + "ewc_loss": 0.07664680480957031, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039293288136832416, + "grad_norm": 8.88227653503418, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8804900646209717, + "num_tokens": 689330554.0, + "step": 18065 + }, + { + "epoch": 2.2981808930161556, + "ewc_loss": 0.07673178613185883, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039378268411383033, + "grad_norm": 8.872011184692383, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8702211976051331, + "num_tokens": 689364084.0, + "step": 18066 + }, + { + "epoch": 2.298308103294746, + "ewc_loss": 0.07661750167608261, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039263986400328577, + "grad_norm": 8.830793380737305, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8529577255249023, + "num_tokens": 689405254.0, + "step": 18067 + }, + { + "epoch": 2.2984353135733366, + "ewc_loss": 0.07687219977378845, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003951868275180459, + "grad_norm": 8.934030532836914, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8561025261878967, + "num_tokens": 689441109.0, + "step": 18068 + }, + { + "epoch": 2.298562523851927, + "ewc_loss": 0.0766448900103569, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003929137601517141, + "grad_norm": 8.840062141418457, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8620494604110718, + "num_tokens": 689474937.0, + "step": 18069 + }, + { + "epoch": 2.2986897341305177, + "ewc_loss": 0.07686532288789749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003951180842705071, + "grad_norm": 8.821125984191895, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.858910322189331, + "num_tokens": 689515971.0, + "step": 18070 + }, + { + "epoch": 2.298816944409108, + "ewc_loss": 0.07688625156879425, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000395327340811491, + "grad_norm": 8.90431022644043, + "learning_rate": 1e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.8408087491989136, + "num_tokens": 689553620.0, + "step": 18071 + }, + { + "epoch": 2.2989441546876987, + "ewc_loss": 0.07669433951377869, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039340826333500445, + "grad_norm": 8.836714744567871, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8678606748580933, + "num_tokens": 689592157.0, + "step": 18072 + }, + { + "epoch": 2.2990713649662893, + "ewc_loss": 0.07696141302585602, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039607894723303616, + "grad_norm": 8.881842613220215, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8768572807312012, + "num_tokens": 689626889.0, + "step": 18073 + }, + { + "epoch": 2.29919857524488, + "ewc_loss": 0.07667472958564758, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000393212161725387, + "grad_norm": 8.83862590789795, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.875290036201477, + "num_tokens": 689666161.0, + "step": 18074 + }, + { + "epoch": 2.2993257855234703, + "ewc_loss": 0.07693049311637878, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003957697772420943, + "grad_norm": 8.897638320922852, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8551046848297119, + "num_tokens": 689713713.0, + "step": 18075 + }, + { + "epoch": 2.299452995802061, + "ewc_loss": 0.07666737586259842, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039313858724199235, + "grad_norm": 8.817926406860352, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8556275367736816, + "num_tokens": 689755146.0, + "step": 18076 + }, + { + "epoch": 2.2995802060806514, + "ewc_loss": 0.0770406424999237, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003968712699133903, + "grad_norm": 8.916865348815918, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8746505975723267, + "num_tokens": 689795925.0, + "step": 18077 + }, + { + "epoch": 2.299707416359242, + "ewc_loss": 0.07676088809967041, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003940736933145672, + "grad_norm": 9.01539421081543, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8754233717918396, + "num_tokens": 689830673.0, + "step": 18078 + }, + { + "epoch": 2.2998346266378324, + "ewc_loss": 0.07645591348409653, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039102399023249745, + "grad_norm": 8.817229270935059, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.862834632396698, + "num_tokens": 689863649.0, + "step": 18079 + }, + { + "epoch": 2.299961836916423, + "ewc_loss": 0.07712487876415253, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039771359297446907, + "grad_norm": 8.884657859802246, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8625355958938599, + "num_tokens": 689901946.0, + "step": 18080 + }, + { + "epoch": 2.3000890471950135, + "ewc_loss": 0.07659296691417694, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003923944605048746, + "grad_norm": 8.818032264709473, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8581358194351196, + "num_tokens": 689939968.0, + "step": 18081 + }, + { + "epoch": 2.300216257473604, + "ewc_loss": 0.07700185477733612, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003964833449572325, + "grad_norm": 8.856165885925293, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8723505735397339, + "num_tokens": 689979792.0, + "step": 18082 + }, + { + "epoch": 2.3003434677521946, + "ewc_loss": 0.07671499252319336, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000393614755012095, + "grad_norm": 8.819482803344727, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8766077756881714, + "num_tokens": 690013974.0, + "step": 18083 + }, + { + "epoch": 2.300470678030785, + "ewc_loss": 0.07681913673877716, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039465626468881965, + "grad_norm": 8.848227500915527, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.87204909324646, + "num_tokens": 690056540.0, + "step": 18084 + }, + { + "epoch": 2.300597888309375, + "ewc_loss": 0.07691919803619385, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039565685437992215, + "grad_norm": 8.793057441711426, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8577179312705994, + "num_tokens": 690101032.0, + "step": 18085 + }, + { + "epoch": 2.300725098587966, + "ewc_loss": 0.07689271867275238, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039539203862659633, + "grad_norm": 8.86370849609375, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8770766258239746, + "num_tokens": 690135417.0, + "step": 18086 + }, + { + "epoch": 2.3008523088665562, + "ewc_loss": 0.07680636644363403, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039452852797694504, + "grad_norm": 8.86803150177002, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8660051822662354, + "num_tokens": 690178552.0, + "step": 18087 + }, + { + "epoch": 2.3009795191451468, + "ewc_loss": 0.07721248269081116, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0003961482143495232, + "grad_norm": 8.822433471679688, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8612919449806213, + "num_tokens": 690221928.0, + "step": 18088 + }, + { + "epoch": 2.3011067294237373, + "ewc_loss": 0.07721909880638123, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00039621442556381226, + "grad_norm": 8.909626960754395, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8706941604614258, + "num_tokens": 690257040.0, + "step": 18089 + }, + { + "epoch": 2.301233939702328, + "ewc_loss": 0.07675547152757645, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003940195601899177, + "grad_norm": 8.802781105041504, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8645098209381104, + "num_tokens": 690302720.0, + "step": 18090 + }, + { + "epoch": 2.3013611499809183, + "ewc_loss": 0.07716632634401321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039812808972783387, + "grad_norm": 8.930977821350098, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8579999208450317, + "num_tokens": 690337042.0, + "step": 18091 + }, + { + "epoch": 2.301488360259509, + "ewc_loss": 0.07671349495649338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003935997956432402, + "grad_norm": 8.78444766998291, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8708873987197876, + "num_tokens": 690380247.0, + "step": 18092 + }, + { + "epoch": 2.3016155705380994, + "ewc_loss": 0.07748784124851227, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004013432771898806, + "grad_norm": 8.899934768676758, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8671586513519287, + "num_tokens": 690425361.0, + "step": 18093 + }, + { + "epoch": 2.30174278081669, + "ewc_loss": 0.07665793597698212, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003930442326236516, + "grad_norm": 8.766583442687988, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8588957786560059, + "num_tokens": 690465137.0, + "step": 18094 + }, + { + "epoch": 2.3018699910952805, + "ewc_loss": 0.0773208886384964, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003996737068518996, + "grad_norm": 8.927804946899414, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8620740175247192, + "num_tokens": 690501437.0, + "step": 18095 + }, + { + "epoch": 2.301997201373871, + "ewc_loss": 0.07670814543962479, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039354630280286074, + "grad_norm": 8.76815128326416, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8590115308761597, + "num_tokens": 690544134.0, + "step": 18096 + }, + { + "epoch": 2.3021244116524615, + "ewc_loss": 0.07742805778980255, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040074536809697747, + "grad_norm": 8.961891174316406, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8725475668907166, + "num_tokens": 690581062.0, + "step": 18097 + }, + { + "epoch": 2.302251621931052, + "ewc_loss": 0.07678568363189697, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003943217161577195, + "grad_norm": 8.81978988647461, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8695151209831238, + "num_tokens": 690615145.0, + "step": 18098 + }, + { + "epoch": 2.3023788322096426, + "ewc_loss": 0.07740894705057144, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040055433055385947, + "grad_norm": 8.93283748626709, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8705366849899292, + "num_tokens": 690655483.0, + "step": 18099 + }, + { + "epoch": 2.302506042488233, + "ewc_loss": 0.07683275640010834, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003947924415115267, + "grad_norm": 8.76280689239502, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8615841269493103, + "num_tokens": 690694133.0, + "step": 18100 + }, + { + "epoch": 2.3026332527668236, + "ewc_loss": 0.07751590013504028, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040162389632314444, + "grad_norm": 8.974685668945312, + "learning_rate": 1e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.843498170375824, + "num_tokens": 690729939.0, + "step": 18101 + }, + { + "epoch": 2.302760463045414, + "ewc_loss": 0.07676894962787628, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003941543400287628, + "grad_norm": 8.79694938659668, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8698091506958008, + "num_tokens": 690770139.0, + "step": 18102 + }, + { + "epoch": 2.3028876733240047, + "ewc_loss": 0.07745354622602463, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040100031765177846, + "grad_norm": 8.912970542907715, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8666073083877563, + "num_tokens": 690811379.0, + "step": 18103 + }, + { + "epoch": 2.303014883602595, + "ewc_loss": 0.07685858011245728, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003950506797991693, + "grad_norm": 8.94019603729248, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8601022958755493, + "num_tokens": 690844194.0, + "step": 18104 + }, + { + "epoch": 2.3031420938811857, + "ewc_loss": 0.07700447738170624, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039650959661230445, + "grad_norm": 8.845765113830566, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8583676815032959, + "num_tokens": 690882129.0, + "step": 18105 + }, + { + "epoch": 2.3032693041597763, + "ewc_loss": 0.07708396762609482, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003973045386373997, + "grad_norm": 8.880256652832031, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8779471516609192, + "num_tokens": 690915568.0, + "step": 18106 + }, + { + "epoch": 2.303396514438367, + "ewc_loss": 0.07697150111198425, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003961798211093992, + "grad_norm": 8.883687973022461, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8609007000923157, + "num_tokens": 690953412.0, + "step": 18107 + }, + { + "epoch": 2.3035237247169573, + "ewc_loss": 0.07710372656583786, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039750209543854, + "grad_norm": 8.925224304199219, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8616553544998169, + "num_tokens": 690990047.0, + "step": 18108 + }, + { + "epoch": 2.303650934995548, + "ewc_loss": 0.07680031657218933, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039446805021725595, + "grad_norm": 8.804267883300781, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8706026673316956, + "num_tokens": 691031010.0, + "step": 18109 + }, + { + "epoch": 2.303778145274138, + "ewc_loss": 0.077132448554039, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003977893793489784, + "grad_norm": 8.886116027832031, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8662745952606201, + "num_tokens": 691065830.0, + "step": 18110 + }, + { + "epoch": 2.303905355552729, + "ewc_loss": 0.07699103653430939, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003963752242270857, + "grad_norm": 8.844454765319824, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8768793940544128, + "num_tokens": 691101418.0, + "step": 18111 + }, + { + "epoch": 2.304032565831319, + "ewc_loss": 0.07706214487552643, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039708625990897417, + "grad_norm": 8.914382934570312, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8483909368515015, + "num_tokens": 691139447.0, + "step": 18112 + }, + { + "epoch": 2.3041597761099095, + "ewc_loss": 0.07676683366298676, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039413318154402077, + "grad_norm": 8.911876678466797, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.844429612159729, + "num_tokens": 691168809.0, + "step": 18113 + }, + { + "epoch": 2.3042869863885, + "ewc_loss": 0.07680416107177734, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003945064090657979, + "grad_norm": 8.887472152709961, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8693047165870667, + "num_tokens": 691200420.0, + "step": 18114 + }, + { + "epoch": 2.3044141966670906, + "ewc_loss": 0.07680095732212067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039447436574846506, + "grad_norm": 8.827587127685547, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8584699630737305, + "num_tokens": 691238587.0, + "step": 18115 + }, + { + "epoch": 2.304541406945681, + "ewc_loss": 0.07675443589687347, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039400917012244463, + "grad_norm": 8.855392456054688, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8619869351387024, + "num_tokens": 691277137.0, + "step": 18116 + }, + { + "epoch": 2.3046686172242716, + "ewc_loss": 0.07688410580158234, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003953058912884444, + "grad_norm": 8.859834671020508, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8618743419647217, + "num_tokens": 691319564.0, + "step": 18117 + }, + { + "epoch": 2.304795827502862, + "ewc_loss": 0.07679033279418945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003943681949749589, + "grad_norm": 8.835680961608887, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.862483024597168, + "num_tokens": 691359652.0, + "step": 18118 + }, + { + "epoch": 2.3049230377814527, + "ewc_loss": 0.07685046643018723, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039496950921602547, + "grad_norm": 8.843355178833008, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8645535111427307, + "num_tokens": 691399705.0, + "step": 18119 + }, + { + "epoch": 2.305050248060043, + "ewc_loss": 0.07685776054859161, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003950424143113196, + "grad_norm": 8.85493278503418, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.856463611125946, + "num_tokens": 691439447.0, + "step": 18120 + }, + { + "epoch": 2.3051774583386337, + "ewc_loss": 0.07697466760873795, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003962115151807666, + "grad_norm": 8.844483375549316, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8527373671531677, + "num_tokens": 691477453.0, + "step": 18121 + }, + { + "epoch": 2.3053046686172243, + "ewc_loss": 0.07699371129274368, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000396401941543445, + "grad_norm": 8.909236907958984, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8644630908966064, + "num_tokens": 691517766.0, + "step": 18122 + }, + { + "epoch": 2.305431878895815, + "ewc_loss": 0.07677595317363739, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039422433474101126, + "grad_norm": 8.939204216003418, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8632079362869263, + "num_tokens": 691553956.0, + "step": 18123 + }, + { + "epoch": 2.3055590891744053, + "ewc_loss": 0.07662463188171387, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039271111018024385, + "grad_norm": 8.805608749389648, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8619800806045532, + "num_tokens": 691592522.0, + "step": 18124 + }, + { + "epoch": 2.305686299452996, + "ewc_loss": 0.07693329453468323, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003957977460231632, + "grad_norm": 8.918917655944824, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8642672300338745, + "num_tokens": 691629202.0, + "step": 18125 + }, + { + "epoch": 2.3058135097315864, + "ewc_loss": 0.07656529545783997, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003921177703887224, + "grad_norm": 8.79210376739502, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8671739101409912, + "num_tokens": 691665324.0, + "step": 18126 + }, + { + "epoch": 2.305940720010177, + "ewc_loss": 0.07713921368122101, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003978569875471294, + "grad_norm": 8.877653121948242, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8734960556030273, + "num_tokens": 691704223.0, + "step": 18127 + }, + { + "epoch": 2.3060679302887674, + "ewc_loss": 0.07671669125556946, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039363178075291216, + "grad_norm": 8.82999324798584, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8613695502281189, + "num_tokens": 691745061.0, + "step": 18128 + }, + { + "epoch": 2.306195140567358, + "ewc_loss": 0.07704637944698334, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003969286917708814, + "grad_norm": 8.918386459350586, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8756066560745239, + "num_tokens": 691783630.0, + "step": 18129 + }, + { + "epoch": 2.3063223508459485, + "ewc_loss": 0.07664240151643753, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003928888472728431, + "grad_norm": 8.839978218078613, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8627020716667175, + "num_tokens": 691818283.0, + "step": 18130 + }, + { + "epoch": 2.306449561124539, + "ewc_loss": 0.07704655826091766, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003969304671045393, + "grad_norm": 8.911707878112793, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8529018759727478, + "num_tokens": 691854585.0, + "step": 18131 + }, + { + "epoch": 2.3065767714031296, + "ewc_loss": 0.07664885371923447, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003929533704649657, + "grad_norm": 8.837994575500488, + "learning_rate": 1e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8471367955207825, + "num_tokens": 691891784.0, + "step": 18132 + }, + { + "epoch": 2.3067039816817196, + "ewc_loss": 0.07703602313995361, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039682508213445544, + "grad_norm": 8.929536819458008, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8660341501235962, + "num_tokens": 691927389.0, + "step": 18133 + }, + { + "epoch": 2.3068311919603106, + "ewc_loss": 0.07665573060512543, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003930221137125045, + "grad_norm": 8.796736717224121, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8614792227745056, + "num_tokens": 691962928.0, + "step": 18134 + }, + { + "epoch": 2.3069584022389007, + "ewc_loss": 0.07704950869083405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039695994928479195, + "grad_norm": 8.951404571533203, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8630424737930298, + "num_tokens": 692007064.0, + "step": 18135 + }, + { + "epoch": 2.3070856125174912, + "ewc_loss": 0.07658196985721588, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003922845353372395, + "grad_norm": 8.797114372253418, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8690680265426636, + "num_tokens": 692040668.0, + "step": 18136 + }, + { + "epoch": 2.3072128227960818, + "ewc_loss": 0.07715226709842682, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039798746001906693, + "grad_norm": 8.93355655670166, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8651739358901978, + "num_tokens": 692077710.0, + "step": 18137 + }, + { + "epoch": 2.3073400330746723, + "ewc_loss": 0.07659178972244263, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000392382760765031, + "grad_norm": 8.829934120178223, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8721940517425537, + "num_tokens": 692112316.0, + "step": 18138 + }, + { + "epoch": 2.307467243353263, + "ewc_loss": 0.07715413719415665, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039800620288588107, + "grad_norm": 8.936086654663086, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8693302869796753, + "num_tokens": 692153201.0, + "step": 18139 + }, + { + "epoch": 2.3075944536318533, + "ewc_loss": 0.07661549746990204, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003926198114641011, + "grad_norm": 8.829593658447266, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8569825887680054, + "num_tokens": 692192942.0, + "step": 18140 + }, + { + "epoch": 2.307721663910444, + "ewc_loss": 0.0770513117313385, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039697790634818375, + "grad_norm": 8.917642593383789, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8789652585983276, + "num_tokens": 692235515.0, + "step": 18141 + }, + { + "epoch": 2.3078488741890344, + "ewc_loss": 0.0767284706234932, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003937495348509401, + "grad_norm": 8.969423294067383, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8534591197967529, + "num_tokens": 692271338.0, + "step": 18142 + }, + { + "epoch": 2.307976084467625, + "ewc_loss": 0.0767441987991333, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003939068119507283, + "grad_norm": 8.891768455505371, + "learning_rate": 1e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8450970649719238, + "num_tokens": 692311469.0, + "step": 18143 + }, + { + "epoch": 2.3081032947462155, + "ewc_loss": 0.0768793448805809, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003952582774218172, + "grad_norm": 8.889618873596191, + "learning_rate": 1e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8457390666007996, + "num_tokens": 692352052.0, + "step": 18144 + }, + { + "epoch": 2.308230505024806, + "ewc_loss": 0.07675021886825562, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039396705687977374, + "grad_norm": 8.947015762329102, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8470189571380615, + "num_tokens": 692382978.0, + "step": 18145 + }, + { + "epoch": 2.3083577153033965, + "ewc_loss": 0.07676231861114502, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003940880123991519, + "grad_norm": 8.939188957214355, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8642199635505676, + "num_tokens": 692415651.0, + "step": 18146 + }, + { + "epoch": 2.308484925581987, + "ewc_loss": 0.07682512700557709, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039471607306040823, + "grad_norm": 8.89653491973877, + "learning_rate": 1e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8458980321884155, + "num_tokens": 692455937.0, + "step": 18147 + }, + { + "epoch": 2.3086121358605776, + "ewc_loss": 0.07689345628023148, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003953994018957019, + "grad_norm": 8.920997619628906, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8588889837265015, + "num_tokens": 692496277.0, + "step": 18148 + }, + { + "epoch": 2.308739346139168, + "ewc_loss": 0.07690715789794922, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039553639362566173, + "grad_norm": 8.930160522460938, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8634552955627441, + "num_tokens": 692530485.0, + "step": 18149 + }, + { + "epoch": 2.3088665564177586, + "ewc_loss": 0.07682447135448456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003947095829062164, + "grad_norm": 8.96026611328125, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8603845834732056, + "num_tokens": 692564794.0, + "step": 18150 + }, + { + "epoch": 2.308993766696349, + "ewc_loss": 0.07676760852336884, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003941409522667527, + "grad_norm": 8.931347846984863, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8538519144058228, + "num_tokens": 692610359.0, + "step": 18151 + }, + { + "epoch": 2.3091209769749397, + "ewc_loss": 0.07683461904525757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039481103885918856, + "grad_norm": 8.943948745727539, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8720173239707947, + "num_tokens": 692641560.0, + "step": 18152 + }, + { + "epoch": 2.30924818725353, + "ewc_loss": 0.07673020660877228, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003937668807338923, + "grad_norm": 8.892224311828613, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8605462312698364, + "num_tokens": 692680114.0, + "step": 18153 + }, + { + "epoch": 2.3093753975321207, + "ewc_loss": 0.07683226466178894, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039478749386034906, + "grad_norm": 8.922520637512207, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8500187397003174, + "num_tokens": 692714199.0, + "step": 18154 + }, + { + "epoch": 2.3095026078107113, + "ewc_loss": 0.07670213282108307, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039348614518530667, + "grad_norm": 8.838258743286133, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8666409254074097, + "num_tokens": 692752331.0, + "step": 18155 + }, + { + "epoch": 2.309629818089302, + "ewc_loss": 0.07695677876472473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003960326430387795, + "grad_norm": 9.01309585571289, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8687745332717896, + "num_tokens": 692789167.0, + "step": 18156 + }, + { + "epoch": 2.3097570283678923, + "ewc_loss": 0.07648903131484985, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003913551918230951, + "grad_norm": 8.841300964355469, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.86272132396698, + "num_tokens": 692829234.0, + "step": 18157 + }, + { + "epoch": 2.3098842386464824, + "ewc_loss": 0.07702174782752991, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003966822987422347, + "grad_norm": 9.038363456726074, + "learning_rate": 1e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8420950174331665, + "num_tokens": 692863749.0, + "step": 18158 + }, + { + "epoch": 2.3100114489250734, + "ewc_loss": 0.0762319266796112, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038878413033671677, + "grad_norm": 8.74295425415039, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8607497215270996, + "num_tokens": 692900972.0, + "step": 18159 + }, + { + "epoch": 2.3101386592036635, + "ewc_loss": 0.07741648703813553, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004006297094747424, + "grad_norm": 9.043769836425781, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.85715651512146, + "num_tokens": 692938522.0, + "step": 18160 + }, + { + "epoch": 2.310265869482254, + "ewc_loss": 0.0760846883058548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003873117675539106, + "grad_norm": 8.778956413269043, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8712755441665649, + "num_tokens": 692973653.0, + "step": 18161 + }, + { + "epoch": 2.3103930797608445, + "ewc_loss": 0.07732602208852768, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039972507511265576, + "grad_norm": 8.983619689941406, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8559218049049377, + "num_tokens": 693016325.0, + "step": 18162 + }, + { + "epoch": 2.310520290039435, + "ewc_loss": 0.07635213434696198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038998620584607124, + "grad_norm": 8.767908096313477, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8748769164085388, + "num_tokens": 693052022.0, + "step": 18163 + }, + { + "epoch": 2.3106475003180256, + "ewc_loss": 0.0772092342376709, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003985571675002575, + "grad_norm": 8.993368148803711, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8674426078796387, + "num_tokens": 693093936.0, + "step": 18164 + }, + { + "epoch": 2.310774710596616, + "ewc_loss": 0.07649224251508713, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003913872642442584, + "grad_norm": 8.823184967041016, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8772827386856079, + "num_tokens": 693130160.0, + "step": 18165 + }, + { + "epoch": 2.3109019208752066, + "ewc_loss": 0.07724010944366455, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003988659882452339, + "grad_norm": 8.988358497619629, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8661879897117615, + "num_tokens": 693172405.0, + "step": 18166 + }, + { + "epoch": 2.311029131153797, + "ewc_loss": 0.076529860496521, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039176340214908123, + "grad_norm": 8.868599891662598, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.866898238658905, + "num_tokens": 693206499.0, + "step": 18167 + }, + { + "epoch": 2.3111563414323877, + "ewc_loss": 0.07698922604322433, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003963571216445416, + "grad_norm": 9.017007827758789, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8625980615615845, + "num_tokens": 693244287.0, + "step": 18168 + }, + { + "epoch": 2.311283551710978, + "ewc_loss": 0.07635261118412018, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038999092066660523, + "grad_norm": 8.871545791625977, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8551349639892578, + "num_tokens": 693286164.0, + "step": 18169 + }, + { + "epoch": 2.3114107619895687, + "ewc_loss": 0.07702651619911194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003967300581280142, + "grad_norm": 8.919596672058105, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8700869679450989, + "num_tokens": 693330961.0, + "step": 18170 + }, + { + "epoch": 2.3115379722681593, + "ewc_loss": 0.07649578154087067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039142268360592425, + "grad_norm": 8.780448913574219, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8688684701919556, + "num_tokens": 693375169.0, + "step": 18171 + }, + { + "epoch": 2.31166518254675, + "ewc_loss": 0.07705388963222504, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003970036923419684, + "grad_norm": 9.003030776977539, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8582302331924438, + "num_tokens": 693414127.0, + "step": 18172 + }, + { + "epoch": 2.3117923928253403, + "ewc_loss": 0.07639668881893158, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003904317272827029, + "grad_norm": 8.82758617401123, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.860765278339386, + "num_tokens": 693450446.0, + "step": 18173 + }, + { + "epoch": 2.311919603103931, + "ewc_loss": 0.07715071737766266, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003979720058850944, + "grad_norm": 9.026586532592773, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8532912731170654, + "num_tokens": 693490973.0, + "step": 18174 + }, + { + "epoch": 2.3120468133825214, + "ewc_loss": 0.07629077136516571, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038937252247706056, + "grad_norm": 8.789006233215332, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8673044443130493, + "num_tokens": 693526815.0, + "step": 18175 + }, + { + "epoch": 2.312174023661112, + "ewc_loss": 0.07738403975963593, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040030520176514983, + "grad_norm": 9.011013984680176, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8571420907974243, + "num_tokens": 693566949.0, + "step": 18176 + }, + { + "epoch": 2.3123012339397024, + "ewc_loss": 0.07637470960617065, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039021193515509367, + "grad_norm": 8.803010940551758, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8560388684272766, + "num_tokens": 693601116.0, + "step": 18177 + }, + { + "epoch": 2.312428444218293, + "ewc_loss": 0.07729824632406235, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003994473081547767, + "grad_norm": 9.014724731445312, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8930271863937378, + "num_tokens": 693640588.0, + "step": 18178 + }, + { + "epoch": 2.3125556544968835, + "ewc_loss": 0.07649271190166473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003913919790647924, + "grad_norm": 8.823326110839844, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8648648858070374, + "num_tokens": 693683352.0, + "step": 18179 + }, + { + "epoch": 2.312682864775474, + "ewc_loss": 0.07665590941905975, + "ewc_loss_diag": 3.695487976074219e-05, + "ewc_loss_parallel": 0.00039790672599337995, + "grad_norm": 8.98257827758789, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8684185743331909, + "num_tokens": 693722579.0, + "step": 18180 + }, + { + "epoch": 2.3128100750540646, + "ewc_loss": 0.07664212584495544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003928860533051193, + "grad_norm": 8.86844253540039, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8642414808273315, + "num_tokens": 693766743.0, + "step": 18181 + }, + { + "epoch": 2.312937285332655, + "ewc_loss": 0.07708951085805893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003973599523305893, + "grad_norm": 8.941988945007324, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8662033081054688, + "num_tokens": 693809699.0, + "step": 18182 + }, + { + "epoch": 2.313064495611245, + "ewc_loss": 0.07656516134738922, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003921164898201823, + "grad_norm": 8.84199047088623, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8690303564071655, + "num_tokens": 693845508.0, + "step": 18183 + }, + { + "epoch": 2.313191705889836, + "ewc_loss": 0.07698015123605728, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039626634679734707, + "grad_norm": 8.960087776184082, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8632258176803589, + "num_tokens": 693883763.0, + "step": 18184 + }, + { + "epoch": 2.313318916168426, + "ewc_loss": 0.07667382806539536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039320311043411493, + "grad_norm": 8.873360633850098, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8687245845794678, + "num_tokens": 693917093.0, + "step": 18185 + }, + { + "epoch": 2.3134461264470167, + "ewc_loss": 0.07692500948905945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039571497472934425, + "grad_norm": 8.93175983428955, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.86536705493927, + "num_tokens": 693949219.0, + "step": 18186 + }, + { + "epoch": 2.3135733367256073, + "ewc_loss": 0.07672389596700668, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003937038127332926, + "grad_norm": 8.885482788085938, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8575648665428162, + "num_tokens": 693992755.0, + "step": 18187 + }, + { + "epoch": 2.313700547004198, + "ewc_loss": 0.07693640887737274, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003958289453294128, + "grad_norm": 8.989480972290039, + "learning_rate": 1e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8502550721168518, + "num_tokens": 694030386.0, + "step": 18188 + }, + { + "epoch": 2.3138277572827883, + "ewc_loss": 0.07661764323711395, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003926413191948086, + "grad_norm": 8.827229499816895, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.872545599937439, + "num_tokens": 694065019.0, + "step": 18189 + }, + { + "epoch": 2.313954967561379, + "ewc_loss": 0.07706178724765778, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003970827383454889, + "grad_norm": 8.95763874053955, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8612420558929443, + "num_tokens": 694106645.0, + "step": 18190 + }, + { + "epoch": 2.3140821778399694, + "ewc_loss": 0.07658080756664276, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039227292290888727, + "grad_norm": 8.796247482299805, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8679554462432861, + "num_tokens": 694149146.0, + "step": 18191 + }, + { + "epoch": 2.31420938811856, + "ewc_loss": 0.07726261019706726, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039909096085466444, + "grad_norm": 9.307464599609375, + "learning_rate": 1e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8478798866271973, + "num_tokens": 694193567.0, + "step": 18192 + }, + { + "epoch": 2.3143365983971504, + "ewc_loss": 0.07597926259040833, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000386257452191785, + "grad_norm": 8.712058067321777, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8751765489578247, + "num_tokens": 694232160.0, + "step": 18193 + }, + { + "epoch": 2.314463808675741, + "ewc_loss": 0.07793830335140228, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040584790986031294, + "grad_norm": 9.047484397888184, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.865900993347168, + "num_tokens": 694275153.0, + "step": 18194 + }, + { + "epoch": 2.3145910189543315, + "ewc_loss": 0.07630035281181335, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003894684195984155, + "grad_norm": 8.830830574035645, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8696338534355164, + "num_tokens": 694313722.0, + "step": 18195 + }, + { + "epoch": 2.314718229232922, + "ewc_loss": 0.07756927609443665, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040215760236606, + "grad_norm": 9.063358306884766, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8677098751068115, + "num_tokens": 694350544.0, + "step": 18196 + }, + { + "epoch": 2.3148454395115126, + "ewc_loss": 0.07642151415348053, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003906800120603293, + "grad_norm": 8.814376831054688, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8652786016464233, + "num_tokens": 694386552.0, + "step": 18197 + }, + { + "epoch": 2.314972649790103, + "ewc_loss": 0.07750272750854492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040149211417883635, + "grad_norm": 9.084376335144043, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8592073917388916, + "num_tokens": 694434368.0, + "step": 18198 + }, + { + "epoch": 2.3150998600686936, + "ewc_loss": 0.07644236832857132, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003908885410055518, + "grad_norm": 8.751845359802246, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.86381995677948, + "num_tokens": 694479856.0, + "step": 18199 + }, + { + "epoch": 2.315227070347284, + "ewc_loss": 0.07753212004899979, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004017860337626189, + "grad_norm": 9.038701057434082, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8635772466659546, + "num_tokens": 694521107.0, + "step": 18200 + }, + { + "epoch": 2.3153542806258747, + "ewc_loss": 0.07647423446178436, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039120722794905305, + "grad_norm": 8.813566207885742, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8635740876197815, + "num_tokens": 694560886.0, + "step": 18201 + }, + { + "epoch": 2.315481490904465, + "ewc_loss": 0.07743917405605316, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008565447293222, + "grad_norm": 8.951619148254395, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.867720901966095, + "num_tokens": 694598404.0, + "step": 18202 + }, + { + "epoch": 2.3156087011830557, + "ewc_loss": 0.07675734907388687, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003940383321605623, + "grad_norm": 8.868749618530273, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8581821918487549, + "num_tokens": 694637162.0, + "step": 18203 + }, + { + "epoch": 2.3157359114616463, + "ewc_loss": 0.07729395478963852, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039940440910868347, + "grad_norm": 9.00890064239502, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8670106530189514, + "num_tokens": 694677435.0, + "step": 18204 + }, + { + "epoch": 2.315863121740237, + "ewc_loss": 0.0767110139131546, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003935749700758606, + "grad_norm": 8.85932445526123, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8728288412094116, + "num_tokens": 694713980.0, + "step": 18205 + }, + { + "epoch": 2.3159903320188273, + "ewc_loss": 0.07731062173843384, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039957111584953964, + "grad_norm": 8.967206001281738, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8604286313056946, + "num_tokens": 694753884.0, + "step": 18206 + }, + { + "epoch": 2.316117542297418, + "ewc_loss": 0.07680024951696396, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039446732262149453, + "grad_norm": 8.89623737335205, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8767251968383789, + "num_tokens": 694785739.0, + "step": 18207 + }, + { + "epoch": 2.316244752576008, + "ewc_loss": 0.07712312042713165, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003976960142608732, + "grad_norm": 8.84391975402832, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8634970188140869, + "num_tokens": 694827629.0, + "step": 18208 + }, + { + "epoch": 2.316371962854599, + "ewc_loss": 0.07718836516141891, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003983484930358827, + "grad_norm": 8.904496192932129, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8601958751678467, + "num_tokens": 694868241.0, + "step": 18209 + }, + { + "epoch": 2.316499173133189, + "ewc_loss": 0.07688947021961212, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003953595587518066, + "grad_norm": 8.898204803466797, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8530746698379517, + "num_tokens": 694905298.0, + "step": 18210 + }, + { + "epoch": 2.3166263834117795, + "ewc_loss": 0.07716448605060577, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039810972521081567, + "grad_norm": 8.900362014770508, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.855830192565918, + "num_tokens": 694938054.0, + "step": 18211 + }, + { + "epoch": 2.31675359369037, + "ewc_loss": 0.07699082791805267, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003963731287512928, + "grad_norm": 8.880168914794922, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8587791919708252, + "num_tokens": 694978761.0, + "step": 18212 + }, + { + "epoch": 2.3168808039689606, + "ewc_loss": 0.07714395970106125, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039790442679077387, + "grad_norm": 8.890149116516113, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8596674203872681, + "num_tokens": 695015073.0, + "step": 18213 + }, + { + "epoch": 2.317008014247551, + "ewc_loss": 0.07714428007602692, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000397907686419785, + "grad_norm": 8.884331703186035, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8582197427749634, + "num_tokens": 695052288.0, + "step": 18214 + }, + { + "epoch": 2.3171352245261416, + "ewc_loss": 0.07700987160205841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003965635260101408, + "grad_norm": 8.89138412475586, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8517526388168335, + "num_tokens": 695088976.0, + "step": 18215 + }, + { + "epoch": 2.317262434804732, + "ewc_loss": 0.07714613527059555, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003979261964559555, + "grad_norm": 8.859501838684082, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8679072856903076, + "num_tokens": 695128482.0, + "step": 18216 + }, + { + "epoch": 2.3173896450833227, + "ewc_loss": 0.07719603180885315, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039842521073296666, + "grad_norm": 8.899502754211426, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8600096702575684, + "num_tokens": 695164963.0, + "step": 18217 + }, + { + "epoch": 2.317516855361913, + "ewc_loss": 0.07702988386154175, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003967636439483613, + "grad_norm": 8.843674659729004, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8852553367614746, + "num_tokens": 695201515.0, + "step": 18218 + }, + { + "epoch": 2.3176440656405037, + "ewc_loss": 0.07735291123390198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003999939654022455, + "grad_norm": 8.916792869567871, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8688103556632996, + "num_tokens": 695242057.0, + "step": 18219 + }, + { + "epoch": 2.3177712759190943, + "ewc_loss": 0.0771491602063179, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039795643533580005, + "grad_norm": 8.849048614501953, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8649584054946899, + "num_tokens": 695277128.0, + "step": 18220 + }, + { + "epoch": 2.317898486197685, + "ewc_loss": 0.07743784785270691, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008433606941253, + "grad_norm": 8.974546432495117, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8769855499267578, + "num_tokens": 695311984.0, + "step": 18221 + }, + { + "epoch": 2.3180256964762753, + "ewc_loss": 0.07702817022800446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039674650179222226, + "grad_norm": 8.843425750732422, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8510571718215942, + "num_tokens": 695348457.0, + "step": 18222 + }, + { + "epoch": 2.318152906754866, + "ewc_loss": 0.0775064080953598, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004015289305243641, + "grad_norm": 9.013814926147461, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8657485246658325, + "num_tokens": 695386518.0, + "step": 18223 + }, + { + "epoch": 2.3182801170334564, + "ewc_loss": 0.07706929743289948, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039715785533189774, + "grad_norm": 8.867105484008789, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8598208427429199, + "num_tokens": 695425365.0, + "step": 18224 + }, + { + "epoch": 2.318407327312047, + "ewc_loss": 0.07748736441135406, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040133847505785525, + "grad_norm": 8.951869010925293, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8648479580879211, + "num_tokens": 695459559.0, + "step": 18225 + }, + { + "epoch": 2.3185345375906374, + "ewc_loss": 0.077003613114357, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039650092367082834, + "grad_norm": 8.923689842224121, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.858407735824585, + "num_tokens": 695491386.0, + "step": 18226 + }, + { + "epoch": 2.318661747869228, + "ewc_loss": 0.07724836468696594, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003989485267084092, + "grad_norm": 8.94129467010498, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8453896641731262, + "num_tokens": 695534660.0, + "step": 18227 + }, + { + "epoch": 2.3187889581478185, + "ewc_loss": 0.07708821445703506, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039734700112603605, + "grad_norm": 8.852473258972168, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8561902046203613, + "num_tokens": 695578127.0, + "step": 18228 + }, + { + "epoch": 2.318916168426409, + "ewc_loss": 0.07734265923500061, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000399891403503716, + "grad_norm": 9.022436141967773, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8499075770378113, + "num_tokens": 695614181.0, + "step": 18229 + }, + { + "epoch": 2.3190433787049995, + "ewc_loss": 0.07678505778312683, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039431542973034084, + "grad_norm": 8.816859245300293, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8624885082244873, + "num_tokens": 695652753.0, + "step": 18230 + }, + { + "epoch": 2.3191705889835896, + "ewc_loss": 0.07749354094266891, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004014002624899149, + "grad_norm": 9.013641357421875, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.869906485080719, + "num_tokens": 695692449.0, + "step": 18231 + }, + { + "epoch": 2.3192977992621806, + "ewc_loss": 0.0767754316329956, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003942191251553595, + "grad_norm": 8.903035163879395, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8645998239517212, + "num_tokens": 695730812.0, + "step": 18232 + }, + { + "epoch": 2.3194250095407707, + "ewc_loss": 0.0772651731967926, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003991165431216359, + "grad_norm": 8.97260856628418, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.878627359867096, + "num_tokens": 695767577.0, + "step": 18233 + }, + { + "epoch": 2.319552219819361, + "ewc_loss": 0.07687497138977051, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003952145925723016, + "grad_norm": 8.905953407287598, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8626791834831238, + "num_tokens": 695812372.0, + "step": 18234 + }, + { + "epoch": 2.3196794300979517, + "ewc_loss": 0.07707537710666656, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003972185659222305, + "grad_norm": 8.948500633239746, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8811031579971313, + "num_tokens": 695847470.0, + "step": 18235 + }, + { + "epoch": 2.3198066403765423, + "ewc_loss": 0.07693611085414886, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003958259185310453, + "grad_norm": 8.914233207702637, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8801242113113403, + "num_tokens": 695888082.0, + "step": 18236 + }, + { + "epoch": 2.319933850655133, + "ewc_loss": 0.07700155675411224, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039648046367801726, + "grad_norm": 8.90274715423584, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8720360994338989, + "num_tokens": 695928904.0, + "step": 18237 + }, + { + "epoch": 2.3200610609337233, + "ewc_loss": 0.07709266990423203, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003973915590904653, + "grad_norm": 8.965689659118652, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8515737056732178, + "num_tokens": 695967549.0, + "step": 18238 + }, + { + "epoch": 2.320188271212314, + "ewc_loss": 0.07689551264047623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003954199783038348, + "grad_norm": 8.936919212341309, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8584862947463989, + "num_tokens": 696005611.0, + "step": 18239 + }, + { + "epoch": 2.3203154814909044, + "ewc_loss": 0.0770411342382431, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003968762175645679, + "grad_norm": 8.986763954162598, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8670383095741272, + "num_tokens": 696034864.0, + "step": 18240 + }, + { + "epoch": 2.320442691769495, + "ewc_loss": 0.07684770971536636, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039494194788858294, + "grad_norm": 8.879619598388672, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8605793714523315, + "num_tokens": 696072137.0, + "step": 18241 + }, + { + "epoch": 2.3205699020480854, + "ewc_loss": 0.0770055502653122, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003965203941334039, + "grad_norm": 8.965092658996582, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8660929203033447, + "num_tokens": 696115744.0, + "step": 18242 + }, + { + "epoch": 2.320697112326676, + "ewc_loss": 0.07672528922557831, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039371769526042044, + "grad_norm": 8.897260665893555, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8585837483406067, + "num_tokens": 696151293.0, + "step": 18243 + }, + { + "epoch": 2.3208243226052665, + "ewc_loss": 0.07703264057636261, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039679123437963426, + "grad_norm": 8.9201021194458, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.881271481513977, + "num_tokens": 696190742.0, + "step": 18244 + }, + { + "epoch": 2.320951532883857, + "ewc_loss": 0.07671050727367401, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039356996421702206, + "grad_norm": 8.864572525024414, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8520721793174744, + "num_tokens": 696232807.0, + "step": 18245 + }, + { + "epoch": 2.3210787431624476, + "ewc_loss": 0.07708923518657684, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039735715836286545, + "grad_norm": 8.932868957519531, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8611184358596802, + "num_tokens": 696269416.0, + "step": 18246 + }, + { + "epoch": 2.321205953441038, + "ewc_loss": 0.07673883438110352, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039385323179885745, + "grad_norm": 8.87579345703125, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8662739992141724, + "num_tokens": 696305144.0, + "step": 18247 + }, + { + "epoch": 2.3213331637196286, + "ewc_loss": 0.07711029797792435, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039756784099154174, + "grad_norm": 8.892326354980469, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8547062873840332, + "num_tokens": 696344854.0, + "step": 18248 + }, + { + "epoch": 2.321460373998219, + "ewc_loss": 0.07702305912971497, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003966954245697707, + "grad_norm": 8.918326377868652, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8831059336662292, + "num_tokens": 696380154.0, + "step": 18249 + }, + { + "epoch": 2.3215875842768097, + "ewc_loss": 0.0770413875579834, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039687869139015675, + "grad_norm": 8.942651748657227, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8496277332305908, + "num_tokens": 696416458.0, + "step": 18250 + }, + { + "epoch": 2.3217147945554, + "ewc_loss": 0.07703432440757751, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039680805639363825, + "grad_norm": 8.922951698303223, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8641241192817688, + "num_tokens": 696453232.0, + "step": 18251 + }, + { + "epoch": 2.3218420048339907, + "ewc_loss": 0.07713261991739273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039779103826731443, + "grad_norm": 8.962507247924805, + "learning_rate": 1e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8432931900024414, + "num_tokens": 696487930.0, + "step": 18252 + }, + { + "epoch": 2.3219692151125813, + "ewc_loss": 0.07707703113555908, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039723512600176036, + "grad_norm": 8.96163558959961, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8541814088821411, + "num_tokens": 696523824.0, + "step": 18253 + }, + { + "epoch": 2.322096425391172, + "ewc_loss": 0.07701794058084488, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039664426003582776, + "grad_norm": 8.875569343566895, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8735544085502625, + "num_tokens": 696567007.0, + "step": 18254 + }, + { + "epoch": 2.3222236356697623, + "ewc_loss": 0.07709410786628723, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039740587817505, + "grad_norm": 8.873429298400879, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8802379369735718, + "num_tokens": 696610055.0, + "step": 18255 + }, + { + "epoch": 2.3223508459483524, + "ewc_loss": 0.07704402506351471, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039690511766821146, + "grad_norm": 8.970894813537598, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8486508131027222, + "num_tokens": 696644058.0, + "step": 18256 + }, + { + "epoch": 2.3224780562269434, + "ewc_loss": 0.07691779732704163, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000395642826333642, + "grad_norm": 8.931336402893066, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8712708950042725, + "num_tokens": 696686381.0, + "step": 18257 + }, + { + "epoch": 2.3226052665055335, + "ewc_loss": 0.07701350748538971, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039659987669438124, + "grad_norm": 8.913567543029785, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8588517904281616, + "num_tokens": 696727567.0, + "step": 18258 + }, + { + "epoch": 2.322732476784124, + "ewc_loss": 0.07681602239608765, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003946250944864005, + "grad_norm": 9.003140449523926, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8544527292251587, + "num_tokens": 696767266.0, + "step": 18259 + }, + { + "epoch": 2.3228596870627145, + "ewc_loss": 0.07679127901792526, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003943776246160269, + "grad_norm": 8.866988182067871, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8771832585334778, + "num_tokens": 696806059.0, + "step": 18260 + }, + { + "epoch": 2.322986897341305, + "ewc_loss": 0.07727225124835968, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039918735274113715, + "grad_norm": 9.015398979187012, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8610256314277649, + "num_tokens": 696844232.0, + "step": 18261 + }, + { + "epoch": 2.3231141076198956, + "ewc_loss": 0.07672051340341568, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039366999408230186, + "grad_norm": 8.889300346374512, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8661682605743408, + "num_tokens": 696886223.0, + "step": 18262 + }, + { + "epoch": 2.323241317898486, + "ewc_loss": 0.07738757878541946, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004003406211268157, + "grad_norm": 9.058799743652344, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8503309488296509, + "num_tokens": 696922625.0, + "step": 18263 + }, + { + "epoch": 2.3233685281770766, + "ewc_loss": 0.07673145830631256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039377945358864963, + "grad_norm": 8.851545333862305, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8512998819351196, + "num_tokens": 696965160.0, + "step": 18264 + }, + { + "epoch": 2.323495738455667, + "ewc_loss": 0.077328622341156, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003997510939370841, + "grad_norm": 9.085807800292969, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8688519597053528, + "num_tokens": 697000854.0, + "step": 18265 + }, + { + "epoch": 2.3236229487342577, + "ewc_loss": 0.076697438955307, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039343920070677996, + "grad_norm": 8.857510566711426, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.878597617149353, + "num_tokens": 697041264.0, + "step": 18266 + }, + { + "epoch": 2.323750159012848, + "ewc_loss": 0.07737106084823608, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040017548599280417, + "grad_norm": 9.019050598144531, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8608428239822388, + "num_tokens": 697080079.0, + "step": 18267 + }, + { + "epoch": 2.3238773692914387, + "ewc_loss": 0.07676684111356735, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039413326885551214, + "grad_norm": 8.862944602966309, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8592153787612915, + "num_tokens": 697123755.0, + "step": 18268 + }, + { + "epoch": 2.3240045795700293, + "ewc_loss": 0.07790544629096985, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004006364615634084, + "grad_norm": 9.158708572387695, + "learning_rate": 1e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8347920179367065, + "num_tokens": 697162012.0, + "step": 18269 + }, + { + "epoch": 2.32413178984862, + "ewc_loss": 0.07635680586099625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039003288839012384, + "grad_norm": 8.760879516601562, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8727627992630005, + "num_tokens": 697200188.0, + "step": 18270 + }, + { + "epoch": 2.3242590001272103, + "ewc_loss": 0.07775509357452393, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004040157946292311, + "grad_norm": 9.12546443939209, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8668117523193359, + "num_tokens": 697240536.0, + "step": 18271 + }, + { + "epoch": 2.324386210405801, + "ewc_loss": 0.07621979713439941, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003886628255713731, + "grad_norm": 8.754504203796387, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8750434517860413, + "num_tokens": 697279459.0, + "step": 18272 + }, + { + "epoch": 2.3245134206843914, + "ewc_loss": 0.07784944772720337, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040495936991646886, + "grad_norm": 9.0429105758667, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8817642331123352, + "num_tokens": 697312594.0, + "step": 18273 + }, + { + "epoch": 2.324640630962982, + "ewc_loss": 0.07648128271102905, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039127771742641926, + "grad_norm": 8.767821311950684, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8487908840179443, + "num_tokens": 697357834.0, + "step": 18274 + }, + { + "epoch": 2.3247678412415724, + "ewc_loss": 0.0777229592204094, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040369443013332784, + "grad_norm": 9.045896530151367, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8627417683601379, + "num_tokens": 697401522.0, + "step": 18275 + }, + { + "epoch": 2.324895051520163, + "ewc_loss": 0.07660377770662308, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003925026103388518, + "grad_norm": 8.785672187805176, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8614333271980286, + "num_tokens": 697440415.0, + "step": 18276 + }, + { + "epoch": 2.3250222617987535, + "ewc_loss": 0.07772514969110489, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040371634531766176, + "grad_norm": 9.097962379455566, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.846815824508667, + "num_tokens": 697477609.0, + "step": 18277 + }, + { + "epoch": 2.325149472077344, + "ewc_loss": 0.07665437459945679, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003930085804313421, + "grad_norm": 8.827083587646484, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8589344024658203, + "num_tokens": 697515566.0, + "step": 18278 + }, + { + "epoch": 2.3252766823559345, + "ewc_loss": 0.07772611081600189, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040372597868554294, + "grad_norm": 9.027159690856934, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8678908348083496, + "num_tokens": 697549537.0, + "step": 18279 + }, + { + "epoch": 2.325403892634525, + "ewc_loss": 0.0767417848110199, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003938827430829406, + "grad_norm": 8.807476997375488, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8685911893844604, + "num_tokens": 697582618.0, + "step": 18280 + }, + { + "epoch": 2.325531102913115, + "ewc_loss": 0.0776582658290863, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040304751018993556, + "grad_norm": 8.9828519821167, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8767486214637756, + "num_tokens": 697622234.0, + "step": 18281 + }, + { + "epoch": 2.325658313191706, + "ewc_loss": 0.07684896886348724, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003949545498471707, + "grad_norm": 8.92952823638916, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8556554317474365, + "num_tokens": 697657504.0, + "step": 18282 + }, + { + "epoch": 2.325785523470296, + "ewc_loss": 0.07712823152542114, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003977471496909857, + "grad_norm": 8.886246681213379, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8504558801651001, + "num_tokens": 697699496.0, + "step": 18283 + }, + { + "epoch": 2.3259127337488867, + "ewc_loss": 0.0770334005355835, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000396798801375553, + "grad_norm": 8.84382438659668, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8769829273223877, + "num_tokens": 697739969.0, + "step": 18284 + }, + { + "epoch": 2.3260399440274773, + "ewc_loss": 0.07722393423318863, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003987042000517249, + "grad_norm": 8.927769660949707, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.852423906326294, + "num_tokens": 697774346.0, + "step": 18285 + }, + { + "epoch": 2.326167154306068, + "ewc_loss": 0.07711967825889587, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039766167174093425, + "grad_norm": 8.870386123657227, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8489221930503845, + "num_tokens": 697813013.0, + "step": 18286 + }, + { + "epoch": 2.3262943645846583, + "ewc_loss": 0.07732447981834412, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003997095918748528, + "grad_norm": 8.9208984375, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8567699790000916, + "num_tokens": 697854163.0, + "step": 18287 + }, + { + "epoch": 2.326421574863249, + "ewc_loss": 0.07717981934547424, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039826298598200083, + "grad_norm": 8.869181632995605, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8714950084686279, + "num_tokens": 697898049.0, + "step": 18288 + }, + { + "epoch": 2.3265487851418394, + "ewc_loss": 0.0772135853767395, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003986007359344512, + "grad_norm": 8.87571907043457, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8666049242019653, + "num_tokens": 697936086.0, + "step": 18289 + }, + { + "epoch": 2.32667599542043, + "ewc_loss": 0.07726102322340012, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039907507016323507, + "grad_norm": 8.906087875366211, + "learning_rate": 1e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8398557305335999, + "num_tokens": 697981529.0, + "step": 18290 + }, + { + "epoch": 2.3268032056990204, + "ewc_loss": 0.0770980566740036, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003974454302806407, + "grad_norm": 8.844971656799316, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8802654147148132, + "num_tokens": 698016047.0, + "step": 18291 + }, + { + "epoch": 2.326930415977611, + "ewc_loss": 0.07747594267129898, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000401224271627143, + "grad_norm": 8.93540096282959, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8754639625549316, + "num_tokens": 698052408.0, + "step": 18292 + }, + { + "epoch": 2.3270576262562015, + "ewc_loss": 0.0770682692527771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003971475816797465, + "grad_norm": 8.85405445098877, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8559671640396118, + "num_tokens": 698089751.0, + "step": 18293 + }, + { + "epoch": 2.327184836534792, + "ewc_loss": 0.07738211005926132, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004002859350293875, + "grad_norm": 8.946086883544922, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8619804382324219, + "num_tokens": 698121274.0, + "step": 18294 + }, + { + "epoch": 2.3273120468133826, + "ewc_loss": 0.07714531570672989, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003979179891757667, + "grad_norm": 8.8350248336792, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8739180564880371, + "num_tokens": 698155880.0, + "step": 18295 + }, + { + "epoch": 2.327439257091973, + "ewc_loss": 0.07765655219554901, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040303036803379655, + "grad_norm": 8.987793922424316, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8667420148849487, + "num_tokens": 698191448.0, + "step": 18296 + }, + { + "epoch": 2.3275664673705636, + "ewc_loss": 0.07697875052690506, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003962523478548974, + "grad_norm": 8.962859153747559, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.858769953250885, + "num_tokens": 698228437.0, + "step": 18297 + }, + { + "epoch": 2.327693677649154, + "ewc_loss": 0.07726241648197174, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003990890400018543, + "grad_norm": 8.87125015258789, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8547896146774292, + "num_tokens": 698268714.0, + "step": 18298 + }, + { + "epoch": 2.3278208879277447, + "ewc_loss": 0.07717987895011902, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003982636262662709, + "grad_norm": 8.861567497253418, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8632823824882507, + "num_tokens": 698305575.0, + "step": 18299 + }, + { + "epoch": 2.327948098206335, + "ewc_loss": 0.07722015678882599, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039866642327979207, + "grad_norm": 8.9092378616333, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8605788350105286, + "num_tokens": 698347701.0, + "step": 18300 + }, + { + "epoch": 2.3280753084849257, + "ewc_loss": 0.07716495543718338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039811438182368875, + "grad_norm": 8.878171920776367, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8798481225967407, + "num_tokens": 698385906.0, + "step": 18301 + }, + { + "epoch": 2.3282025187635162, + "ewc_loss": 0.07729588449001312, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003994236758444458, + "grad_norm": 8.929831504821777, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8629559278488159, + "num_tokens": 698415511.0, + "step": 18302 + }, + { + "epoch": 2.3283297290421068, + "ewc_loss": 0.07708682119846344, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039733308949507773, + "grad_norm": 8.82699966430664, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8587484359741211, + "num_tokens": 698460528.0, + "step": 18303 + }, + { + "epoch": 2.3284569393206973, + "ewc_loss": 0.07735845446586609, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004000493499916047, + "grad_norm": 8.927755355834961, + "learning_rate": 1e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8481060266494751, + "num_tokens": 698500800.0, + "step": 18304 + }, + { + "epoch": 2.328584149599288, + "ewc_loss": 0.07706629484891891, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039712779107503593, + "grad_norm": 8.805520057678223, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8627688884735107, + "num_tokens": 698539373.0, + "step": 18305 + }, + { + "epoch": 2.328711359877878, + "ewc_loss": 0.07743512839078903, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008161195088178, + "grad_norm": 8.871992111206055, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8840864896774292, + "num_tokens": 698577067.0, + "step": 18306 + }, + { + "epoch": 2.328838570156469, + "ewc_loss": 0.07717226445674896, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039818749064579606, + "grad_norm": 8.984699249267578, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.863049328327179, + "num_tokens": 698613515.0, + "step": 18307 + }, + { + "epoch": 2.328965780435059, + "ewc_loss": 0.07717563211917877, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039822119288146496, + "grad_norm": 8.868304252624512, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8650075197219849, + "num_tokens": 698647278.0, + "step": 18308 + }, + { + "epoch": 2.3290929907136495, + "ewc_loss": 0.0774153470993042, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004006183007732034, + "grad_norm": 8.926054954528809, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8720791339874268, + "num_tokens": 698688738.0, + "step": 18309 + }, + { + "epoch": 2.32922020099224, + "ewc_loss": 0.07705448567867279, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003970096877310425, + "grad_norm": 8.8168306350708, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8672295808792114, + "num_tokens": 698725003.0, + "step": 18310 + }, + { + "epoch": 2.3293474112708306, + "ewc_loss": 0.07755975425243378, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004020624328404665, + "grad_norm": 8.885106086730957, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8464693427085876, + "num_tokens": 698768205.0, + "step": 18311 + }, + { + "epoch": 2.329474621549421, + "ewc_loss": 0.07727479934692383, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039921284769661725, + "grad_norm": 8.892796516418457, + "learning_rate": 1e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8438106775283813, + "num_tokens": 698805352.0, + "step": 18312 + }, + { + "epoch": 2.3296018318280116, + "ewc_loss": 0.0775187760591507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004016525926999748, + "grad_norm": 8.913952827453613, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8743381500244141, + "num_tokens": 698836610.0, + "step": 18313 + }, + { + "epoch": 2.329729042106602, + "ewc_loss": 0.07749506831169128, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004014155128970742, + "grad_norm": 8.94682502746582, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8722485303878784, + "num_tokens": 698877514.0, + "step": 18314 + }, + { + "epoch": 2.3298562523851927, + "ewc_loss": 0.07720116525888443, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003984764916822314, + "grad_norm": 8.892599105834961, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8665251731872559, + "num_tokens": 698916699.0, + "step": 18315 + }, + { + "epoch": 2.329983462663783, + "ewc_loss": 0.07740940898656845, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040055892895907164, + "grad_norm": 8.91933536529541, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8652535676956177, + "num_tokens": 698953568.0, + "step": 18316 + }, + { + "epoch": 2.3301106729423737, + "ewc_loss": 0.07728156447410583, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003992805432062596, + "grad_norm": 8.904997825622559, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8590863943099976, + "num_tokens": 698990562.0, + "step": 18317 + }, + { + "epoch": 2.3302378832209643, + "ewc_loss": 0.07738475501537323, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004003124195151031, + "grad_norm": 8.88726806640625, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8662585616111755, + "num_tokens": 699024095.0, + "step": 18318 + }, + { + "epoch": 2.330365093499555, + "ewc_loss": 0.0773770809173584, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004002356727141887, + "grad_norm": 8.928528785705566, + "learning_rate": 1e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8438570499420166, + "num_tokens": 699062327.0, + "step": 18319 + }, + { + "epoch": 2.3304923037781453, + "ewc_loss": 0.07726088911294937, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039907373138703406, + "grad_norm": 8.921284675598145, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8682399988174438, + "num_tokens": 699101591.0, + "step": 18320 + }, + { + "epoch": 2.330619514056736, + "ewc_loss": 0.07713928818702698, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003978577733505517, + "grad_norm": 8.93826675415039, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8548917770385742, + "num_tokens": 699134836.0, + "step": 18321 + }, + { + "epoch": 2.3307467243353264, + "ewc_loss": 0.07705119252204895, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003969767421949655, + "grad_norm": 8.919665336608887, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8544697761535645, + "num_tokens": 699171218.0, + "step": 18322 + }, + { + "epoch": 2.330873934613917, + "ewc_loss": 0.07716389000415802, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003981037298217416, + "grad_norm": 8.948273658752441, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8818303942680359, + "num_tokens": 699200891.0, + "step": 18323 + }, + { + "epoch": 2.3310011448925074, + "ewc_loss": 0.07714007794857025, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039786563138477504, + "grad_norm": 8.937296867370605, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8792766332626343, + "num_tokens": 699238080.0, + "step": 18324 + }, + { + "epoch": 2.331128355171098, + "ewc_loss": 0.07702739536762238, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039673884748481214, + "grad_norm": 8.81668758392334, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8526357412338257, + "num_tokens": 699276051.0, + "step": 18325 + }, + { + "epoch": 2.3312555654496885, + "ewc_loss": 0.07741241157054901, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000400588964112103, + "grad_norm": 8.898771286010742, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8711229562759399, + "num_tokens": 699315721.0, + "step": 18326 + }, + { + "epoch": 2.331382775728279, + "ewc_loss": 0.0768873393535614, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003953382547479123, + "grad_norm": 8.831802368164062, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8745496273040771, + "num_tokens": 699350342.0, + "step": 18327 + }, + { + "epoch": 2.3315099860068695, + "ewc_loss": 0.07733963429927826, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003998611937277019, + "grad_norm": 8.942571640014648, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8692959547042847, + "num_tokens": 699390301.0, + "step": 18328 + }, + { + "epoch": 2.3316371962854596, + "ewc_loss": 0.07692289352416992, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039569378714077175, + "grad_norm": 8.837450981140137, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8488579988479614, + "num_tokens": 699427135.0, + "step": 18329 + }, + { + "epoch": 2.3317644065640506, + "ewc_loss": 0.07737059146165848, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004001707420684397, + "grad_norm": 8.931995391845703, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.868815004825592, + "num_tokens": 699466461.0, + "step": 18330 + }, + { + "epoch": 2.3318916168426407, + "ewc_loss": 0.07684718817472458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003949367383029312, + "grad_norm": 8.862595558166504, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8573411703109741, + "num_tokens": 699502446.0, + "step": 18331 + }, + { + "epoch": 2.332018827121231, + "ewc_loss": 0.07730761170387268, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000399540935177356, + "grad_norm": 8.931777954101562, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8626272082328796, + "num_tokens": 699546357.0, + "step": 18332 + }, + { + "epoch": 2.3321460373998217, + "ewc_loss": 0.07685373723506927, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003950021928176284, + "grad_norm": 8.874083518981934, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8632487058639526, + "num_tokens": 699581879.0, + "step": 18333 + }, + { + "epoch": 2.3322732476784123, + "ewc_loss": 0.07729533314704895, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003994181752204895, + "grad_norm": 8.931838989257812, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.877545177936554, + "num_tokens": 699616287.0, + "step": 18334 + }, + { + "epoch": 2.332400457957003, + "ewc_loss": 0.07693710178136826, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039583584293723106, + "grad_norm": 8.819389343261719, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8609998822212219, + "num_tokens": 699655409.0, + "step": 18335 + }, + { + "epoch": 2.3325276682355933, + "ewc_loss": 0.07734209299087524, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039988572825677693, + "grad_norm": 8.931313514709473, + "learning_rate": 1e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8376275897026062, + "num_tokens": 699701420.0, + "step": 18336 + }, + { + "epoch": 2.332654878514184, + "ewc_loss": 0.0770040899515152, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003965057258028537, + "grad_norm": 8.873241424560547, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8663079738616943, + "num_tokens": 699736151.0, + "step": 18337 + }, + { + "epoch": 2.3327820887927744, + "ewc_loss": 0.07732068002223969, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003996716986875981, + "grad_norm": 9.045464515686035, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8555472493171692, + "num_tokens": 699770111.0, + "step": 18338 + }, + { + "epoch": 2.332909299071365, + "ewc_loss": 0.07676941156387329, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000394158938433975, + "grad_norm": 8.782960891723633, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8607795238494873, + "num_tokens": 699807442.0, + "step": 18339 + }, + { + "epoch": 2.3330365093499554, + "ewc_loss": 0.0775986760854721, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040245160926133394, + "grad_norm": 8.995755195617676, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8613723516464233, + "num_tokens": 699849364.0, + "step": 18340 + }, + { + "epoch": 2.333163719628546, + "ewc_loss": 0.07678583264350891, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039432314224541187, + "grad_norm": 8.821283340454102, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8766420483589172, + "num_tokens": 699886234.0, + "step": 18341 + }, + { + "epoch": 2.3332909299071365, + "ewc_loss": 0.0775371789932251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040183664532378316, + "grad_norm": 8.954379081726074, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8453152179718018, + "num_tokens": 699925660.0, + "step": 18342 + }, + { + "epoch": 2.333418140185727, + "ewc_loss": 0.07701878249645233, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003966526419389993, + "grad_norm": 8.888885498046875, + "learning_rate": 1e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.837677538394928, + "num_tokens": 699967894.0, + "step": 18343 + }, + { + "epoch": 2.3335453504643175, + "ewc_loss": 0.07728491723537445, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039931407081894577, + "grad_norm": 8.976271629333496, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8566386103630066, + "num_tokens": 699999211.0, + "step": 18344 + }, + { + "epoch": 2.333672560742908, + "ewc_loss": 0.07698006182909012, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039626547368243337, + "grad_norm": 8.86121940612793, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8709458112716675, + "num_tokens": 700037639.0, + "step": 18345 + }, + { + "epoch": 2.3337997710214986, + "ewc_loss": 0.0772518664598465, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039898348040878773, + "grad_norm": 8.971927642822266, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8562090992927551, + "num_tokens": 700075677.0, + "step": 18346 + }, + { + "epoch": 2.333926981300089, + "ewc_loss": 0.07693943381309509, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003958592133130878, + "grad_norm": 8.906661033630371, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8699773550033569, + "num_tokens": 700115697.0, + "step": 18347 + }, + { + "epoch": 2.3340541915786797, + "ewc_loss": 0.07710735499858856, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039753841701895, + "grad_norm": 8.901104927062988, + "learning_rate": 1e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8453952074050903, + "num_tokens": 700152553.0, + "step": 18348 + }, + { + "epoch": 2.33418140185727, + "ewc_loss": 0.07691945880651474, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003956594446208328, + "grad_norm": 8.910886764526367, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8696461915969849, + "num_tokens": 700190958.0, + "step": 18349 + }, + { + "epoch": 2.3343086121358607, + "ewc_loss": 0.07692183554172516, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039568316424265504, + "grad_norm": 8.88394832611084, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8595897555351257, + "num_tokens": 700224244.0, + "step": 18350 + }, + { + "epoch": 2.3344358224144512, + "ewc_loss": 0.0769386887550354, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039585173362866044, + "grad_norm": 8.844866752624512, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8895983099937439, + "num_tokens": 700259736.0, + "step": 18351 + }, + { + "epoch": 2.3345630326930418, + "ewc_loss": 0.07692418992519379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039570670924149454, + "grad_norm": 8.88493537902832, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8641411066055298, + "num_tokens": 700293624.0, + "step": 18352 + }, + { + "epoch": 2.3346902429716323, + "ewc_loss": 0.07676632702350616, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039412808837369084, + "grad_norm": 8.821252822875977, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8662408590316772, + "num_tokens": 700328052.0, + "step": 18353 + }, + { + "epoch": 2.3348174532502224, + "ewc_loss": 0.07716970145702362, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039816187927499413, + "grad_norm": 8.912034034729004, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8567740321159363, + "num_tokens": 700364652.0, + "step": 18354 + }, + { + "epoch": 2.3349446635288134, + "ewc_loss": 0.07679148018360138, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003943796327803284, + "grad_norm": 8.816001892089844, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.876816987991333, + "num_tokens": 700403698.0, + "step": 18355 + }, + { + "epoch": 2.3350718738074034, + "ewc_loss": 0.07730704545974731, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039953525993041694, + "grad_norm": 8.928243637084961, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8777533769607544, + "num_tokens": 700439391.0, + "step": 18356 + }, + { + "epoch": 2.335199084085994, + "ewc_loss": 0.07674328982830048, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039389776065945625, + "grad_norm": 8.872567176818848, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8652831315994263, + "num_tokens": 700475095.0, + "step": 18357 + }, + { + "epoch": 2.3353262943645845, + "ewc_loss": 0.07719605416059494, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003984253853559494, + "grad_norm": 8.972801208496094, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8679263591766357, + "num_tokens": 700514167.0, + "step": 18358 + }, + { + "epoch": 2.335453504643175, + "ewc_loss": 0.07670934498310089, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039355826447717845, + "grad_norm": 8.8056640625, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8713933825492859, + "num_tokens": 700548333.0, + "step": 18359 + }, + { + "epoch": 2.3355807149217656, + "ewc_loss": 0.07721678912639618, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039863275014795363, + "grad_norm": 8.875627517700195, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8679465651512146, + "num_tokens": 700594261.0, + "step": 18360 + }, + { + "epoch": 2.335707925200356, + "ewc_loss": 0.07692301273345947, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039569495129399, + "grad_norm": 8.85256576538086, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.870469331741333, + "num_tokens": 700632513.0, + "step": 18361 + }, + { + "epoch": 2.3358351354789466, + "ewc_loss": 0.07709257304668427, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003973906277678907, + "grad_norm": 8.863570213317871, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8700994253158569, + "num_tokens": 700672430.0, + "step": 18362 + }, + { + "epoch": 2.335962345757537, + "ewc_loss": 0.07707802206277847, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039724507951177657, + "grad_norm": 8.890816688537598, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8695023655891418, + "num_tokens": 700714321.0, + "step": 18363 + }, + { + "epoch": 2.3360895560361277, + "ewc_loss": 0.07692095637321472, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003956743748858571, + "grad_norm": 8.889216423034668, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8632701635360718, + "num_tokens": 700751714.0, + "step": 18364 + }, + { + "epoch": 2.336216766314718, + "ewc_loss": 0.076935775578022, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039582260069437325, + "grad_norm": 8.833650588989258, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8615580201148987, + "num_tokens": 700794087.0, + "step": 18365 + }, + { + "epoch": 2.3363439765933087, + "ewc_loss": 0.07717475295066833, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003982123453170061, + "grad_norm": 8.90622329711914, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8573978543281555, + "num_tokens": 700832031.0, + "step": 18366 + }, + { + "epoch": 2.3364711868718993, + "ewc_loss": 0.07704512029886246, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003969160607084632, + "grad_norm": 8.882242202758789, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8661787509918213, + "num_tokens": 700867175.0, + "step": 18367 + }, + { + "epoch": 2.33659839715049, + "ewc_loss": 0.07700290530920029, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039649390964768827, + "grad_norm": 8.854533195495605, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.87028968334198, + "num_tokens": 700906762.0, + "step": 18368 + }, + { + "epoch": 2.3367256074290803, + "ewc_loss": 0.07725778222084045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039904264849610627, + "grad_norm": 8.93093490600586, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8665375113487244, + "num_tokens": 700944316.0, + "step": 18369 + }, + { + "epoch": 2.336852817707671, + "ewc_loss": 0.07691431045532227, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003956079308409244, + "grad_norm": 8.882035255432129, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8600170612335205, + "num_tokens": 700979369.0, + "step": 18370 + }, + { + "epoch": 2.3369800279862614, + "ewc_loss": 0.0772198736667633, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003986635711044073, + "grad_norm": 8.931278228759766, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8617234230041504, + "num_tokens": 701016213.0, + "step": 18371 + }, + { + "epoch": 2.337107238264852, + "ewc_loss": 0.07701615989208221, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003966263902839273, + "grad_norm": 8.872410774230957, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8608002662658691, + "num_tokens": 701055270.0, + "step": 18372 + }, + { + "epoch": 2.3372344485434424, + "ewc_loss": 0.07714270055294037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039789179572835565, + "grad_norm": 8.882760047912598, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8615909814834595, + "num_tokens": 701091058.0, + "step": 18373 + }, + { + "epoch": 2.337361658822033, + "ewc_loss": 0.0771496593952179, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039796147029846907, + "grad_norm": 8.920351028442383, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8722949028015137, + "num_tokens": 701129446.0, + "step": 18374 + }, + { + "epoch": 2.3374888691006235, + "ewc_loss": 0.07695865631103516, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039605144411325455, + "grad_norm": 8.827021598815918, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8686084747314453, + "num_tokens": 701168726.0, + "step": 18375 + }, + { + "epoch": 2.337616079379214, + "ewc_loss": 0.07727555930614471, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039922044379636645, + "grad_norm": 8.897631645202637, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8591338396072388, + "num_tokens": 701204041.0, + "step": 18376 + }, + { + "epoch": 2.3377432896578045, + "ewc_loss": 0.07691030204296112, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003955679130740464, + "grad_norm": 8.892705917358398, + "learning_rate": 1e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.841334342956543, + "num_tokens": 701244660.0, + "step": 18377 + }, + { + "epoch": 2.337870499936395, + "ewc_loss": 0.07723591476678848, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003988239914178848, + "grad_norm": 8.884824752807617, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.85378497838974, + "num_tokens": 701284830.0, + "step": 18378 + }, + { + "epoch": 2.337997710214985, + "ewc_loss": 0.07711366564035416, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039760148501954973, + "grad_norm": 8.844841003417969, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8697155117988586, + "num_tokens": 701319720.0, + "step": 18379 + }, + { + "epoch": 2.338124920493576, + "ewc_loss": 0.07718401402235031, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003983049828093499, + "grad_norm": 8.921581268310547, + "learning_rate": 1e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8430489897727966, + "num_tokens": 701361770.0, + "step": 18380 + }, + { + "epoch": 2.338252130772166, + "ewc_loss": 0.07719708979129791, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039843577542342246, + "grad_norm": 8.923105239868164, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8620234727859497, + "num_tokens": 701399777.0, + "step": 18381 + }, + { + "epoch": 2.3383793410507567, + "ewc_loss": 0.0771307647228241, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039777252823114395, + "grad_norm": 8.872471809387207, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8626017570495605, + "num_tokens": 701432680.0, + "step": 18382 + }, + { + "epoch": 2.3385065513293473, + "ewc_loss": 0.07720252126455307, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039849005406722426, + "grad_norm": 8.874459266662598, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8750988245010376, + "num_tokens": 701474101.0, + "step": 18383 + }, + { + "epoch": 2.338633761607938, + "ewc_loss": 0.07711537927389145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039761862717568874, + "grad_norm": 8.886285781860352, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.875959575176239, + "num_tokens": 701517568.0, + "step": 18384 + }, + { + "epoch": 2.3387609718865283, + "ewc_loss": 0.07727782428264618, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039924305747263134, + "grad_norm": 8.927203178405762, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8585835695266724, + "num_tokens": 701558466.0, + "step": 18385 + }, + { + "epoch": 2.338888182165119, + "ewc_loss": 0.07701904326677322, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003966552612837404, + "grad_norm": 8.915630340576172, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8581454157829285, + "num_tokens": 701591104.0, + "step": 18386 + }, + { + "epoch": 2.3390153924437094, + "ewc_loss": 0.0771460011601448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003979248576797545, + "grad_norm": 8.95850944519043, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8577381372451782, + "num_tokens": 701633242.0, + "step": 18387 + }, + { + "epoch": 2.3391426027223, + "ewc_loss": 0.07697178423404694, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039618267328478396, + "grad_norm": 8.896658897399902, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8629244565963745, + "num_tokens": 701668864.0, + "step": 18388 + }, + { + "epoch": 2.3392698130008904, + "ewc_loss": 0.0771205872297287, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003976706648245454, + "grad_norm": 8.94993782043457, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8666281700134277, + "num_tokens": 701703648.0, + "step": 18389 + }, + { + "epoch": 2.339397023279481, + "ewc_loss": 0.07691651582717896, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039563002064824104, + "grad_norm": 8.904718399047852, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8600612878799438, + "num_tokens": 701744417.0, + "step": 18390 + }, + { + "epoch": 2.3395242335580715, + "ewc_loss": 0.07693381607532501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003958029847126454, + "grad_norm": 8.930182456970215, + "learning_rate": 1e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8325521349906921, + "num_tokens": 701783541.0, + "step": 18391 + }, + { + "epoch": 2.339651443836662, + "ewc_loss": 0.0770639181137085, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003971039841417223, + "grad_norm": 8.900178909301758, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8592060804367065, + "num_tokens": 701825054.0, + "step": 18392 + }, + { + "epoch": 2.3397786541152525, + "ewc_loss": 0.07698112726211548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003962760674767196, + "grad_norm": 8.95543384552002, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8720794916152954, + "num_tokens": 701866173.0, + "step": 18393 + }, + { + "epoch": 2.339905864393843, + "ewc_loss": 0.07687210291624069, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039518586709164083, + "grad_norm": 8.876788139343262, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8587642908096313, + "num_tokens": 701904812.0, + "step": 18394 + }, + { + "epoch": 2.3400330746724336, + "ewc_loss": 0.07719603180885315, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039842515252530575, + "grad_norm": 8.966228485107422, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8529342412948608, + "num_tokens": 701942590.0, + "step": 18395 + }, + { + "epoch": 2.340160284951024, + "ewc_loss": 0.07682523131370544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003947171790059656, + "grad_norm": 8.9052734375, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8659377098083496, + "num_tokens": 701980773.0, + "step": 18396 + }, + { + "epoch": 2.3402874952296147, + "ewc_loss": 0.0770988017320633, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003974528517574072, + "grad_norm": 8.973011016845703, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8741610050201416, + "num_tokens": 702014508.0, + "step": 18397 + }, + { + "epoch": 2.340414705508205, + "ewc_loss": 0.07684963941574097, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003949612728320062, + "grad_norm": 8.88841724395752, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8752151131629944, + "num_tokens": 702055754.0, + "step": 18398 + }, + { + "epoch": 2.3405419157867957, + "ewc_loss": 0.07701779156923294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039664277574047446, + "grad_norm": 8.879049301147461, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8629688024520874, + "num_tokens": 702100022.0, + "step": 18399 + }, + { + "epoch": 2.3406691260653862, + "ewc_loss": 0.07700780034065247, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003965428040828556, + "grad_norm": 8.934578895568848, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8523455858230591, + "num_tokens": 702133217.0, + "step": 18400 + }, + { + "epoch": 2.3407963363439768, + "ewc_loss": 0.07689353078603745, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039540015859529376, + "grad_norm": 8.867890357971191, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.876570463180542, + "num_tokens": 702176998.0, + "step": 18401 + }, + { + "epoch": 2.3409235466225673, + "ewc_loss": 0.07715436071157455, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039800844388082623, + "grad_norm": 8.925533294677734, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8703139424324036, + "num_tokens": 702216219.0, + "step": 18402 + }, + { + "epoch": 2.341050756901158, + "ewc_loss": 0.07688891887664795, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003953540581278503, + "grad_norm": 8.919757843017578, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.857674777507782, + "num_tokens": 702246502.0, + "step": 18403 + }, + { + "epoch": 2.341177967179748, + "ewc_loss": 0.07711230963468552, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003975879226345569, + "grad_norm": 8.950425148010254, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8710954189300537, + "num_tokens": 702277990.0, + "step": 18404 + }, + { + "epoch": 2.341305177458339, + "ewc_loss": 0.07693032920360565, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003957681474275887, + "grad_norm": 8.939835548400879, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.86457359790802, + "num_tokens": 702312742.0, + "step": 18405 + }, + { + "epoch": 2.341432387736929, + "ewc_loss": 0.07692503184080124, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000395715149352327, + "grad_norm": 8.8974609375, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8789687156677246, + "num_tokens": 702348575.0, + "step": 18406 + }, + { + "epoch": 2.3415595980155195, + "ewc_loss": 0.07695435732603073, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003960084286518395, + "grad_norm": 8.891831398010254, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8527366518974304, + "num_tokens": 702385956.0, + "step": 18407 + }, + { + "epoch": 2.34168680829411, + "ewc_loss": 0.07698175311088562, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039628238300792873, + "grad_norm": 8.921406745910645, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8694522380828857, + "num_tokens": 702423973.0, + "step": 18408 + }, + { + "epoch": 2.3418140185727006, + "ewc_loss": 0.07684934139251709, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003949582460336387, + "grad_norm": 8.915363311767578, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8597153425216675, + "num_tokens": 702455508.0, + "step": 18409 + }, + { + "epoch": 2.341941228851291, + "ewc_loss": 0.07698145508766174, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039627935620956123, + "grad_norm": 8.884571075439453, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8761308193206787, + "num_tokens": 702493365.0, + "step": 18410 + }, + { + "epoch": 2.3420684391298816, + "ewc_loss": 0.07690396904945374, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003955045249313116, + "grad_norm": 8.839218139648438, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8662583231925964, + "num_tokens": 702528036.0, + "step": 18411 + }, + { + "epoch": 2.342195649408472, + "ewc_loss": 0.07721138000488281, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003985786170233041, + "grad_norm": 8.973404884338379, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8589427471160889, + "num_tokens": 702571238.0, + "step": 18412 + }, + { + "epoch": 2.3423228596870627, + "ewc_loss": 0.07689738273620605, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003954386920668185, + "grad_norm": 8.810559272766113, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8590647578239441, + "num_tokens": 702611247.0, + "step": 18413 + }, + { + "epoch": 2.342450069965653, + "ewc_loss": 0.07747727632522583, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004012376011814922, + "grad_norm": 8.958641052246094, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8647401332855225, + "num_tokens": 702649661.0, + "step": 18414 + }, + { + "epoch": 2.3425772802442437, + "ewc_loss": 0.07679005712270737, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039436540100723505, + "grad_norm": 8.847188949584961, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8559402823448181, + "num_tokens": 702693573.0, + "step": 18415 + }, + { + "epoch": 2.3427044905228342, + "ewc_loss": 0.07752800732851028, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040174491005018353, + "grad_norm": 8.90245532989502, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.874309778213501, + "num_tokens": 702735351.0, + "step": 18416 + }, + { + "epoch": 2.3428317008014248, + "ewc_loss": 0.07709914445877075, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003974562860094011, + "grad_norm": 8.913637161254883, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8699944615364075, + "num_tokens": 702773406.0, + "step": 18417 + }, + { + "epoch": 2.3429589110800153, + "ewc_loss": 0.07720549404621124, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039851979818195105, + "grad_norm": 8.905631065368652, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8710138201713562, + "num_tokens": 702809939.0, + "step": 18418 + }, + { + "epoch": 2.343086121358606, + "ewc_loss": 0.07733310759067535, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039979591383598745, + "grad_norm": 8.934924125671387, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8618404269218445, + "num_tokens": 702851120.0, + "step": 18419 + }, + { + "epoch": 2.3432133316371964, + "ewc_loss": 0.0771779865026474, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039824473788030446, + "grad_norm": 8.927270889282227, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8719198703765869, + "num_tokens": 702889492.0, + "step": 18420 + }, + { + "epoch": 2.343340541915787, + "ewc_loss": 0.07728631794452667, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039932806976139545, + "grad_norm": 8.86338996887207, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8580232858657837, + "num_tokens": 702930486.0, + "step": 18421 + }, + { + "epoch": 2.3434677521943774, + "ewc_loss": 0.07735224068164825, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039998721331357956, + "grad_norm": 8.936493873596191, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8560714721679688, + "num_tokens": 702969866.0, + "step": 18422 + }, + { + "epoch": 2.343594962472968, + "ewc_loss": 0.07726901769638062, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039915498928166926, + "grad_norm": 8.9264554977417, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8548277616500854, + "num_tokens": 703013703.0, + "step": 18423 + }, + { + "epoch": 2.3437221727515585, + "ewc_loss": 0.07732885330915451, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003997533640358597, + "grad_norm": 8.910721778869629, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8548816442489624, + "num_tokens": 703054849.0, + "step": 18424 + }, + { + "epoch": 2.343849383030149, + "ewc_loss": 0.07740887999534607, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040055366116575897, + "grad_norm": 8.984596252441406, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8591116666793823, + "num_tokens": 703088250.0, + "step": 18425 + }, + { + "epoch": 2.3439765933087395, + "ewc_loss": 0.07720862329006195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003985510556958616, + "grad_norm": 8.940702438354492, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8591599464416504, + "num_tokens": 703124072.0, + "step": 18426 + }, + { + "epoch": 2.3441038035873296, + "ewc_loss": 0.07736349105834961, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040009975782595575, + "grad_norm": 8.999185562133789, + "learning_rate": 1e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8454110622406006, + "num_tokens": 703158869.0, + "step": 18427 + }, + { + "epoch": 2.3442310138659206, + "ewc_loss": 0.07708919793367386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039735680911689997, + "grad_norm": 8.891596794128418, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8656165599822998, + "num_tokens": 703193342.0, + "step": 18428 + }, + { + "epoch": 2.3443582241445107, + "ewc_loss": 0.07745276391506195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040099251782521605, + "grad_norm": 8.974753379821777, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8756784200668335, + "num_tokens": 703229899.0, + "step": 18429 + }, + { + "epoch": 2.344485434423101, + "ewc_loss": 0.07708276808261871, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039729251875542104, + "grad_norm": 8.869677543640137, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8616611361503601, + "num_tokens": 703266735.0, + "step": 18430 + }, + { + "epoch": 2.3446126447016917, + "ewc_loss": 0.07749141752719879, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040137904579751194, + "grad_norm": 9.005086898803711, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8602614998817444, + "num_tokens": 703307515.0, + "step": 18431 + }, + { + "epoch": 2.3447398549802823, + "ewc_loss": 0.0770258754491806, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003967235970776528, + "grad_norm": 8.89993667602539, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.873554527759552, + "num_tokens": 703341237.0, + "step": 18432 + }, + { + "epoch": 2.344867065258873, + "ewc_loss": 0.07735013961791992, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039996628765948117, + "grad_norm": 8.92659854888916, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8740969896316528, + "num_tokens": 703378900.0, + "step": 18433 + }, + { + "epoch": 2.3449942755374633, + "ewc_loss": 0.0771884024143219, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003983489004895091, + "grad_norm": 8.885626792907715, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8720130920410156, + "num_tokens": 703415260.0, + "step": 18434 + }, + { + "epoch": 2.345121485816054, + "ewc_loss": 0.07743485271930695, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040081338374875486, + "grad_norm": 9.013677597045898, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.870311975479126, + "num_tokens": 703452399.0, + "step": 18435 + }, + { + "epoch": 2.3452486960946444, + "ewc_loss": 0.07701674103736877, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003966322692576796, + "grad_norm": 8.794881820678711, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8648702502250671, + "num_tokens": 703492700.0, + "step": 18436 + }, + { + "epoch": 2.345375906373235, + "ewc_loss": 0.07775335013866425, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004039983032271266, + "grad_norm": 8.995387077331543, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8753488063812256, + "num_tokens": 703530805.0, + "step": 18437 + }, + { + "epoch": 2.3455031166518254, + "ewc_loss": 0.0770099088549614, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039656393346376717, + "grad_norm": 8.82922077178955, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8729971647262573, + "num_tokens": 703571894.0, + "step": 18438 + }, + { + "epoch": 2.345630326930416, + "ewc_loss": 0.07761929929256439, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004026578099001199, + "grad_norm": 8.968461036682129, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8696416616439819, + "num_tokens": 703612583.0, + "step": 18439 + }, + { + "epoch": 2.3457575372090065, + "ewc_loss": 0.07718618214130402, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003983266360592097, + "grad_norm": 8.936306953430176, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8567484617233276, + "num_tokens": 703649100.0, + "step": 18440 + }, + { + "epoch": 2.345884747487597, + "ewc_loss": 0.07735231518745422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003999879991170019, + "grad_norm": 9.005173683166504, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8789092302322388, + "num_tokens": 703680425.0, + "step": 18441 + }, + { + "epoch": 2.3460119577661875, + "ewc_loss": 0.07712125778198242, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039767747512087226, + "grad_norm": 8.866215705871582, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.870466947555542, + "num_tokens": 703721819.0, + "step": 18442 + }, + { + "epoch": 2.346139168044778, + "ewc_loss": 0.07742341607809067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004006990056950599, + "grad_norm": 8.995341300964355, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8609768748283386, + "num_tokens": 703755432.0, + "step": 18443 + }, + { + "epoch": 2.3462663783233686, + "ewc_loss": 0.07704903930425644, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039695523446425796, + "grad_norm": 8.878273010253906, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8640481233596802, + "num_tokens": 703795903.0, + "step": 18444 + }, + { + "epoch": 2.346393588601959, + "ewc_loss": 0.07756844162940979, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040214930777437985, + "grad_norm": 8.951064109802246, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8760414123535156, + "num_tokens": 703836074.0, + "step": 18445 + }, + { + "epoch": 2.3465207988805497, + "ewc_loss": 0.07721194624900818, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003985842631664127, + "grad_norm": 8.889358520507812, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8694953918457031, + "num_tokens": 703870609.0, + "step": 18446 + }, + { + "epoch": 2.34664800915914, + "ewc_loss": 0.07724940776824951, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003989588876720518, + "grad_norm": 8.867253303527832, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8687974810600281, + "num_tokens": 703909034.0, + "step": 18447 + }, + { + "epoch": 2.3467752194377307, + "ewc_loss": 0.07748837769031525, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004013486613985151, + "grad_norm": 8.964128494262695, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8571659326553345, + "num_tokens": 703946976.0, + "step": 18448 + }, + { + "epoch": 2.3469024297163212, + "ewc_loss": 0.07719241082668304, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039838894736021757, + "grad_norm": 8.873936653137207, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8666268587112427, + "num_tokens": 703986379.0, + "step": 18449 + }, + { + "epoch": 2.3470296399949118, + "ewc_loss": 0.07756626605987549, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004021275381091982, + "grad_norm": 8.9412260055542, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8819295763969421, + "num_tokens": 704019461.0, + "step": 18450 + }, + { + "epoch": 2.3471568502735023, + "ewc_loss": 0.07729056477546692, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003993704740423709, + "grad_norm": 9.0103178024292, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.853217363357544, + "num_tokens": 704053411.0, + "step": 18451 + }, + { + "epoch": 2.3472840605520924, + "ewc_loss": 0.07708141207695007, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039727892726659775, + "grad_norm": 8.943747520446777, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8687243461608887, + "num_tokens": 704092633.0, + "step": 18452 + }, + { + "epoch": 2.3474112708306833, + "ewc_loss": 0.07732445001602173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003997093590442091, + "grad_norm": 8.924509048461914, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8676217794418335, + "num_tokens": 704126955.0, + "step": 18453 + }, + { + "epoch": 2.3475384811092734, + "ewc_loss": 0.07721522450447083, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003986170922871679, + "grad_norm": 8.907390594482422, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8802320957183838, + "num_tokens": 704162222.0, + "step": 18454 + }, + { + "epoch": 2.347665691387864, + "ewc_loss": 0.07719212770462036, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003983861533924937, + "grad_norm": 8.932014465332031, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8649796843528748, + "num_tokens": 704198949.0, + "step": 18455 + }, + { + "epoch": 2.3477929016664545, + "ewc_loss": 0.07723459601402283, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039881077827885747, + "grad_norm": 8.954521179199219, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8594653010368347, + "num_tokens": 704235327.0, + "step": 18456 + }, + { + "epoch": 2.347920111945045, + "ewc_loss": 0.07709874212741852, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003974522987846285, + "grad_norm": 8.912668228149414, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8712216019630432, + "num_tokens": 704272783.0, + "step": 18457 + }, + { + "epoch": 2.3480473222236355, + "ewc_loss": 0.07716864347457886, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003981512563768774, + "grad_norm": 8.962328910827637, + "learning_rate": 1e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8452825546264648, + "num_tokens": 704314396.0, + "step": 18458 + }, + { + "epoch": 2.348174532502226, + "ewc_loss": 0.07706350088119507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039709985139779747, + "grad_norm": 8.900147438049316, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8796913027763367, + "num_tokens": 704346903.0, + "step": 18459 + }, + { + "epoch": 2.3483017427808166, + "ewc_loss": 0.07739324867725372, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040039734449237585, + "grad_norm": 8.997199058532715, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.872140109539032, + "num_tokens": 704380456.0, + "step": 18460 + }, + { + "epoch": 2.348428953059407, + "ewc_loss": 0.07686598598957062, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039512471994385123, + "grad_norm": 8.861865043640137, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8763379454612732, + "num_tokens": 704416403.0, + "step": 18461 + }, + { + "epoch": 2.3485561633379977, + "ewc_loss": 0.07758438587188721, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004023086803499609, + "grad_norm": 8.985445976257324, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8664823770523071, + "num_tokens": 704460431.0, + "step": 18462 + }, + { + "epoch": 2.348683373616588, + "ewc_loss": 0.07692167162895203, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000395681505324319, + "grad_norm": 8.877333641052246, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8719636797904968, + "num_tokens": 704497872.0, + "step": 18463 + }, + { + "epoch": 2.3488105838951787, + "ewc_loss": 0.07742345333099365, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004006993258371949, + "grad_norm": 9.006651878356934, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8666064739227295, + "num_tokens": 704538180.0, + "step": 18464 + }, + { + "epoch": 2.3489377941737692, + "ewc_loss": 0.0771128386259079, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039759327773936093, + "grad_norm": 8.921921730041504, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8778477907180786, + "num_tokens": 704578342.0, + "step": 18465 + }, + { + "epoch": 2.3490650044523598, + "ewc_loss": 0.07738575339317322, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040032240212894976, + "grad_norm": 8.984882354736328, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8755935430526733, + "num_tokens": 704611792.0, + "step": 18466 + }, + { + "epoch": 2.3491922147309503, + "ewc_loss": 0.07707157731056213, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003971806145273149, + "grad_norm": 9.011237144470215, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8677993416786194, + "num_tokens": 704649898.0, + "step": 18467 + }, + { + "epoch": 2.349319425009541, + "ewc_loss": 0.0770580992102623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039704583468846977, + "grad_norm": 8.912819862365723, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8735085129737854, + "num_tokens": 704690632.0, + "step": 18468 + }, + { + "epoch": 2.3494466352881314, + "ewc_loss": 0.07740519940853119, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004005168448202312, + "grad_norm": 9.143438339233398, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8543928861618042, + "num_tokens": 704722955.0, + "step": 18469 + }, + { + "epoch": 2.349573845566722, + "ewc_loss": 0.07658054679632187, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039227030356414616, + "grad_norm": 8.817770004272461, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8656139373779297, + "num_tokens": 704762678.0, + "step": 18470 + }, + { + "epoch": 2.3497010558453124, + "ewc_loss": 0.07779903709888458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004044551751576364, + "grad_norm": 9.151625633239746, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8918087482452393, + "num_tokens": 704805440.0, + "step": 18471 + }, + { + "epoch": 2.349828266123903, + "ewc_loss": 0.07631876319646835, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003896524722222239, + "grad_norm": 8.742449760437012, + "learning_rate": 1e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8468276262283325, + "num_tokens": 704848153.0, + "step": 18472 + }, + { + "epoch": 2.3499554764024935, + "ewc_loss": 0.07808376848697662, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040730254841037095, + "grad_norm": 9.307229042053223, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8627219796180725, + "num_tokens": 704886739.0, + "step": 18473 + }, + { + "epoch": 2.350082686681084, + "ewc_loss": 0.07617835700511932, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003882484161294997, + "grad_norm": 8.675829887390137, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8647028207778931, + "num_tokens": 704926145.0, + "step": 18474 + }, + { + "epoch": 2.3502098969596745, + "ewc_loss": 0.07879451662302017, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004144099948462099, + "grad_norm": 9.377555847167969, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8477174043655396, + "num_tokens": 704964135.0, + "step": 18475 + }, + { + "epoch": 2.350337107238265, + "ewc_loss": 0.07629906386137009, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038945546839386225, + "grad_norm": 8.73038101196289, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8525353670120239, + "num_tokens": 705002171.0, + "step": 18476 + }, + { + "epoch": 2.350464317516855, + "ewc_loss": 0.07894477993249893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004159126547165215, + "grad_norm": 9.222146987915039, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.883735716342926, + "num_tokens": 705040660.0, + "step": 18477 + }, + { + "epoch": 2.350591527795446, + "ewc_loss": 0.07655394077301025, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039200420724228024, + "grad_norm": 8.885459899902344, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8711304664611816, + "num_tokens": 705073655.0, + "step": 18478 + }, + { + "epoch": 2.350718738074036, + "ewc_loss": 0.07832856476306915, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040975044248625636, + "grad_norm": 9.159804344177246, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8613063097000122, + "num_tokens": 705115497.0, + "step": 18479 + }, + { + "epoch": 2.3508459483526267, + "ewc_loss": 0.0769544392824173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003960092144552618, + "grad_norm": 8.899161338806152, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8829374313354492, + "num_tokens": 705152642.0, + "step": 18480 + }, + { + "epoch": 2.3509731586312173, + "ewc_loss": 0.0779833048582077, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004062978841830045, + "grad_norm": 9.122443199157715, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8485487699508667, + "num_tokens": 705187282.0, + "step": 18481 + }, + { + "epoch": 2.351100368909808, + "ewc_loss": 0.07705055177211761, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000396970339352265, + "grad_norm": 8.890982627868652, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8695532083511353, + "num_tokens": 705225635.0, + "step": 18482 + }, + { + "epoch": 2.3512275791883983, + "ewc_loss": 0.07783547788858414, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004048196133226156, + "grad_norm": 9.115180969238281, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8736778497695923, + "num_tokens": 705260376.0, + "step": 18483 + }, + { + "epoch": 2.351354789466989, + "ewc_loss": 0.07704664766788483, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000396931340219453, + "grad_norm": 8.9213285446167, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8630651235580444, + "num_tokens": 705298799.0, + "step": 18484 + }, + { + "epoch": 2.3514819997455794, + "ewc_loss": 0.0776199996471405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004026647948194295, + "grad_norm": 9.04132080078125, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8470112085342407, + "num_tokens": 705332570.0, + "step": 18485 + }, + { + "epoch": 2.35160921002417, + "ewc_loss": 0.07710668444633484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003975316940341145, + "grad_norm": 8.935713768005371, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8667712211608887, + "num_tokens": 705371214.0, + "step": 18486 + }, + { + "epoch": 2.3517364203027604, + "ewc_loss": 0.07750408351421356, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004015056474599987, + "grad_norm": 9.003153800964355, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8613128066062927, + "num_tokens": 705405051.0, + "step": 18487 + }, + { + "epoch": 2.351863630581351, + "ewc_loss": 0.07737865298986435, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004002513596788049, + "grad_norm": 8.964771270751953, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8646537065505981, + "num_tokens": 705442460.0, + "step": 18488 + }, + { + "epoch": 2.3519908408599415, + "ewc_loss": 0.07750582695007324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004015230806544423, + "grad_norm": 9.013132095336914, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8643685579299927, + "num_tokens": 705482929.0, + "step": 18489 + }, + { + "epoch": 2.352118051138532, + "ewc_loss": 0.07735560834407806, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000400020886445418, + "grad_norm": 8.885226249694824, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8592550754547119, + "num_tokens": 705525313.0, + "step": 18490 + }, + { + "epoch": 2.3522452614171225, + "ewc_loss": 0.07777783274650574, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004042431537527591, + "grad_norm": 9.069928169250488, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8610327243804932, + "num_tokens": 705563730.0, + "step": 18491 + }, + { + "epoch": 2.352372471695713, + "ewc_loss": 0.0769706517457962, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039617132279090583, + "grad_norm": 8.8473539352417, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8686888813972473, + "num_tokens": 705602459.0, + "step": 18492 + }, + { + "epoch": 2.3524996819743036, + "ewc_loss": 0.0779348611831665, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004058135091327131, + "grad_norm": 9.029472351074219, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8619251251220703, + "num_tokens": 705636223.0, + "step": 18493 + }, + { + "epoch": 2.352626892252894, + "ewc_loss": 0.07717891037464142, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039825396379455924, + "grad_norm": 8.940262794494629, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8623412847518921, + "num_tokens": 705676178.0, + "step": 18494 + }, + { + "epoch": 2.3527541025314846, + "ewc_loss": 0.07762566208839417, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040272148908115923, + "grad_norm": 9.014264106750488, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8719623684883118, + "num_tokens": 705716687.0, + "step": 18495 + }, + { + "epoch": 2.352881312810075, + "ewc_loss": 0.07728153467178345, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039928016485646367, + "grad_norm": 8.904558181762695, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8732562065124512, + "num_tokens": 705759163.0, + "step": 18496 + }, + { + "epoch": 2.3530085230886657, + "ewc_loss": 0.07761985063552856, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004026633978355676, + "grad_norm": 8.995237350463867, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8670976758003235, + "num_tokens": 705795982.0, + "step": 18497 + }, + { + "epoch": 2.3531357333672562, + "ewc_loss": 0.07735516130924225, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004000164335593581, + "grad_norm": 8.926403045654297, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8682093620300293, + "num_tokens": 705835601.0, + "step": 18498 + }, + { + "epoch": 2.3532629436458468, + "ewc_loss": 0.07756666839122772, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004021315253339708, + "grad_norm": 9.023967742919922, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8625388741493225, + "num_tokens": 705876858.0, + "step": 18499 + }, + { + "epoch": 2.353390153924437, + "ewc_loss": 0.07722868025302887, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039875166839919984, + "grad_norm": 8.898319244384766, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8560613393783569, + "num_tokens": 705921944.0, + "step": 18500 + }, + { + "epoch": 2.353517364203028, + "ewc_loss": 0.07767582684755325, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040322309359908104, + "grad_norm": 9.060436248779297, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8658703565597534, + "num_tokens": 705967999.0, + "step": 18501 + }, + { + "epoch": 2.353644574481618, + "ewc_loss": 0.07717545330524445, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003982193593401462, + "grad_norm": 8.949918746948242, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8580029606819153, + "num_tokens": 706003370.0, + "step": 18502 + }, + { + "epoch": 2.353771784760209, + "ewc_loss": 0.07764121890068054, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004028770490549505, + "grad_norm": 9.085150718688965, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8531458377838135, + "num_tokens": 706036551.0, + "step": 18503 + }, + { + "epoch": 2.353898995038799, + "ewc_loss": 0.07706677913665771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039713268051855266, + "grad_norm": 8.856797218322754, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.882716715335846, + "num_tokens": 706075965.0, + "step": 18504 + }, + { + "epoch": 2.3540262053173895, + "ewc_loss": 0.07778489589691162, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040431381785310805, + "grad_norm": 9.084718704223633, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8533532023429871, + "num_tokens": 706117943.0, + "step": 18505 + }, + { + "epoch": 2.35415341559598, + "ewc_loss": 0.07709930092096329, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039745785761624575, + "grad_norm": 8.89026927947998, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.868847668170929, + "num_tokens": 706157183.0, + "step": 18506 + }, + { + "epoch": 2.3542806258745705, + "ewc_loss": 0.07779690623283386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040443390025757253, + "grad_norm": 9.02549934387207, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8727023601531982, + "num_tokens": 706196878.0, + "step": 18507 + }, + { + "epoch": 2.354407836153161, + "ewc_loss": 0.07719126343727112, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003983775095548481, + "grad_norm": 8.943488121032715, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8515576124191284, + "num_tokens": 706231102.0, + "step": 18508 + }, + { + "epoch": 2.3545350464317516, + "ewc_loss": 0.07752592116594315, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040172404260374606, + "grad_norm": 9.017033576965332, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8728548288345337, + "num_tokens": 706273372.0, + "step": 18509 + }, + { + "epoch": 2.354662256710342, + "ewc_loss": 0.0773032158613205, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039949698839336634, + "grad_norm": 8.956461906433105, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8603695631027222, + "num_tokens": 706308106.0, + "step": 18510 + }, + { + "epoch": 2.3547894669889327, + "ewc_loss": 0.07756132632493973, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004020780907012522, + "grad_norm": 9.00912094116211, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8717710971832275, + "num_tokens": 706344267.0, + "step": 18511 + }, + { + "epoch": 2.354916677267523, + "ewc_loss": 0.07746237516403198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040108864777721465, + "grad_norm": 8.978958129882812, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8837604522705078, + "num_tokens": 706385019.0, + "step": 18512 + }, + { + "epoch": 2.3550438875461137, + "ewc_loss": 0.07748011499643326, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000401266006520018, + "grad_norm": 8.991129875183105, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8633253574371338, + "num_tokens": 706424913.0, + "step": 18513 + }, + { + "epoch": 2.3551710978247042, + "ewc_loss": 0.07732297480106354, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039969454519450665, + "grad_norm": 8.999773979187012, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8649446964263916, + "num_tokens": 706466064.0, + "step": 18514 + }, + { + "epoch": 2.3552983081032948, + "ewc_loss": 0.07746211439371109, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004010859702248126, + "grad_norm": 9.020854949951172, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8706098198890686, + "num_tokens": 706504572.0, + "step": 18515 + }, + { + "epoch": 2.3554255183818853, + "ewc_loss": 0.07737600803375244, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040022493340075016, + "grad_norm": 9.002269744873047, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8577991724014282, + "num_tokens": 706543901.0, + "step": 18516 + }, + { + "epoch": 2.355552728660476, + "ewc_loss": 0.07742837071418762, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040074859862215817, + "grad_norm": 9.0136137008667, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8584943413734436, + "num_tokens": 706579095.0, + "step": 18517 + }, + { + "epoch": 2.3556799389390664, + "ewc_loss": 0.07740232348442078, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004004881193395704, + "grad_norm": 8.992817878723145, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8564773201942444, + "num_tokens": 706614078.0, + "step": 18518 + }, + { + "epoch": 2.355807149217657, + "ewc_loss": 0.07746250182390213, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040108987013809383, + "grad_norm": 8.994730949401855, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8719630837440491, + "num_tokens": 706652596.0, + "step": 18519 + }, + { + "epoch": 2.3559343594962474, + "ewc_loss": 0.07751290500164032, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040159394848160446, + "grad_norm": 9.017616271972656, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8880139589309692, + "num_tokens": 706691297.0, + "step": 18520 + }, + { + "epoch": 2.356061569774838, + "ewc_loss": 0.077381432056427, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040027921204455197, + "grad_norm": 8.961042404174805, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8536401391029358, + "num_tokens": 706727344.0, + "step": 18521 + }, + { + "epoch": 2.3561887800534285, + "ewc_loss": 0.07753636687994003, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004018284962512553, + "grad_norm": 9.006638526916504, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8693111538887024, + "num_tokens": 706770411.0, + "step": 18522 + }, + { + "epoch": 2.356315990332019, + "ewc_loss": 0.0773797482252121, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004002623609267175, + "grad_norm": 8.961749076843262, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8693037033081055, + "num_tokens": 706816215.0, + "step": 18523 + }, + { + "epoch": 2.3564432006106095, + "ewc_loss": 0.07755153626203537, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004019802145194262, + "grad_norm": 8.983344078063965, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8594530820846558, + "num_tokens": 706857400.0, + "step": 18524 + }, + { + "epoch": 2.3565704108891996, + "ewc_loss": 0.07746627926826477, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040112764691002667, + "grad_norm": 9.033260345458984, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8570832014083862, + "num_tokens": 706892715.0, + "step": 18525 + }, + { + "epoch": 2.3566976211677906, + "ewc_loss": 0.07742998003959656, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040076463483273983, + "grad_norm": 8.984796524047852, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8648209571838379, + "num_tokens": 706934373.0, + "step": 18526 + }, + { + "epoch": 2.3568248314463807, + "ewc_loss": 0.07771037518978119, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040356864337809384, + "grad_norm": 9.0625638961792, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8657997846603394, + "num_tokens": 706976885.0, + "step": 18527 + }, + { + "epoch": 2.356952041724971, + "ewc_loss": 0.07717718183994293, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003982366470154375, + "grad_norm": 8.937042236328125, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8715063333511353, + "num_tokens": 707016957.0, + "step": 18528 + }, + { + "epoch": 2.3570792520035617, + "ewc_loss": 0.07773661613464355, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040383104351349175, + "grad_norm": 9.053321838378906, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8750076293945312, + "num_tokens": 707053687.0, + "step": 18529 + }, + { + "epoch": 2.3572064622821522, + "ewc_loss": 0.07717578113079071, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039822261896915734, + "grad_norm": 8.950246810913086, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8594475388526917, + "num_tokens": 707087327.0, + "step": 18530 + }, + { + "epoch": 2.3573336725607428, + "ewc_loss": 0.07763510197401047, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004028158728033304, + "grad_norm": 8.98968505859375, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8771457672119141, + "num_tokens": 707124479.0, + "step": 18531 + }, + { + "epoch": 2.3574608828393333, + "ewc_loss": 0.0774533748626709, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004009985423181206, + "grad_norm": 9.043835639953613, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8680729866027832, + "num_tokens": 707162452.0, + "step": 18532 + }, + { + "epoch": 2.357588093117924, + "ewc_loss": 0.07739946246147156, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004004594520665705, + "grad_norm": 9.065364837646484, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8732410669326782, + "num_tokens": 707196765.0, + "step": 18533 + }, + { + "epoch": 2.3577153033965144, + "ewc_loss": 0.0774582177400589, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004010470292996615, + "grad_norm": 8.972233772277832, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8714159727096558, + "num_tokens": 707230151.0, + "step": 18534 + }, + { + "epoch": 2.357842513675105, + "ewc_loss": 0.07746666669845581, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040113148861564696, + "grad_norm": 9.055131912231445, + "learning_rate": 1e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8481268882751465, + "num_tokens": 707264373.0, + "step": 18535 + }, + { + "epoch": 2.3579697239536954, + "ewc_loss": 0.07723554968833923, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003988202952314168, + "grad_norm": 8.968223571777344, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8595479130744934, + "num_tokens": 707304844.0, + "step": 18536 + }, + { + "epoch": 2.358096934232286, + "ewc_loss": 0.0776442140340805, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004029070260003209, + "grad_norm": 9.049674034118652, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8670220375061035, + "num_tokens": 707347081.0, + "step": 18537 + }, + { + "epoch": 2.3582241445108765, + "ewc_loss": 0.07714840769767761, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039794895565137267, + "grad_norm": 8.969584465026855, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8587632775306702, + "num_tokens": 707388243.0, + "step": 18538 + }, + { + "epoch": 2.358351354789467, + "ewc_loss": 0.07747820019721985, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040124679799191654, + "grad_norm": 9.066025733947754, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8525331616401672, + "num_tokens": 707427109.0, + "step": 18539 + }, + { + "epoch": 2.3584785650680575, + "ewc_loss": 0.07714004814624786, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039786528213880956, + "grad_norm": 8.945348739624023, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8665732145309448, + "num_tokens": 707462502.0, + "step": 18540 + }, + { + "epoch": 2.358605775346648, + "ewc_loss": 0.07747924327850342, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004012573044747114, + "grad_norm": 9.112071990966797, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8596274852752686, + "num_tokens": 707502861.0, + "step": 18541 + }, + { + "epoch": 2.3587329856252386, + "ewc_loss": 0.0770462229847908, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039692706195637584, + "grad_norm": 8.920645713806152, + "learning_rate": 1e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8450360298156738, + "num_tokens": 707535868.0, + "step": 18542 + }, + { + "epoch": 2.358860195903829, + "ewc_loss": 0.07760736346244812, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040253851329907775, + "grad_norm": 9.081692695617676, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8680041432380676, + "num_tokens": 707566128.0, + "step": 18543 + }, + { + "epoch": 2.3589874061824196, + "ewc_loss": 0.07696431875228882, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003961080510634929, + "grad_norm": 8.979918479919434, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8718015551567078, + "num_tokens": 707605113.0, + "step": 18544 + }, + { + "epoch": 2.35911461646101, + "ewc_loss": 0.07733599841594696, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000399824813939631, + "grad_norm": 8.940566062927246, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.862638533115387, + "num_tokens": 707647968.0, + "step": 18545 + }, + { + "epoch": 2.3592418267396007, + "ewc_loss": 0.07753530144691467, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040181787335313857, + "grad_norm": 9.111129760742188, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8732280135154724, + "num_tokens": 707682183.0, + "step": 18546 + }, + { + "epoch": 2.3593690370181912, + "ewc_loss": 0.07689203321933746, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003953851410187781, + "grad_norm": 9.037932395935059, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8734723925590515, + "num_tokens": 707720938.0, + "step": 18547 + }, + { + "epoch": 2.3594962472967818, + "ewc_loss": 0.07723764330148697, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003988412790931761, + "grad_norm": 8.95297908782959, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8535062074661255, + "num_tokens": 707762624.0, + "step": 18548 + }, + { + "epoch": 2.3596234575753723, + "ewc_loss": 0.07731425762176514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003996074665337801, + "grad_norm": 9.127496719360352, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8572921752929688, + "num_tokens": 707798406.0, + "step": 18549 + }, + { + "epoch": 2.3597506678539624, + "ewc_loss": 0.07690154016017914, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003954802523367107, + "grad_norm": 8.911589622497559, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8656637668609619, + "num_tokens": 707835935.0, + "step": 18550 + }, + { + "epoch": 2.3598778781325533, + "ewc_loss": 0.07770183682441711, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004034832236357033, + "grad_norm": 9.247597694396973, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8763610124588013, + "num_tokens": 707872971.0, + "step": 18551 + }, + { + "epoch": 2.3600050884111434, + "ewc_loss": 0.07669322192668915, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039339708746410906, + "grad_norm": 8.940446853637695, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8743681907653809, + "num_tokens": 707909798.0, + "step": 18552 + }, + { + "epoch": 2.360132298689734, + "ewc_loss": 0.07809406518936157, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00040496408473700285, + "grad_norm": 9.176774978637695, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.869400143623352, + "num_tokens": 707951130.0, + "step": 18553 + }, + { + "epoch": 2.3602595089683245, + "ewc_loss": 0.0765533447265625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003919982409570366, + "grad_norm": 8.864813804626465, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8509721755981445, + "num_tokens": 707989502.0, + "step": 18554 + }, + { + "epoch": 2.360386719246915, + "ewc_loss": 0.07800224423408508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040648726280778646, + "grad_norm": 9.21570873260498, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8645778894424438, + "num_tokens": 708023521.0, + "step": 18555 + }, + { + "epoch": 2.3605139295255055, + "ewc_loss": 0.07666009664535522, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039306579856202006, + "grad_norm": 8.894996643066406, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8714025020599365, + "num_tokens": 708057594.0, + "step": 18556 + }, + { + "epoch": 2.360641139804096, + "ewc_loss": 0.07798948884010315, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004063597589265555, + "grad_norm": 9.10966682434082, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8616620302200317, + "num_tokens": 708101393.0, + "step": 18557 + }, + { + "epoch": 2.3607683500826866, + "ewc_loss": 0.07694696635007858, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003959345049224794, + "grad_norm": 8.948744773864746, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.875478982925415, + "num_tokens": 708137141.0, + "step": 18558 + }, + { + "epoch": 2.360895560361277, + "ewc_loss": 0.07758402824401855, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040230515878647566, + "grad_norm": 9.075799942016602, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8700916767120361, + "num_tokens": 708173497.0, + "step": 18559 + }, + { + "epoch": 2.3610227706398677, + "ewc_loss": 0.07725183665752411, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003989831602666527, + "grad_norm": 8.947385787963867, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8660733699798584, + "num_tokens": 708209412.0, + "step": 18560 + }, + { + "epoch": 2.361149980918458, + "ewc_loss": 0.07754427194595337, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004019075131509453, + "grad_norm": 9.159065246582031, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8722676038742065, + "num_tokens": 708239675.0, + "step": 18561 + }, + { + "epoch": 2.3612771911970487, + "ewc_loss": 0.07696223258972168, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003960871836170554, + "grad_norm": 8.874409675598145, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8597545623779297, + "num_tokens": 708274918.0, + "step": 18562 + }, + { + "epoch": 2.3614044014756392, + "ewc_loss": 0.07804112136363983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004068760317750275, + "grad_norm": 9.117660522460938, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8511031270027161, + "num_tokens": 708313645.0, + "step": 18563 + }, + { + "epoch": 2.3615316117542298, + "ewc_loss": 0.07704292237758636, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003968941164202988, + "grad_norm": 8.969246864318848, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.86668860912323, + "num_tokens": 708357985.0, + "step": 18564 + }, + { + "epoch": 2.3616588220328203, + "ewc_loss": 0.07762672007083893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004027319955639541, + "grad_norm": 9.087898254394531, + "learning_rate": 1e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8437577486038208, + "num_tokens": 708393606.0, + "step": 18565 + }, + { + "epoch": 2.361786032311411, + "ewc_loss": 0.07730793952941895, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039954419480636716, + "grad_norm": 9.041975021362305, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8707876205444336, + "num_tokens": 708428941.0, + "step": 18566 + }, + { + "epoch": 2.3619132425900013, + "ewc_loss": 0.0773707926273346, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004001727793365717, + "grad_norm": 9.003518104553223, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8696808218955994, + "num_tokens": 708467847.0, + "step": 18567 + }, + { + "epoch": 2.362040452868592, + "ewc_loss": 0.07720249891281128, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039848985034041107, + "grad_norm": 8.943487167358398, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8817732334136963, + "num_tokens": 708503212.0, + "step": 18568 + }, + { + "epoch": 2.3621676631471824, + "ewc_loss": 0.07739447057247162, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004004095681011677, + "grad_norm": 9.038768768310547, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8648982644081116, + "num_tokens": 708542897.0, + "step": 18569 + }, + { + "epoch": 2.362294873425773, + "ewc_loss": 0.07720720767974854, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003985369112342596, + "grad_norm": 9.44179916381836, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8522880673408508, + "num_tokens": 708574725.0, + "step": 18570 + }, + { + "epoch": 2.3624220837043635, + "ewc_loss": 0.07641027867794037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039056764217093587, + "grad_norm": 8.81750202178955, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8763889670372009, + "num_tokens": 708613557.0, + "step": 18571 + }, + { + "epoch": 2.362549293982954, + "ewc_loss": 0.07807856798171997, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004072505689691752, + "grad_norm": 9.220519065856934, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8574178218841553, + "num_tokens": 708651872.0, + "step": 18572 + }, + { + "epoch": 2.3626765042615445, + "ewc_loss": 0.07624171674251556, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00038888200651854277, + "grad_norm": 8.708944320678711, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8517528176307678, + "num_tokens": 708688654.0, + "step": 18573 + }, + { + "epoch": 2.362803714540135, + "ewc_loss": 0.07843810319900513, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041084588156081736, + "grad_norm": 9.260737419128418, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.870590329170227, + "num_tokens": 708723285.0, + "step": 18574 + }, + { + "epoch": 2.362930924818725, + "ewc_loss": 0.07650455832481384, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039151046075858176, + "grad_norm": 8.778861999511719, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.873999834060669, + "num_tokens": 708760617.0, + "step": 18575 + }, + { + "epoch": 2.363058135097316, + "ewc_loss": 0.078611359000206, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004125784616917372, + "grad_norm": 9.384157180786133, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8699145913124084, + "num_tokens": 708790792.0, + "step": 18576 + }, + { + "epoch": 2.363185345375906, + "ewc_loss": 0.0765448808670044, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039191366522572935, + "grad_norm": 8.810810089111328, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8563560247421265, + "num_tokens": 708829935.0, + "step": 18577 + }, + { + "epoch": 2.3633125556544967, + "ewc_loss": 0.0785883367061615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041234822128899395, + "grad_norm": 9.240477561950684, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8590914607048035, + "num_tokens": 708870578.0, + "step": 18578 + }, + { + "epoch": 2.3634397659330872, + "ewc_loss": 0.07681122422218323, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003945770440623164, + "grad_norm": 8.892946243286133, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8643474578857422, + "num_tokens": 708907044.0, + "step": 18579 + }, + { + "epoch": 2.3635669762116778, + "ewc_loss": 0.07823961228132248, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040886097121983767, + "grad_norm": 9.256858825683594, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8473933339118958, + "num_tokens": 708947443.0, + "step": 18580 + }, + { + "epoch": 2.3636941864902683, + "ewc_loss": 0.076726034283638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039372514584101737, + "grad_norm": 8.847871780395508, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8663830161094666, + "num_tokens": 708983174.0, + "step": 18581 + }, + { + "epoch": 2.363821396768859, + "ewc_loss": 0.07816486060619354, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004081134975422174, + "grad_norm": 9.208290100097656, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.868942379951477, + "num_tokens": 709021950.0, + "step": 18582 + }, + { + "epoch": 2.3639486070474494, + "ewc_loss": 0.07701478153467178, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039661265327595174, + "grad_norm": 8.963050842285156, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8703960180282593, + "num_tokens": 709057236.0, + "step": 18583 + }, + { + "epoch": 2.36407581732604, + "ewc_loss": 0.07775536179542542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040401844307780266, + "grad_norm": 9.089593887329102, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8668269515037537, + "num_tokens": 709093376.0, + "step": 18584 + }, + { + "epoch": 2.3642030276046304, + "ewc_loss": 0.07721182703971863, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039858309901319444, + "grad_norm": 9.035257339477539, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8591597676277161, + "num_tokens": 709129502.0, + "step": 18585 + }, + { + "epoch": 2.364330237883221, + "ewc_loss": 0.07737414538860321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004002062778454274, + "grad_norm": 9.031144142150879, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.862649142742157, + "num_tokens": 709162538.0, + "step": 18586 + }, + { + "epoch": 2.3644574481618115, + "ewc_loss": 0.07723593711853027, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039882425335235894, + "grad_norm": 8.960553169250488, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8541161417961121, + "num_tokens": 709201690.0, + "step": 18587 + }, + { + "epoch": 2.364584658440402, + "ewc_loss": 0.07745043188333511, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040096917655318975, + "grad_norm": 9.061583518981934, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8582514524459839, + "num_tokens": 709240593.0, + "step": 18588 + }, + { + "epoch": 2.3647118687189925, + "ewc_loss": 0.07711711525917053, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003976359439548105, + "grad_norm": 8.924406051635742, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8759493231773376, + "num_tokens": 709278192.0, + "step": 18589 + }, + { + "epoch": 2.364839078997583, + "ewc_loss": 0.0774490088224411, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004009548865724355, + "grad_norm": 8.99622631072998, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8830747604370117, + "num_tokens": 709311797.0, + "step": 18590 + }, + { + "epoch": 2.3649662892761736, + "ewc_loss": 0.07719654589891434, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003984303039032966, + "grad_norm": 8.922879219055176, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8635671734809875, + "num_tokens": 709350379.0, + "step": 18591 + }, + { + "epoch": 2.365093499554764, + "ewc_loss": 0.07758252322673798, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004022900538984686, + "grad_norm": 9.076276779174805, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8903661966323853, + "num_tokens": 709389030.0, + "step": 18592 + }, + { + "epoch": 2.3652207098333546, + "ewc_loss": 0.07705222815275192, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039698713226243854, + "grad_norm": 8.898406028747559, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8575481176376343, + "num_tokens": 709433692.0, + "step": 18593 + }, + { + "epoch": 2.365347920111945, + "ewc_loss": 0.07764597237110138, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004029246047139168, + "grad_norm": 9.084366798400879, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8639070987701416, + "num_tokens": 709470773.0, + "step": 18594 + }, + { + "epoch": 2.3654751303905357, + "ewc_loss": 0.07696391642093658, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039610397652722895, + "grad_norm": 8.955947875976562, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.859221339225769, + "num_tokens": 709506764.0, + "step": 18595 + }, + { + "epoch": 2.3656023406691262, + "ewc_loss": 0.07751482725143433, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040161312790587544, + "grad_norm": 9.029232025146484, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8612726926803589, + "num_tokens": 709546931.0, + "step": 18596 + }, + { + "epoch": 2.3657295509477168, + "ewc_loss": 0.0772300586104393, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003987654345110059, + "grad_norm": 9.007092475891113, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8656690120697021, + "num_tokens": 709582842.0, + "step": 18597 + }, + { + "epoch": 2.365856761226307, + "ewc_loss": 0.07734846323728561, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039994949474930763, + "grad_norm": 8.959059715270996, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8656728267669678, + "num_tokens": 709621574.0, + "step": 18598 + }, + { + "epoch": 2.365983971504898, + "ewc_loss": 0.07759147137403488, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040237954817712307, + "grad_norm": 9.019115447998047, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8829938173294067, + "num_tokens": 709662437.0, + "step": 18599 + }, + { + "epoch": 2.366111181783488, + "ewc_loss": 0.07723487913608551, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039881368866190314, + "grad_norm": 8.934630393981934, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8786084055900574, + "num_tokens": 709701318.0, + "step": 18600 + }, + { + "epoch": 2.3662383920620784, + "ewc_loss": 0.07760512828826904, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004025160742457956, + "grad_norm": 9.04826545715332, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8661267757415771, + "num_tokens": 709745156.0, + "step": 18601 + }, + { + "epoch": 2.366365602340669, + "ewc_loss": 0.0773000419139862, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039946529432199895, + "grad_norm": 9.031891822814941, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8595046997070312, + "num_tokens": 709785849.0, + "step": 18602 + }, + { + "epoch": 2.3664928126192595, + "ewc_loss": 0.07732103019952774, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000399675132939592, + "grad_norm": 9.03880786895752, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8701158761978149, + "num_tokens": 709830038.0, + "step": 18603 + }, + { + "epoch": 2.36662002289785, + "ewc_loss": 0.07734896242618561, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003999544424004853, + "grad_norm": 9.023200988769531, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8622272610664368, + "num_tokens": 709871174.0, + "step": 18604 + }, + { + "epoch": 2.3667472331764405, + "ewc_loss": 0.0772448480129242, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000398913340177387, + "grad_norm": 9.001132011413574, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8856850266456604, + "num_tokens": 709905921.0, + "step": 18605 + }, + { + "epoch": 2.366874443455031, + "ewc_loss": 0.0773601084947586, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004000658809673041, + "grad_norm": 8.981147766113281, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8805181980133057, + "num_tokens": 709939856.0, + "step": 18606 + }, + { + "epoch": 2.3670016537336216, + "ewc_loss": 0.07742930203676224, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004007578536402434, + "grad_norm": 9.02633285522461, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8782320022583008, + "num_tokens": 709978805.0, + "step": 18607 + }, + { + "epoch": 2.367128864012212, + "ewc_loss": 0.07727883756160736, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003992531856056303, + "grad_norm": 8.99350643157959, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8787691593170166, + "num_tokens": 710015015.0, + "step": 18608 + }, + { + "epoch": 2.3672560742908026, + "ewc_loss": 0.07742255926132202, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040069047827273607, + "grad_norm": 9.003974914550781, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.85691237449646, + "num_tokens": 710054887.0, + "step": 18609 + }, + { + "epoch": 2.367383284569393, + "ewc_loss": 0.07740885019302368, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004005533701274544, + "grad_norm": 8.954444885253906, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8656004667282104, + "num_tokens": 710097012.0, + "step": 18610 + }, + { + "epoch": 2.3675104948479837, + "ewc_loss": 0.07745227217674255, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040098754107020795, + "grad_norm": 8.997498512268066, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8495801687240601, + "num_tokens": 710130912.0, + "step": 18611 + }, + { + "epoch": 2.3676377051265742, + "ewc_loss": 0.07744376361370087, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004009024705737829, + "grad_norm": 8.956811904907227, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8624480962753296, + "num_tokens": 710168880.0, + "step": 18612 + }, + { + "epoch": 2.3677649154051648, + "ewc_loss": 0.07768888771533966, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040335371159017086, + "grad_norm": 8.994558334350586, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8742880821228027, + "num_tokens": 710205619.0, + "step": 18613 + }, + { + "epoch": 2.3678921256837553, + "ewc_loss": 0.07752422988414764, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040170710417442024, + "grad_norm": 9.028477668762207, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8713480234146118, + "num_tokens": 710239475.0, + "step": 18614 + }, + { + "epoch": 2.368019335962346, + "ewc_loss": 0.0774163007736206, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004006278468295932, + "grad_norm": 8.949213981628418, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8688364028930664, + "num_tokens": 710276533.0, + "step": 18615 + }, + { + "epoch": 2.3681465462409363, + "ewc_loss": 0.07754722237586975, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004019371117465198, + "grad_norm": 8.915901184082031, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8804839849472046, + "num_tokens": 710316731.0, + "step": 18616 + }, + { + "epoch": 2.368273756519527, + "ewc_loss": 0.07758775353431702, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004023424116894603, + "grad_norm": 9.023412704467773, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.86045902967453, + "num_tokens": 710352310.0, + "step": 18617 + }, + { + "epoch": 2.3684009667981174, + "ewc_loss": 0.07755018770694733, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004019667685497552, + "grad_norm": 8.915339469909668, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8611640930175781, + "num_tokens": 710400203.0, + "step": 18618 + }, + { + "epoch": 2.368528177076708, + "ewc_loss": 0.07781804352998734, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004046452813781798, + "grad_norm": 9.004924774169922, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8714336156845093, + "num_tokens": 710433054.0, + "step": 18619 + }, + { + "epoch": 2.3686553873552985, + "ewc_loss": 0.07747313380241394, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040119621553458273, + "grad_norm": 8.922219276428223, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8688035607337952, + "num_tokens": 710466879.0, + "step": 18620 + }, + { + "epoch": 2.368782597633889, + "ewc_loss": 0.07782880961894989, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040475293644703925, + "grad_norm": 9.035143852233887, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8592206239700317, + "num_tokens": 710502808.0, + "step": 18621 + }, + { + "epoch": 2.3689098079124795, + "ewc_loss": 0.0775151401758194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040161621291190386, + "grad_norm": 8.885370254516602, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.854009211063385, + "num_tokens": 710546190.0, + "step": 18622 + }, + { + "epoch": 2.3690370181910696, + "ewc_loss": 0.0780138149857521, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004066030087415129, + "grad_norm": 9.08824348449707, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8671518564224243, + "num_tokens": 710585556.0, + "step": 18623 + }, + { + "epoch": 2.3691642284696606, + "ewc_loss": 0.07736045122146606, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004000693734269589, + "grad_norm": 8.828836441040039, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8696467876434326, + "num_tokens": 710621366.0, + "step": 18624 + }, + { + "epoch": 2.3692914387482507, + "ewc_loss": 0.07828712463378906, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004093360621482134, + "grad_norm": 9.132892608642578, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8703861236572266, + "num_tokens": 710660217.0, + "step": 18625 + }, + { + "epoch": 2.369418649026841, + "ewc_loss": 0.07713401317596436, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003978049790021032, + "grad_norm": 8.857246398925781, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8620385527610779, + "num_tokens": 710695798.0, + "step": 18626 + }, + { + "epoch": 2.3695458593054317, + "ewc_loss": 0.0783584713935852, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041004957165569067, + "grad_norm": 9.098653793334961, + "learning_rate": 1e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8480526804924011, + "num_tokens": 710732739.0, + "step": 18627 + }, + { + "epoch": 2.3696730695840222, + "ewc_loss": 0.077174112200737, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003982059715781361, + "grad_norm": 8.8847074508667, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8551880717277527, + "num_tokens": 710769651.0, + "step": 18628 + }, + { + "epoch": 2.3698002798626128, + "ewc_loss": 0.0783107727766037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004095725598745048, + "grad_norm": 9.09687328338623, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8642928600311279, + "num_tokens": 710809487.0, + "step": 18629 + }, + { + "epoch": 2.3699274901412033, + "ewc_loss": 0.07745082676410675, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040097307646647096, + "grad_norm": 8.900477409362793, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8700031042098999, + "num_tokens": 710849152.0, + "step": 18630 + }, + { + "epoch": 2.370054700419794, + "ewc_loss": 0.07808975130319595, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040736235678195953, + "grad_norm": 9.044965744018555, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8503378033638, + "num_tokens": 710890933.0, + "step": 18631 + }, + { + "epoch": 2.3701819106983844, + "ewc_loss": 0.07760706543922424, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040253548650071025, + "grad_norm": 8.979185104370117, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8684476017951965, + "num_tokens": 710928262.0, + "step": 18632 + }, + { + "epoch": 2.370309120976975, + "ewc_loss": 0.07795564085245132, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040602125227451324, + "grad_norm": 9.04097843170166, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8703368306159973, + "num_tokens": 710962969.0, + "step": 18633 + }, + { + "epoch": 2.3704363312555654, + "ewc_loss": 0.07763754576444626, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004028402909170836, + "grad_norm": 8.987603187561035, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8616102933883667, + "num_tokens": 710998717.0, + "step": 18634 + }, + { + "epoch": 2.370563541534156, + "ewc_loss": 0.07782382518053055, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040470308158546686, + "grad_norm": 9.06967544555664, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8512955904006958, + "num_tokens": 711032819.0, + "step": 18635 + }, + { + "epoch": 2.3706907518127465, + "ewc_loss": 0.07730817794799805, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003995466395281255, + "grad_norm": 8.959940910339355, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.855491042137146, + "num_tokens": 711069395.0, + "step": 18636 + }, + { + "epoch": 2.370817962091337, + "ewc_loss": 0.07769429683685303, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040340778650715947, + "grad_norm": 9.03996753692627, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8669682741165161, + "num_tokens": 711105754.0, + "step": 18637 + }, + { + "epoch": 2.3709451723699275, + "ewc_loss": 0.07738566398620605, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040032152901403606, + "grad_norm": 8.97708797454834, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8608897924423218, + "num_tokens": 711145291.0, + "step": 18638 + }, + { + "epoch": 2.371072382648518, + "ewc_loss": 0.07757885754108429, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040225344127975404, + "grad_norm": 9.046026229858398, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8658603429794312, + "num_tokens": 711183431.0, + "step": 18639 + }, + { + "epoch": 2.3711995929271086, + "ewc_loss": 0.07721912860870361, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003986561205238104, + "grad_norm": 8.940369606018066, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8509315252304077, + "num_tokens": 711229754.0, + "step": 18640 + }, + { + "epoch": 2.371326803205699, + "ewc_loss": 0.07749303430318832, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004013951984234154, + "grad_norm": 9.004585266113281, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8664486408233643, + "num_tokens": 711269071.0, + "step": 18641 + }, + { + "epoch": 2.3714540134842896, + "ewc_loss": 0.07725673913955688, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039903228753246367, + "grad_norm": 8.96515941619873, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8611306548118591, + "num_tokens": 711312036.0, + "step": 18642 + }, + { + "epoch": 2.37158122376288, + "ewc_loss": 0.07754009962081909, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004018658073619008, + "grad_norm": 9.061786651611328, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.872981071472168, + "num_tokens": 711347945.0, + "step": 18643 + }, + { + "epoch": 2.3717084340414707, + "ewc_loss": 0.0772182047367096, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003986468946095556, + "grad_norm": 8.913555145263672, + "learning_rate": 1e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.8429645299911499, + "num_tokens": 711388433.0, + "step": 18644 + }, + { + "epoch": 2.371835644320061, + "ewc_loss": 0.07777278870344162, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040419274591840804, + "grad_norm": 9.087359428405762, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8700157403945923, + "num_tokens": 711425821.0, + "step": 18645 + }, + { + "epoch": 2.3719628545986517, + "ewc_loss": 0.07709041237831116, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003973689745180309, + "grad_norm": 8.96056842803955, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8590850830078125, + "num_tokens": 711462725.0, + "step": 18646 + }, + { + "epoch": 2.3720900648772423, + "ewc_loss": 0.0776415467262268, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040288030868396163, + "grad_norm": 9.010530471801758, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8794419765472412, + "num_tokens": 711499987.0, + "step": 18647 + }, + { + "epoch": 2.3722172751558324, + "ewc_loss": 0.07722118496894836, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039867666782811284, + "grad_norm": 8.980525016784668, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8493614792823792, + "num_tokens": 711543658.0, + "step": 18648 + }, + { + "epoch": 2.3723444854344233, + "ewc_loss": 0.07748427987098694, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004013076249975711, + "grad_norm": 9.024038314819336, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8763408660888672, + "num_tokens": 711583875.0, + "step": 18649 + }, + { + "epoch": 2.3724716957130134, + "ewc_loss": 0.07738649845123291, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040032988181337714, + "grad_norm": 9.008890151977539, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8471120595932007, + "num_tokens": 711625543.0, + "step": 18650 + }, + { + "epoch": 2.372598905991604, + "ewc_loss": 0.07744133472442627, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008781397715211, + "grad_norm": 9.066551208496094, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8666051030158997, + "num_tokens": 711660528.0, + "step": 18651 + }, + { + "epoch": 2.3727261162701945, + "ewc_loss": 0.07720637321472168, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039852861664257944, + "grad_norm": 9.03251838684082, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8798545002937317, + "num_tokens": 711695646.0, + "step": 18652 + }, + { + "epoch": 2.372853326548785, + "ewc_loss": 0.07740691304206848, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004005339287687093, + "grad_norm": 9.051877975463867, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8493141531944275, + "num_tokens": 711734691.0, + "step": 18653 + }, + { + "epoch": 2.3729805368273755, + "ewc_loss": 0.07718672603368759, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039833210757933557, + "grad_norm": 8.998510360717773, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8679803013801575, + "num_tokens": 711771518.0, + "step": 18654 + }, + { + "epoch": 2.373107747105966, + "ewc_loss": 0.07734914124011993, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039995621773414314, + "grad_norm": 9.04874038696289, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8546179533004761, + "num_tokens": 711808968.0, + "step": 18655 + }, + { + "epoch": 2.3732349573845566, + "ewc_loss": 0.07724101841449738, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003988750104326755, + "grad_norm": 8.988574028015137, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8726114630699158, + "num_tokens": 711842944.0, + "step": 18656 + }, + { + "epoch": 2.373362167663147, + "ewc_loss": 0.07748521864414215, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040131702553480864, + "grad_norm": 9.113553047180176, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8611640930175781, + "num_tokens": 711882740.0, + "step": 18657 + }, + { + "epoch": 2.3734893779417376, + "ewc_loss": 0.07706427574157715, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003971075639128685, + "grad_norm": 8.90418815612793, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.859441339969635, + "num_tokens": 711925362.0, + "step": 18658 + }, + { + "epoch": 2.373616588220328, + "ewc_loss": 0.07772134244441986, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004036783066112548, + "grad_norm": 9.111869812011719, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.869472086429596, + "num_tokens": 711971285.0, + "step": 18659 + }, + { + "epoch": 2.3737437984989187, + "ewc_loss": 0.0770782083272934, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039724691305309534, + "grad_norm": 8.989361763000488, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8612587451934814, + "num_tokens": 712013306.0, + "step": 18660 + }, + { + "epoch": 2.3738710087775092, + "ewc_loss": 0.0776536762714386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004030015552416444, + "grad_norm": 9.072968482971191, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8650298714637756, + "num_tokens": 712050837.0, + "step": 18661 + }, + { + "epoch": 2.3739982190560998, + "ewc_loss": 0.07717613875865936, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003982262860517949, + "grad_norm": 8.912312507629395, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8811179399490356, + "num_tokens": 712085140.0, + "step": 18662 + }, + { + "epoch": 2.3741254293346903, + "ewc_loss": 0.0776970386505127, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040343526052311063, + "grad_norm": 9.076167106628418, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8794493675231934, + "num_tokens": 712122495.0, + "step": 18663 + }, + { + "epoch": 2.374252639613281, + "ewc_loss": 0.07705950736999512, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003970599500462413, + "grad_norm": 8.924581527709961, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8585688471794128, + "num_tokens": 712161312.0, + "step": 18664 + }, + { + "epoch": 2.3743798498918713, + "ewc_loss": 0.07772471755743027, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004037120088469237, + "grad_norm": 9.075601577758789, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8636751174926758, + "num_tokens": 712197275.0, + "step": 18665 + }, + { + "epoch": 2.374507060170462, + "ewc_loss": 0.07723075151443481, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003987723612226546, + "grad_norm": 8.95740795135498, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8524305820465088, + "num_tokens": 712231309.0, + "step": 18666 + }, + { + "epoch": 2.3746342704490524, + "ewc_loss": 0.0777498334646225, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004039631749037653, + "grad_norm": 9.120987892150879, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8607730269432068, + "num_tokens": 712264864.0, + "step": 18667 + }, + { + "epoch": 2.374761480727643, + "ewc_loss": 0.07719297707080841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003983945935033262, + "grad_norm": 8.95898151397705, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.86533123254776, + "num_tokens": 712303564.0, + "step": 18668 + }, + { + "epoch": 2.3748886910062335, + "ewc_loss": 0.07782095670700073, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040467444341629744, + "grad_norm": 9.030571937561035, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8749409914016724, + "num_tokens": 712341872.0, + "step": 18669 + }, + { + "epoch": 2.375015901284824, + "ewc_loss": 0.07730802893638611, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003995451843366027, + "grad_norm": 8.962289810180664, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8531808853149414, + "num_tokens": 712379835.0, + "step": 18670 + }, + { + "epoch": 2.3751431115634145, + "ewc_loss": 0.07758510112762451, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004023158398922533, + "grad_norm": 9.0577392578125, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8614948987960815, + "num_tokens": 712417462.0, + "step": 18671 + }, + { + "epoch": 2.375270321842005, + "ewc_loss": 0.07721307873725891, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039859561366029084, + "grad_norm": 8.892627716064453, + "learning_rate": 1e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8380588293075562, + "num_tokens": 712461581.0, + "step": 18672 + }, + { + "epoch": 2.375397532120595, + "ewc_loss": 0.07788999378681183, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004053647571709007, + "grad_norm": 9.09328842163086, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8624715805053711, + "num_tokens": 712499615.0, + "step": 18673 + }, + { + "epoch": 2.375524742399186, + "ewc_loss": 0.07711473107337952, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003976121370214969, + "grad_norm": 8.941535949707031, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8761305212974548, + "num_tokens": 712533264.0, + "step": 18674 + }, + { + "epoch": 2.375651952677776, + "ewc_loss": 0.07794170826673508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040588193223811686, + "grad_norm": 9.038758277893066, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8574143648147583, + "num_tokens": 712571496.0, + "step": 18675 + }, + { + "epoch": 2.3757791629563667, + "ewc_loss": 0.07743392884731293, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040080409962683916, + "grad_norm": 8.994013786315918, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8708363771438599, + "num_tokens": 712606510.0, + "step": 18676 + }, + { + "epoch": 2.3759063732349572, + "ewc_loss": 0.07762809097766876, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004027457907795906, + "grad_norm": 9.030867576599121, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8739230036735535, + "num_tokens": 712646218.0, + "step": 18677 + }, + { + "epoch": 2.3760335835135478, + "ewc_loss": 0.07749154418706894, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004013802972622216, + "grad_norm": 8.943846702575684, + "learning_rate": 1e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.832239031791687, + "num_tokens": 712681905.0, + "step": 18678 + }, + { + "epoch": 2.3761607937921383, + "ewc_loss": 0.07785432040691376, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000405008060624823, + "grad_norm": 9.07395076751709, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8768981695175171, + "num_tokens": 712719916.0, + "step": 18679 + }, + { + "epoch": 2.376288004070729, + "ewc_loss": 0.07727591693401337, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003992239944636822, + "grad_norm": 8.91782283782959, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8655045032501221, + "num_tokens": 712760849.0, + "step": 18680 + }, + { + "epoch": 2.3764152143493193, + "ewc_loss": 0.07787864655256271, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040525131043978035, + "grad_norm": 9.008914947509766, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8760858774185181, + "num_tokens": 712801199.0, + "step": 18681 + }, + { + "epoch": 2.37654242462791, + "ewc_loss": 0.07737614214420319, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040022621396929026, + "grad_norm": 8.998515129089355, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8578187227249146, + "num_tokens": 712843471.0, + "step": 18682 + }, + { + "epoch": 2.3766696349065004, + "ewc_loss": 0.07752711325883865, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004017359751742333, + "grad_norm": 9.058788299560547, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8648096323013306, + "num_tokens": 712880649.0, + "step": 18683 + }, + { + "epoch": 2.376796845185091, + "ewc_loss": 0.07740237563848495, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040048861410468817, + "grad_norm": 8.91516399383545, + "learning_rate": 1e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8426756858825684, + "num_tokens": 712920574.0, + "step": 18684 + }, + { + "epoch": 2.3769240554636815, + "ewc_loss": 0.0778392106294632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004048569535370916, + "grad_norm": 9.054595947265625, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8743364810943604, + "num_tokens": 712962544.0, + "step": 18685 + }, + { + "epoch": 2.377051265742272, + "ewc_loss": 0.07735088467597961, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003999736800324172, + "grad_norm": 8.96798324584961, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.867905855178833, + "num_tokens": 712999138.0, + "step": 18686 + }, + { + "epoch": 2.3771784760208625, + "ewc_loss": 0.07755281031131744, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004019929619971663, + "grad_norm": 9.007988929748535, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8661815524101257, + "num_tokens": 713041799.0, + "step": 18687 + }, + { + "epoch": 2.377305686299453, + "ewc_loss": 0.07728759199380875, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003993407590314746, + "grad_norm": 8.926960945129395, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8575776219367981, + "num_tokens": 713088248.0, + "step": 18688 + }, + { + "epoch": 2.3774328965780436, + "ewc_loss": 0.07776106894016266, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040407548658549786, + "grad_norm": 9.053597450256348, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.873462975025177, + "num_tokens": 713127239.0, + "step": 18689 + }, + { + "epoch": 2.377560106856634, + "ewc_loss": 0.07725584506988525, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003990232653450221, + "grad_norm": 8.857528686523438, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.869117259979248, + "num_tokens": 713168389.0, + "step": 18690 + }, + { + "epoch": 2.3776873171352246, + "ewc_loss": 0.07797364145517349, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040620125946588814, + "grad_norm": 9.189679145812988, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8583459854125977, + "num_tokens": 713208137.0, + "step": 18691 + }, + { + "epoch": 2.377814527413815, + "ewc_loss": 0.07712755352258682, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003977403976023197, + "grad_norm": 8.93138313293457, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8549929857254028, + "num_tokens": 713249344.0, + "step": 18692 + }, + { + "epoch": 2.3779417376924057, + "ewc_loss": 0.07804788649082184, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040694375638850033, + "grad_norm": 9.074505805969238, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8619111776351929, + "num_tokens": 713291376.0, + "step": 18693 + }, + { + "epoch": 2.378068947970996, + "ewc_loss": 0.07719947397708893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039845958235673606, + "grad_norm": 8.946354866027832, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8635253310203552, + "num_tokens": 713329925.0, + "step": 18694 + }, + { + "epoch": 2.3781961582495867, + "ewc_loss": 0.07802749425172806, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004067397676408291, + "grad_norm": 9.0592041015625, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8671835064888, + "num_tokens": 713372509.0, + "step": 18695 + }, + { + "epoch": 2.378323368528177, + "ewc_loss": 0.07717397809028625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003982046036981046, + "grad_norm": 8.883179664611816, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.858342707157135, + "num_tokens": 713410055.0, + "step": 18696 + }, + { + "epoch": 2.378450578806768, + "ewc_loss": 0.07803714275360107, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040683624683879316, + "grad_norm": 9.09562873840332, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.862071692943573, + "num_tokens": 713453958.0, + "step": 18697 + }, + { + "epoch": 2.378577789085358, + "ewc_loss": 0.07738979160785675, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004003627400379628, + "grad_norm": 8.938307762145996, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8633533716201782, + "num_tokens": 713485683.0, + "step": 18698 + }, + { + "epoch": 2.3787049993639484, + "ewc_loss": 0.07811949402093887, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004076597688253969, + "grad_norm": 9.056235313415527, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8659207224845886, + "num_tokens": 713528565.0, + "step": 18699 + }, + { + "epoch": 2.378832209642539, + "ewc_loss": 0.07731746137142181, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004020808555651456, + "grad_norm": 9.017889976501465, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8595356941223145, + "num_tokens": 713560932.0, + "step": 18700 + }, + { + "epoch": 2.3789594199211295, + "ewc_loss": 0.07785531133413315, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040501795592717826, + "grad_norm": 9.055828094482422, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8611475229263306, + "num_tokens": 713592891.0, + "step": 18701 + }, + { + "epoch": 2.37908663019972, + "ewc_loss": 0.0776323676109314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004027885152027011, + "grad_norm": 8.96803092956543, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8612800240516663, + "num_tokens": 713631285.0, + "step": 18702 + }, + { + "epoch": 2.3792138404783105, + "ewc_loss": 0.07790080457925797, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040547287790104747, + "grad_norm": 9.04267406463623, + "learning_rate": 1e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8468970060348511, + "num_tokens": 713675143.0, + "step": 18703 + }, + { + "epoch": 2.379341050756901, + "ewc_loss": 0.07753761857748032, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040184101089835167, + "grad_norm": 8.964393615722656, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8703193664550781, + "num_tokens": 713717556.0, + "step": 18704 + }, + { + "epoch": 2.3794682610354916, + "ewc_loss": 0.07783055305480957, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040477036964148283, + "grad_norm": 9.085737228393555, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8563128709793091, + "num_tokens": 713763515.0, + "step": 18705 + }, + { + "epoch": 2.379595471314082, + "ewc_loss": 0.07724869251251221, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039895172812975943, + "grad_norm": 8.924906730651855, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8691180944442749, + "num_tokens": 713801064.0, + "step": 18706 + }, + { + "epoch": 2.3797226815926726, + "ewc_loss": 0.0778852328658104, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040531717240810394, + "grad_norm": 9.017972946166992, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8606284856796265, + "num_tokens": 713841662.0, + "step": 18707 + }, + { + "epoch": 2.379849891871263, + "ewc_loss": 0.07743014395236969, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040076623554341495, + "grad_norm": 9.003747940063477, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8679786920547485, + "num_tokens": 713883646.0, + "step": 18708 + }, + { + "epoch": 2.3799771021498537, + "ewc_loss": 0.0776628702878952, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004030935524497181, + "grad_norm": 8.980016708374023, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8850967288017273, + "num_tokens": 713925365.0, + "step": 18709 + }, + { + "epoch": 2.3801043124284442, + "ewc_loss": 0.07767202705144882, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000403185113100335, + "grad_norm": 9.073812484741211, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8610695004463196, + "num_tokens": 713961142.0, + "step": 18710 + }, + { + "epoch": 2.3802315227070348, + "ewc_loss": 0.07741247117519379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040058951708488166, + "grad_norm": 8.955005645751953, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8718578815460205, + "num_tokens": 714001652.0, + "step": 18711 + }, + { + "epoch": 2.3803587329856253, + "ewc_loss": 0.07785758376121521, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004050406569149345, + "grad_norm": 9.129423141479492, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8673425912857056, + "num_tokens": 714043023.0, + "step": 18712 + }, + { + "epoch": 2.380485943264216, + "ewc_loss": 0.07721740752458572, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039863892016001046, + "grad_norm": 8.883209228515625, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8640682697296143, + "num_tokens": 714080389.0, + "step": 18713 + }, + { + "epoch": 2.3806131535428063, + "ewc_loss": 0.07777060568332672, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040661232196725905, + "grad_norm": 9.150981903076172, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8690712451934814, + "num_tokens": 714117695.0, + "step": 18714 + }, + { + "epoch": 2.380740363821397, + "ewc_loss": 0.07706265151500702, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003970913530793041, + "grad_norm": 8.8671875, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8724825382232666, + "num_tokens": 714154204.0, + "step": 18715 + }, + { + "epoch": 2.3808675740999874, + "ewc_loss": 0.07807831466197968, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040968938264995813, + "grad_norm": 9.144021034240723, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8476771116256714, + "num_tokens": 714194857.0, + "step": 18716 + }, + { + "epoch": 2.380994784378578, + "ewc_loss": 0.07681655883789062, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003970718535128981, + "grad_norm": 8.903491973876953, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8758211135864258, + "num_tokens": 714228034.0, + "step": 18717 + }, + { + "epoch": 2.3811219946571685, + "ewc_loss": 0.07816894352436066, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040815427200868726, + "grad_norm": 9.051234245300293, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8707643747329712, + "num_tokens": 714269688.0, + "step": 18718 + }, + { + "epoch": 2.381249204935759, + "ewc_loss": 0.07733166217803955, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039978142012842, + "grad_norm": 8.958943367004395, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.858483612537384, + "num_tokens": 714309039.0, + "step": 18719 + }, + { + "epoch": 2.3813764152143495, + "ewc_loss": 0.07766211032867432, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000405527331167832, + "grad_norm": 9.030305862426758, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8704986572265625, + "num_tokens": 714347453.0, + "step": 18720 + }, + { + "epoch": 2.3815036254929396, + "ewc_loss": 0.07764904201030731, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004029552510473877, + "grad_norm": 8.93725872039795, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.861807107925415, + "num_tokens": 714385850.0, + "step": 18721 + }, + { + "epoch": 2.3816308357715306, + "ewc_loss": 0.07792483270168304, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040571315912529826, + "grad_norm": 9.06916618347168, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8594714403152466, + "num_tokens": 714426735.0, + "step": 18722 + }, + { + "epoch": 2.3817580460501206, + "ewc_loss": 0.07740150392055511, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040292125777341425, + "grad_norm": 8.976113319396973, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8769791126251221, + "num_tokens": 714465279.0, + "step": 18723 + }, + { + "epoch": 2.381885256328711, + "ewc_loss": 0.07801143825054169, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004065792018081993, + "grad_norm": 9.065801620483398, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8527328968048096, + "num_tokens": 714505703.0, + "step": 18724 + }, + { + "epoch": 2.3820124666073017, + "ewc_loss": 0.07753553241491318, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040182017255574465, + "grad_norm": 8.978875160217285, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8591785430908203, + "num_tokens": 714543642.0, + "step": 18725 + }, + { + "epoch": 2.3821396768858922, + "ewc_loss": 0.07810914516448975, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040755627560429275, + "grad_norm": 9.048444747924805, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8815287351608276, + "num_tokens": 714583533.0, + "step": 18726 + }, + { + "epoch": 2.3822668871644828, + "ewc_loss": 0.07765306532382965, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004029955016449094, + "grad_norm": 8.962631225585938, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8699339628219604, + "num_tokens": 714625577.0, + "step": 18727 + }, + { + "epoch": 2.3823940974430733, + "ewc_loss": 0.07810428738594055, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004075077304150909, + "grad_norm": 9.080571174621582, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8601444959640503, + "num_tokens": 714664987.0, + "step": 18728 + }, + { + "epoch": 2.382521307721664, + "ewc_loss": 0.07771062850952148, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004035711754113436, + "grad_norm": 8.998998641967773, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8602175116539001, + "num_tokens": 714705881.0, + "step": 18729 + }, + { + "epoch": 2.3826485180002543, + "ewc_loss": 0.07802467793226242, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004067116242367774, + "grad_norm": 9.120957374572754, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8548401594161987, + "num_tokens": 714746012.0, + "step": 18730 + }, + { + "epoch": 2.382775728278845, + "ewc_loss": 0.07749368995428085, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004014017467852682, + "grad_norm": 8.939456939697266, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8664361238479614, + "num_tokens": 714790001.0, + "step": 18731 + }, + { + "epoch": 2.3829029385574354, + "ewc_loss": 0.07821221649646759, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000408586987759918, + "grad_norm": 9.127946853637695, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8722727298736572, + "num_tokens": 714824515.0, + "step": 18732 + }, + { + "epoch": 2.383030148836026, + "ewc_loss": 0.07744260132312775, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008908581454307, + "grad_norm": 8.951480865478516, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.857575535774231, + "num_tokens": 714860841.0, + "step": 18733 + }, + { + "epoch": 2.3831573591146165, + "ewc_loss": 0.07802923023700714, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040919851744547486, + "grad_norm": 9.081900596618652, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8686355948448181, + "num_tokens": 714896373.0, + "step": 18734 + }, + { + "epoch": 2.383284569393207, + "ewc_loss": 0.0774930790066719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004013956349808723, + "grad_norm": 8.926332473754883, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8676377534866333, + "num_tokens": 714926684.0, + "step": 18735 + }, + { + "epoch": 2.3834117796717975, + "ewc_loss": 0.07794662564992905, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004083725216332823, + "grad_norm": 9.059244155883789, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.880813479423523, + "num_tokens": 714961057.0, + "step": 18736 + }, + { + "epoch": 2.383538989950388, + "ewc_loss": 0.07774067670106888, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040387161425314844, + "grad_norm": 8.970043182373047, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8704759478569031, + "num_tokens": 715001215.0, + "step": 18737 + }, + { + "epoch": 2.3836662002289786, + "ewc_loss": 0.07776503264904022, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004065565881319344, + "grad_norm": 9.063145637512207, + "learning_rate": 1e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8442176580429077, + "num_tokens": 715038252.0, + "step": 18738 + }, + { + "epoch": 2.383793410507569, + "ewc_loss": 0.0778426080942154, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004048909468110651, + "grad_norm": 9.003763198852539, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8643699884414673, + "num_tokens": 715078320.0, + "step": 18739 + }, + { + "epoch": 2.3839206207861596, + "ewc_loss": 0.07793232798576355, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040578815969638526, + "grad_norm": 9.015070915222168, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8589441180229187, + "num_tokens": 715115628.0, + "step": 18740 + }, + { + "epoch": 2.38404783106475, + "ewc_loss": 0.07795912027359009, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004060560022480786, + "grad_norm": 8.974416732788086, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8730485439300537, + "num_tokens": 715153294.0, + "step": 18741 + }, + { + "epoch": 2.3841750413433407, + "ewc_loss": 0.078004390001297, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004065087123308331, + "grad_norm": 9.039131164550781, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8656290769577026, + "num_tokens": 715193736.0, + "step": 18742 + }, + { + "epoch": 2.384302251621931, + "ewc_loss": 0.07756201177835464, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004045263631269336, + "grad_norm": 8.949549674987793, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8660835027694702, + "num_tokens": 715234539.0, + "step": 18743 + }, + { + "epoch": 2.3844294619005217, + "ewc_loss": 0.07810194790363312, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040748436003923416, + "grad_norm": 9.048870086669922, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8774268627166748, + "num_tokens": 715271567.0, + "step": 18744 + }, + { + "epoch": 2.3845566721791123, + "ewc_loss": 0.07749031484127045, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004038094193674624, + "grad_norm": 8.948749542236328, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8775726556777954, + "num_tokens": 715314850.0, + "step": 18745 + }, + { + "epoch": 2.3846838824577024, + "ewc_loss": 0.07811106741428375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040757551323622465, + "grad_norm": 9.050496101379395, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8705567121505737, + "num_tokens": 715353353.0, + "step": 18746 + }, + { + "epoch": 2.3848110927362933, + "ewc_loss": 0.07774820178747177, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004039468476548791, + "grad_norm": 9.006542205810547, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8746975660324097, + "num_tokens": 715387843.0, + "step": 18747 + }, + { + "epoch": 2.3849383030148834, + "ewc_loss": 0.07817043364048004, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000408169231377542, + "grad_norm": 9.053861618041992, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8514683246612549, + "num_tokens": 715428037.0, + "step": 18748 + }, + { + "epoch": 2.385065513293474, + "ewc_loss": 0.07766016572713852, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004030665149912238, + "grad_norm": 8.98835277557373, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8791938424110413, + "num_tokens": 715464403.0, + "step": 18749 + }, + { + "epoch": 2.3851927235720645, + "ewc_loss": 0.07800076901912689, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004064725653734058, + "grad_norm": 9.002699851989746, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8751757144927979, + "num_tokens": 715501820.0, + "step": 18750 + }, + { + "epoch": 2.385319933850655, + "ewc_loss": 0.07779605686664581, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004044253728352487, + "grad_norm": 8.951983451843262, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8761026859283447, + "num_tokens": 715543287.0, + "step": 18751 + }, + { + "epoch": 2.3854471441292455, + "ewc_loss": 0.07808889448642731, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040735380025580525, + "grad_norm": 9.085891723632812, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8753494024276733, + "num_tokens": 715578189.0, + "step": 18752 + }, + { + "epoch": 2.385574354407836, + "ewc_loss": 0.07738181948661804, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004027244867756963, + "grad_norm": 9.012948036193848, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8554556369781494, + "num_tokens": 715616447.0, + "step": 18753 + }, + { + "epoch": 2.3857015646864266, + "ewc_loss": 0.07779906690120697, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000406896899221465, + "grad_norm": 9.12404727935791, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8700896501541138, + "num_tokens": 715651692.0, + "step": 18754 + }, + { + "epoch": 2.385828774965017, + "ewc_loss": 0.07752706110477448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040173542220145464, + "grad_norm": 9.029807090759277, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8558465242385864, + "num_tokens": 715687733.0, + "step": 18755 + }, + { + "epoch": 2.3859559852436076, + "ewc_loss": 0.07789995521306992, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040546440868638456, + "grad_norm": 9.045509338378906, + "learning_rate": 1e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8422293066978455, + "num_tokens": 715727481.0, + "step": 18756 + }, + { + "epoch": 2.386083195522198, + "ewc_loss": 0.07728594541549683, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004017656610812992, + "grad_norm": 8.996650695800781, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8481221199035645, + "num_tokens": 715767549.0, + "step": 18757 + }, + { + "epoch": 2.3862104058007887, + "ewc_loss": 0.07778838276863098, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004043486842419952, + "grad_norm": 9.014165878295898, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8568135499954224, + "num_tokens": 715807251.0, + "step": 18758 + }, + { + "epoch": 2.386337616079379, + "ewc_loss": 0.07775912433862686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040405610343441367, + "grad_norm": 9.00170612335205, + "learning_rate": 1e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8492188453674316, + "num_tokens": 715846893.0, + "step": 18759 + }, + { + "epoch": 2.3864648263579697, + "ewc_loss": 0.0774947851896286, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040385412285104394, + "grad_norm": 9.054789543151855, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8508095741271973, + "num_tokens": 715883611.0, + "step": 18760 + }, + { + "epoch": 2.3865920366365603, + "ewc_loss": 0.07751394808292389, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004016043385490775, + "grad_norm": 8.97495174407959, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8672705888748169, + "num_tokens": 715915598.0, + "step": 18761 + }, + { + "epoch": 2.386719246915151, + "ewc_loss": 0.07780696451663971, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000404534483095631, + "grad_norm": 8.993538856506348, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8725020885467529, + "num_tokens": 715956647.0, + "step": 18762 + }, + { + "epoch": 2.3868464571937413, + "ewc_loss": 0.07758773118257523, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040234214975498617, + "grad_norm": 8.960089683532715, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8717244267463684, + "num_tokens": 715996627.0, + "step": 18763 + }, + { + "epoch": 2.386973667472332, + "ewc_loss": 0.07772120833396912, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004036769678350538, + "grad_norm": 8.993642807006836, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8859231472015381, + "num_tokens": 716027823.0, + "step": 18764 + }, + { + "epoch": 2.3871008777509224, + "ewc_loss": 0.07762075960636139, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040267244912683964, + "grad_norm": 8.966806411743164, + "learning_rate": 1e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8508754968643188, + "num_tokens": 716067430.0, + "step": 18765 + }, + { + "epoch": 2.387228088029513, + "ewc_loss": 0.07787515223026276, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040521641494706273, + "grad_norm": 9.055048942565918, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8576862215995789, + "num_tokens": 716109436.0, + "step": 18766 + }, + { + "epoch": 2.3873552983081034, + "ewc_loss": 0.07741245627403259, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040058940066955984, + "grad_norm": 8.978571891784668, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8604956865310669, + "num_tokens": 716148157.0, + "step": 18767 + }, + { + "epoch": 2.387482508586694, + "ewc_loss": 0.07788492739200592, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040531408740207553, + "grad_norm": 9.074752807617188, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8683440685272217, + "num_tokens": 716184662.0, + "step": 18768 + }, + { + "epoch": 2.3876097188652845, + "ewc_loss": 0.07745733112096786, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004010381526313722, + "grad_norm": 8.964717864990234, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8583261966705322, + "num_tokens": 716227382.0, + "step": 18769 + }, + { + "epoch": 2.387736929143875, + "ewc_loss": 0.07787918299436569, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004052566655445844, + "grad_norm": 9.037463188171387, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8689618110656738, + "num_tokens": 716262236.0, + "step": 18770 + }, + { + "epoch": 2.387864139422465, + "ewc_loss": 0.07763838768005371, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040284876013174653, + "grad_norm": 9.0625581741333, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8747227191925049, + "num_tokens": 716295541.0, + "step": 18771 + }, + { + "epoch": 2.387991349701056, + "ewc_loss": 0.07753408700227737, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040180570795200765, + "grad_norm": 9.014363288879395, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8678973913192749, + "num_tokens": 716331796.0, + "step": 18772 + }, + { + "epoch": 2.388118559979646, + "ewc_loss": 0.0776471495628357, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040293633355759084, + "grad_norm": 9.011770248413086, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8553903102874756, + "num_tokens": 716369539.0, + "step": 18773 + }, + { + "epoch": 2.3882457702582367, + "ewc_loss": 0.07753177732229233, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000401782599510625, + "grad_norm": 9.055448532104492, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8817950487136841, + "num_tokens": 716409806.0, + "step": 18774 + }, + { + "epoch": 2.3883729805368272, + "ewc_loss": 0.07741692662239075, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004006341041531414, + "grad_norm": 8.949455261230469, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8578431010246277, + "num_tokens": 716448191.0, + "step": 18775 + }, + { + "epoch": 2.3885001908154178, + "ewc_loss": 0.07787235081195831, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004051883006468415, + "grad_norm": 9.03397274017334, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8664665222167969, + "num_tokens": 716486522.0, + "step": 18776 + }, + { + "epoch": 2.3886274010940083, + "ewc_loss": 0.07757885754108429, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004022534121759236, + "grad_norm": 8.994912147521973, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8644776344299316, + "num_tokens": 716519383.0, + "step": 18777 + }, + { + "epoch": 2.388754611372599, + "ewc_loss": 0.07784684747457504, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004049333219882101, + "grad_norm": 9.121512413024902, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8677269220352173, + "num_tokens": 716556136.0, + "step": 18778 + }, + { + "epoch": 2.3888818216511893, + "ewc_loss": 0.07750275731086731, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004014924052171409, + "grad_norm": 9.034695625305176, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8577026724815369, + "num_tokens": 716592346.0, + "step": 18779 + }, + { + "epoch": 2.38900903192978, + "ewc_loss": 0.07771401107311249, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040360496495850384, + "grad_norm": 9.047825813293457, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8584768772125244, + "num_tokens": 716633996.0, + "step": 18780 + }, + { + "epoch": 2.3891362422083704, + "ewc_loss": 0.07750725746154785, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040153737063519657, + "grad_norm": 9.009382247924805, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8602631092071533, + "num_tokens": 716669638.0, + "step": 18781 + }, + { + "epoch": 2.389263452486961, + "ewc_loss": 0.07777196168899536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040418445132672787, + "grad_norm": 9.049860000610352, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8556865453720093, + "num_tokens": 716711194.0, + "step": 18782 + }, + { + "epoch": 2.3893906627655515, + "ewc_loss": 0.07753870636224747, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040185192483477294, + "grad_norm": 9.012232780456543, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8711310029029846, + "num_tokens": 716753402.0, + "step": 18783 + }, + { + "epoch": 2.389517873044142, + "ewc_loss": 0.07766690850257874, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040313394856639206, + "grad_norm": 9.028697967529297, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.867411732673645, + "num_tokens": 716794638.0, + "step": 18784 + }, + { + "epoch": 2.3896450833227325, + "ewc_loss": 0.07770110666751862, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040347594767808914, + "grad_norm": 9.02783489227295, + "learning_rate": 1e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8357344269752502, + "num_tokens": 716834003.0, + "step": 18785 + }, + { + "epoch": 2.389772293601323, + "ewc_loss": 0.07765550911426544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004030199779663235, + "grad_norm": 9.021384239196777, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8706204295158386, + "num_tokens": 716867349.0, + "step": 18786 + }, + { + "epoch": 2.3898995038799136, + "ewc_loss": 0.07767962664365768, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040326110320165753, + "grad_norm": 9.054821968078613, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8639933466911316, + "num_tokens": 716904271.0, + "step": 18787 + }, + { + "epoch": 2.390026714158504, + "ewc_loss": 0.07776494324207306, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004041142528876662, + "grad_norm": 9.091692924499512, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.866517186164856, + "num_tokens": 716936165.0, + "step": 18788 + }, + { + "epoch": 2.3901539244370946, + "ewc_loss": 0.07757669687271118, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004022317589260638, + "grad_norm": 9.052988052368164, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8628742694854736, + "num_tokens": 716974011.0, + "step": 18789 + }, + { + "epoch": 2.390281134715685, + "ewc_loss": 0.07761810719966888, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000402645964641124, + "grad_norm": 8.998431205749512, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8653681874275208, + "num_tokens": 717012438.0, + "step": 18790 + }, + { + "epoch": 2.3904083449942757, + "ewc_loss": 0.07784625887870789, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004049274721182883, + "grad_norm": 9.075098991394043, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8564407825469971, + "num_tokens": 717051755.0, + "step": 18791 + }, + { + "epoch": 2.390535555272866, + "ewc_loss": 0.07750418037176132, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004015066660940647, + "grad_norm": 9.010554313659668, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8653533458709717, + "num_tokens": 717090740.0, + "step": 18792 + }, + { + "epoch": 2.3906627655514567, + "ewc_loss": 0.0777282863855362, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004037477192468941, + "grad_norm": 9.047258377075195, + "learning_rate": 1e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8379147052764893, + "num_tokens": 717132033.0, + "step": 18793 + }, + { + "epoch": 2.390789975830047, + "ewc_loss": 0.07759968191385269, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004024616500828415, + "grad_norm": 9.05838394165039, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8718019127845764, + "num_tokens": 717168255.0, + "step": 18794 + }, + { + "epoch": 2.390917186108638, + "ewc_loss": 0.07748699188232422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004013347497675568, + "grad_norm": 9.021736145019531, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8613436818122864, + "num_tokens": 717206863.0, + "step": 18795 + }, + { + "epoch": 2.391044396387228, + "ewc_loss": 0.0775977373123169, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004024422087240964, + "grad_norm": 9.026606559753418, + "learning_rate": 1e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.842681884765625, + "num_tokens": 717245374.0, + "step": 18796 + }, + { + "epoch": 2.3911716066658184, + "ewc_loss": 0.07753753662109375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004018401959910989, + "grad_norm": 8.998464584350586, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8834607601165771, + "num_tokens": 717279821.0, + "step": 18797 + }, + { + "epoch": 2.391298816944409, + "ewc_loss": 0.07738342136144638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040029906085692346, + "grad_norm": 9.019981384277344, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8566998243331909, + "num_tokens": 717318896.0, + "step": 18798 + }, + { + "epoch": 2.3914260272229995, + "ewc_loss": 0.07741035521030426, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040056841680780053, + "grad_norm": 9.013028144836426, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8554071187973022, + "num_tokens": 717361493.0, + "step": 18799 + }, + { + "epoch": 2.39155323750159, + "ewc_loss": 0.07755870372056961, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004020518681500107, + "grad_norm": 9.071643829345703, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8546735048294067, + "num_tokens": 717402801.0, + "step": 18800 + }, + { + "epoch": 2.3916804477801805, + "ewc_loss": 0.07732850313186646, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039974990068003535, + "grad_norm": 8.965664863586426, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8826158046722412, + "num_tokens": 717441275.0, + "step": 18801 + }, + { + "epoch": 2.391807658058771, + "ewc_loss": 0.077565997838974, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004045662353746593, + "grad_norm": 9.1300048828125, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8524783849716187, + "num_tokens": 717477423.0, + "step": 18802 + }, + { + "epoch": 2.3919348683373616, + "ewc_loss": 0.07691489160060883, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003980551555287093, + "grad_norm": 9.013899803161621, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8579549789428711, + "num_tokens": 717512978.0, + "step": 18803 + }, + { + "epoch": 2.392062078615952, + "ewc_loss": 0.07750825583934784, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040398884448222816, + "grad_norm": 9.067821502685547, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8840516209602356, + "num_tokens": 717549710.0, + "step": 18804 + }, + { + "epoch": 2.3921892888945426, + "ewc_loss": 0.07715029269456863, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004004091606475413, + "grad_norm": 9.025092124938965, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8611401319503784, + "num_tokens": 717586588.0, + "step": 18805 + }, + { + "epoch": 2.392316499173133, + "ewc_loss": 0.07752850651741028, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004017499159090221, + "grad_norm": 9.031160354614258, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8769564628601074, + "num_tokens": 717621507.0, + "step": 18806 + }, + { + "epoch": 2.3924437094517237, + "ewc_loss": 0.07750298082828522, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004014946171082556, + "grad_norm": 9.036275863647461, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8644427061080933, + "num_tokens": 717655915.0, + "step": 18807 + }, + { + "epoch": 2.392570919730314, + "ewc_loss": 0.07750442624092102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040150913991965353, + "grad_norm": 9.04192066192627, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8618502616882324, + "num_tokens": 717696357.0, + "step": 18808 + }, + { + "epoch": 2.3926981300089047, + "ewc_loss": 0.07750611007213593, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004015259619336575, + "grad_norm": 9.017426490783691, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8773349523544312, + "num_tokens": 717735397.0, + "step": 18809 + }, + { + "epoch": 2.3928253402874953, + "ewc_loss": 0.0776410698890686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004028755647595972, + "grad_norm": 9.039974212646484, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8690314888954163, + "num_tokens": 717770081.0, + "step": 18810 + }, + { + "epoch": 2.392952550566086, + "ewc_loss": 0.07762476056814194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040271246689371765, + "grad_norm": 9.053763389587402, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8801857233047485, + "num_tokens": 717805262.0, + "step": 18811 + }, + { + "epoch": 2.3930797608446763, + "ewc_loss": 0.07751216739416122, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000401586527004838, + "grad_norm": 9.020956993103027, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8640628457069397, + "num_tokens": 717841877.0, + "step": 18812 + }, + { + "epoch": 2.393206971123267, + "ewc_loss": 0.07761512696743011, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004026161623187363, + "grad_norm": 9.000507354736328, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8486709594726562, + "num_tokens": 717881435.0, + "step": 18813 + }, + { + "epoch": 2.3933341814018574, + "ewc_loss": 0.07756027579307556, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004020675551146269, + "grad_norm": 8.999484062194824, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8845662474632263, + "num_tokens": 717922240.0, + "step": 18814 + }, + { + "epoch": 2.393461391680448, + "ewc_loss": 0.0777236819267273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040370161877945065, + "grad_norm": 9.059093475341797, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8628449440002441, + "num_tokens": 717958438.0, + "step": 18815 + }, + { + "epoch": 2.3935886019590384, + "ewc_loss": 0.07753439247608185, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004018088220618665, + "grad_norm": 9.040971755981445, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8565850257873535, + "num_tokens": 717994451.0, + "step": 18816 + }, + { + "epoch": 2.393715812237629, + "ewc_loss": 0.07767298817634583, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040319477557204664, + "grad_norm": 9.034330368041992, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8679937124252319, + "num_tokens": 718034318.0, + "step": 18817 + }, + { + "epoch": 2.3938430225162195, + "ewc_loss": 0.07753641903400421, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040182904922403395, + "grad_norm": 8.97596549987793, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8654557466506958, + "num_tokens": 718074994.0, + "step": 18818 + }, + { + "epoch": 2.3939702327948096, + "ewc_loss": 0.07776786386966705, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004041435313411057, + "grad_norm": 9.056655883789062, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8612513542175293, + "num_tokens": 718118849.0, + "step": 18819 + }, + { + "epoch": 2.3940974430734006, + "ewc_loss": 0.07753262668848038, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040179109782911837, + "grad_norm": 9.05643367767334, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8711228966712952, + "num_tokens": 718152874.0, + "step": 18820 + }, + { + "epoch": 2.3942246533519906, + "ewc_loss": 0.07770860940217972, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004035509191453457, + "grad_norm": 9.097077369689941, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8576309680938721, + "num_tokens": 718194206.0, + "step": 18821 + }, + { + "epoch": 2.394351863630581, + "ewc_loss": 0.07743567228317261, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040082153282128274, + "grad_norm": 9.035850524902344, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8691976070404053, + "num_tokens": 718227408.0, + "step": 18822 + }, + { + "epoch": 2.3944790739091717, + "ewc_loss": 0.07767455279827118, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040321031701751053, + "grad_norm": 9.055216789245605, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8586850762367249, + "num_tokens": 718266962.0, + "step": 18823 + }, + { + "epoch": 2.3946062841877622, + "ewc_loss": 0.07763023674488068, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004027672403026372, + "grad_norm": 9.05851936340332, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8699820041656494, + "num_tokens": 718307873.0, + "step": 18824 + }, + { + "epoch": 2.3947334944663528, + "ewc_loss": 0.07767795026302338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040324433939531446, + "grad_norm": 8.970656394958496, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.870238721370697, + "num_tokens": 718350296.0, + "step": 18825 + }, + { + "epoch": 2.3948607047449433, + "ewc_loss": 0.07781843841075897, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004046492394991219, + "grad_norm": 9.104127883911133, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.866805911064148, + "num_tokens": 718390734.0, + "step": 18826 + }, + { + "epoch": 2.394987915023534, + "ewc_loss": 0.07744256407022476, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040089047979563475, + "grad_norm": 8.98738956451416, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8809112906455994, + "num_tokens": 718435630.0, + "step": 18827 + }, + { + "epoch": 2.3951151253021243, + "ewc_loss": 0.07798926532268524, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004063575470354408, + "grad_norm": 9.077672004699707, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8712098598480225, + "num_tokens": 718474791.0, + "step": 18828 + }, + { + "epoch": 2.395242335580715, + "ewc_loss": 0.07753700762987137, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004018349281977862, + "grad_norm": 9.043869972229004, + "learning_rate": 1e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8463735580444336, + "num_tokens": 718514957.0, + "step": 18829 + }, + { + "epoch": 2.3953695458593054, + "ewc_loss": 0.0778365284204483, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040483009070158005, + "grad_norm": 9.121920585632324, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8768181800842285, + "num_tokens": 718551761.0, + "step": 18830 + }, + { + "epoch": 2.395496756137896, + "ewc_loss": 0.07768726348876953, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040333750075660646, + "grad_norm": 9.027647018432617, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8636463284492493, + "num_tokens": 718586445.0, + "step": 18831 + }, + { + "epoch": 2.3956239664164865, + "ewc_loss": 0.07785272598266602, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040499205351807177, + "grad_norm": 9.086348533630371, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8597384691238403, + "num_tokens": 718631105.0, + "step": 18832 + }, + { + "epoch": 2.395751176695077, + "ewc_loss": 0.07755189388990402, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004019837942905724, + "grad_norm": 9.009759902954102, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8615198731422424, + "num_tokens": 718672278.0, + "step": 18833 + }, + { + "epoch": 2.3958783869736675, + "ewc_loss": 0.0779462531208992, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004059273924212903, + "grad_norm": 9.078598022460938, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8700177669525146, + "num_tokens": 718713191.0, + "step": 18834 + }, + { + "epoch": 2.396005597252258, + "ewc_loss": 0.07758767157793045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040234156767837703, + "grad_norm": 9.036564826965332, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8587801456451416, + "num_tokens": 718745012.0, + "step": 18835 + }, + { + "epoch": 2.3961328075308486, + "ewc_loss": 0.07779766619205475, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004044415254611522, + "grad_norm": 9.033382415771484, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.865652322769165, + "num_tokens": 718780149.0, + "step": 18836 + }, + { + "epoch": 2.396260017809439, + "ewc_loss": 0.07789359986782074, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004054008750244975, + "grad_norm": 9.175544738769531, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8597360849380493, + "num_tokens": 718812831.0, + "step": 18837 + }, + { + "epoch": 2.3963872280880296, + "ewc_loss": 0.07743990421295166, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008638788945973, + "grad_norm": 9.010910987854004, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8731284141540527, + "num_tokens": 718847650.0, + "step": 18838 + }, + { + "epoch": 2.39651443836662, + "ewc_loss": 0.07813375443220139, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004078023775946349, + "grad_norm": 9.115242958068848, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8655955195426941, + "num_tokens": 718883417.0, + "step": 18839 + }, + { + "epoch": 2.3966416486452107, + "ewc_loss": 0.07739472389221191, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004004120419267565, + "grad_norm": 9.023796081542969, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8901455402374268, + "num_tokens": 718915501.0, + "step": 18840 + }, + { + "epoch": 2.396768858923801, + "ewc_loss": 0.07811163365840912, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040758121758699417, + "grad_norm": 9.1314697265625, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8799740076065063, + "num_tokens": 718954485.0, + "step": 18841 + }, + { + "epoch": 2.3968960692023917, + "ewc_loss": 0.07743984460830688, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004008632677141577, + "grad_norm": 9.016887664794922, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.849534273147583, + "num_tokens": 718995815.0, + "step": 18842 + }, + { + "epoch": 2.3970232794809823, + "ewc_loss": 0.07783134281635284, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004047783149871975, + "grad_norm": 9.041646003723145, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8801100850105286, + "num_tokens": 719032820.0, + "step": 18843 + }, + { + "epoch": 2.3971504897595723, + "ewc_loss": 0.07769445329904556, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040340935811400414, + "grad_norm": 9.040339469909668, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.863238513469696, + "num_tokens": 719066966.0, + "step": 18844 + }, + { + "epoch": 2.3972777000381633, + "ewc_loss": 0.07772901654243469, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040375496610067785, + "grad_norm": 9.067760467529297, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8639030456542969, + "num_tokens": 719108768.0, + "step": 18845 + }, + { + "epoch": 2.3974049103167534, + "ewc_loss": 0.0776519924402237, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004029847332276404, + "grad_norm": 8.98064136505127, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8624255657196045, + "num_tokens": 719151451.0, + "step": 18846 + }, + { + "epoch": 2.397532120595344, + "ewc_loss": 0.07799085974693298, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040637346683070064, + "grad_norm": 9.125978469848633, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8570476770401001, + "num_tokens": 719192103.0, + "step": 18847 + }, + { + "epoch": 2.3976593308739345, + "ewc_loss": 0.07737457007169724, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040021055610850453, + "grad_norm": 9.030428886413574, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8616843223571777, + "num_tokens": 719226514.0, + "step": 18848 + }, + { + "epoch": 2.397786541152525, + "ewc_loss": 0.0779125839471817, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004055906902067363, + "grad_norm": 9.058566093444824, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8891220092773438, + "num_tokens": 719262944.0, + "step": 18849 + }, + { + "epoch": 2.3979137514311155, + "ewc_loss": 0.07770532369613647, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004035180318169296, + "grad_norm": 9.020511627197266, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8709662556648254, + "num_tokens": 719306707.0, + "step": 18850 + }, + { + "epoch": 2.398040961709706, + "ewc_loss": 0.0777779370546341, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040424420149065554, + "grad_norm": 9.011861801147461, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8714592456817627, + "num_tokens": 719347523.0, + "step": 18851 + }, + { + "epoch": 2.3981681719882966, + "ewc_loss": 0.07786457240581512, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040511053521186113, + "grad_norm": 9.038230895996094, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8849129676818848, + "num_tokens": 719379474.0, + "step": 18852 + }, + { + "epoch": 2.398295382266887, + "ewc_loss": 0.07773081958293915, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040377306868322194, + "grad_norm": 8.995831489562988, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8488708734512329, + "num_tokens": 719421624.0, + "step": 18853 + }, + { + "epoch": 2.3984225925454776, + "ewc_loss": 0.07789073884487152, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004053722368553281, + "grad_norm": 9.051263809204102, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8614968061447144, + "num_tokens": 719455570.0, + "step": 18854 + }, + { + "epoch": 2.398549802824068, + "ewc_loss": 0.07757960259914398, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004022609209641814, + "grad_norm": 8.909016609191895, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8823376893997192, + "num_tokens": 719500300.0, + "step": 18855 + }, + { + "epoch": 2.3986770131026587, + "ewc_loss": 0.07813788950443268, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040784376324154437, + "grad_norm": 9.141511917114258, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8507670164108276, + "num_tokens": 719538119.0, + "step": 18856 + }, + { + "epoch": 2.398804223381249, + "ewc_loss": 0.07747554779052734, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040122028440237045, + "grad_norm": 8.901551246643066, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8549798727035522, + "num_tokens": 719574012.0, + "step": 18857 + }, + { + "epoch": 2.3989314336598397, + "ewc_loss": 0.07806265354156494, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040953283314593136, + "grad_norm": 9.148527145385742, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8779164552688599, + "num_tokens": 719609548.0, + "step": 18858 + }, + { + "epoch": 2.3990586439384303, + "ewc_loss": 0.07738681137561798, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000400332995923236, + "grad_norm": 8.924120903015137, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8592672348022461, + "num_tokens": 719647280.0, + "step": 18859 + }, + { + "epoch": 2.399185854217021, + "ewc_loss": 0.07802575826644897, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004091638547834009, + "grad_norm": 9.079497337341309, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8806008100509644, + "num_tokens": 719686987.0, + "step": 18860 + }, + { + "epoch": 2.3993130644956113, + "ewc_loss": 0.07737912982702255, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004026975657325238, + "grad_norm": 8.919681549072266, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8836613893508911, + "num_tokens": 719727974.0, + "step": 18861 + }, + { + "epoch": 2.399440274774202, + "ewc_loss": 0.07808194309473038, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004097256751265377, + "grad_norm": 9.142476081848145, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8642429113388062, + "num_tokens": 719764308.0, + "step": 18862 + }, + { + "epoch": 2.3995674850527924, + "ewc_loss": 0.07728922367095947, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040179843199439347, + "grad_norm": 8.9751615524292, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8537518382072449, + "num_tokens": 719801872.0, + "step": 18863 + }, + { + "epoch": 2.399694695331383, + "ewc_loss": 0.0781162828207016, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041006910032592714, + "grad_norm": 9.066941261291504, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8673961162567139, + "num_tokens": 719842504.0, + "step": 18864 + }, + { + "epoch": 2.3998219056099734, + "ewc_loss": 0.07745131850242615, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004034194571431726, + "grad_norm": 8.994081497192383, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8606650829315186, + "num_tokens": 719878885.0, + "step": 18865 + }, + { + "epoch": 2.399949115888564, + "ewc_loss": 0.0781538188457489, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040800304850563407, + "grad_norm": 9.043988227844238, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8495829701423645, + "num_tokens": 719922891.0, + "step": 18866 + }, + { + "epoch": 2.4000763261671545, + "ewc_loss": 0.07768881320953369, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004057943297084421, + "grad_norm": 9.052225112915039, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8627011179924011, + "num_tokens": 719960113.0, + "step": 18867 + }, + { + "epoch": 2.400203536445745, + "ewc_loss": 0.07791143655776978, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040557916508987546, + "grad_norm": 8.977620124816895, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8645711541175842, + "num_tokens": 720006079.0, + "step": 18868 + }, + { + "epoch": 2.400330746724335, + "ewc_loss": 0.07817178964614868, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040818273555487394, + "grad_norm": 9.115280151367188, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8518460392951965, + "num_tokens": 720042219.0, + "step": 18869 + }, + { + "epoch": 2.400457957002926, + "ewc_loss": 0.07744729518890381, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040337920654565096, + "grad_norm": 8.963930130004883, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8664988875389099, + "num_tokens": 720082574.0, + "step": 18870 + }, + { + "epoch": 2.400585167281516, + "ewc_loss": 0.07833583652973175, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004098232602700591, + "grad_norm": 9.134838104248047, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.861854076385498, + "num_tokens": 720115583.0, + "step": 18871 + }, + { + "epoch": 2.4007123775601067, + "ewc_loss": 0.07747884094715118, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040369469206780195, + "grad_norm": 8.979487419128418, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.855146050453186, + "num_tokens": 720151350.0, + "step": 18872 + }, + { + "epoch": 2.400839587838697, + "ewc_loss": 0.07816055417060852, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004105117986910045, + "grad_norm": 9.10732650756836, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8564134836196899, + "num_tokens": 720187072.0, + "step": 18873 + }, + { + "epoch": 2.4009667981172877, + "ewc_loss": 0.07743217051029205, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004032279539387673, + "grad_norm": 8.984188079833984, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8680052757263184, + "num_tokens": 720222249.0, + "step": 18874 + }, + { + "epoch": 2.4010940083958783, + "ewc_loss": 0.07816681265830994, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004105744301341474, + "grad_norm": 9.23978328704834, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8639183044433594, + "num_tokens": 720264682.0, + "step": 18875 + }, + { + "epoch": 2.401221218674469, + "ewc_loss": 0.07731518149375916, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003996166051365435, + "grad_norm": 8.915621757507324, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8620216846466064, + "num_tokens": 720309935.0, + "step": 18876 + }, + { + "epoch": 2.4013484289530593, + "ewc_loss": 0.07866233587265015, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004130882443860173, + "grad_norm": 9.219914436340332, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8584873080253601, + "num_tokens": 720343992.0, + "step": 18877 + }, + { + "epoch": 2.40147563923165, + "ewc_loss": 0.07702624797821045, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00039916872628964484, + "grad_norm": 8.885014533996582, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.863763689994812, + "num_tokens": 720379707.0, + "step": 18878 + }, + { + "epoch": 2.4016028495102404, + "ewc_loss": 0.07859644293785095, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004148706793785095, + "grad_norm": 9.30013370513916, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8613521456718445, + "num_tokens": 720419796.0, + "step": 18879 + }, + { + "epoch": 2.401730059788831, + "ewc_loss": 0.07707560062408447, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003972208360210061, + "grad_norm": 8.938861846923828, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8671373128890991, + "num_tokens": 720460919.0, + "step": 18880 + }, + { + "epoch": 2.4018572700674214, + "ewc_loss": 0.07889833301305771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041544815758243203, + "grad_norm": 9.249951362609863, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8613646030426025, + "num_tokens": 720498403.0, + "step": 18881 + }, + { + "epoch": 2.401984480346012, + "ewc_loss": 0.07725702226161957, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003990350232925266, + "grad_norm": 8.916988372802734, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8650637865066528, + "num_tokens": 720534931.0, + "step": 18882 + }, + { + "epoch": 2.4021116906246025, + "ewc_loss": 0.07878522574901581, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004143171536270529, + "grad_norm": 9.300604820251465, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8698692321777344, + "num_tokens": 720574062.0, + "step": 18883 + }, + { + "epoch": 2.402238900903193, + "ewc_loss": 0.07734008133411407, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003998656466137618, + "grad_norm": 8.969324111938477, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8578943014144897, + "num_tokens": 720611552.0, + "step": 18884 + }, + { + "epoch": 2.4023661111817836, + "ewc_loss": 0.07861797511577606, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041264461469836533, + "grad_norm": 9.239164352416992, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8635755181312561, + "num_tokens": 720647886.0, + "step": 18885 + }, + { + "epoch": 2.402493321460374, + "ewc_loss": 0.07745005190372467, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040096533484756947, + "grad_norm": 8.95454216003418, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.869784951210022, + "num_tokens": 720688034.0, + "step": 18886 + }, + { + "epoch": 2.4026205317389646, + "ewc_loss": 0.07853849232196808, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004118498181924224, + "grad_norm": 9.24824333190918, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8719210624694824, + "num_tokens": 720727805.0, + "step": 18887 + }, + { + "epoch": 2.402747742017555, + "ewc_loss": 0.07730347663164139, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039949960773810744, + "grad_norm": 8.982667922973633, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.867520272731781, + "num_tokens": 720759717.0, + "step": 18888 + }, + { + "epoch": 2.4028749522961457, + "ewc_loss": 0.07848477363586426, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041131259058602154, + "grad_norm": 9.272160530090332, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8585258722305298, + "num_tokens": 720790435.0, + "step": 18889 + }, + { + "epoch": 2.403002162574736, + "ewc_loss": 0.07740125060081482, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004004773509223014, + "grad_norm": 8.94973373413086, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8616094589233398, + "num_tokens": 720831715.0, + "step": 18890 + }, + { + "epoch": 2.4031293728533267, + "ewc_loss": 0.07850346714258194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041149952448904514, + "grad_norm": 9.263540267944336, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8646895289421082, + "num_tokens": 720869598.0, + "step": 18891 + }, + { + "epoch": 2.403256583131917, + "ewc_loss": 0.07736938446760178, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040015869308263063, + "grad_norm": 8.948929786682129, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8593188524246216, + "num_tokens": 720911122.0, + "step": 18892 + }, + { + "epoch": 2.403383793410508, + "ewc_loss": 0.07960870116949081, + "ewc_loss_diag": 3.8623809814453125e-05, + "ewc_loss_parallel": 0.00041034483001567423, + "grad_norm": 53.7946891784668, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8690715432167053, + "num_tokens": 720952265.0, + "step": 18893 + }, + { + "epoch": 2.403511003689098, + "ewc_loss": 0.1331649273633957, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0009581141057424247, + "grad_norm": 14.889723777770996, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.868817925453186, + "num_tokens": 720991191.0, + "step": 18894 + }, + { + "epoch": 2.4036382139676884, + "ewc_loss": 0.07703559100627899, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003968208038713783, + "grad_norm": 7.937131881713867, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8744955658912659, + "num_tokens": 721032524.0, + "step": 18895 + }, + { + "epoch": 2.403765424246279, + "ewc_loss": 0.11331574618816376, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0007596223149448633, + "grad_norm": 13.672011375427246, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8627105951309204, + "num_tokens": 721072699.0, + "step": 18896 + }, + { + "epoch": 2.4038926345248695, + "ewc_loss": 0.11941536515951157, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0008206184720620513, + "grad_norm": 13.529634475708008, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8777599334716797, + "num_tokens": 721109467.0, + "step": 18897 + }, + { + "epoch": 2.40401984480346, + "ewc_loss": 0.0891139954328537, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005176047561690211, + "grad_norm": 9.593792915344238, + "learning_rate": 1e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8386597633361816, + "num_tokens": 721142108.0, + "step": 18898 + }, + { + "epoch": 2.4041470550820505, + "ewc_loss": 0.09590480476617813, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005855128983967006, + "grad_norm": 11.747732162475586, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8572291135787964, + "num_tokens": 721179875.0, + "step": 18899 + }, + { + "epoch": 2.404274265360641, + "ewc_loss": 0.09894491732120514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0006159140611998737, + "grad_norm": 11.247197151184082, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8616805672645569, + "num_tokens": 721217141.0, + "step": 18900 + }, + { + "epoch": 2.4044014756392316, + "ewc_loss": 0.08660216629505157, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004924865206703544, + "grad_norm": 9.854321479797363, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8583433032035828, + "num_tokens": 721256357.0, + "step": 18901 + }, + { + "epoch": 2.404528685917822, + "ewc_loss": 0.0889902412891388, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005163672612980008, + "grad_norm": 10.656511306762695, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8637241125106812, + "num_tokens": 721292610.0, + "step": 18902 + }, + { + "epoch": 2.4046558961964126, + "ewc_loss": 0.08743727952241898, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005008376319892704, + "grad_norm": 9.9539794921875, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.870830774307251, + "num_tokens": 721329048.0, + "step": 18903 + }, + { + "epoch": 2.404783106475003, + "ewc_loss": 0.08457525819540024, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004722174198832363, + "grad_norm": 9.96126937866211, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8787356615066528, + "num_tokens": 721364249.0, + "step": 18904 + }, + { + "epoch": 2.4049103167535937, + "ewc_loss": 0.08416465669870377, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046811142237856984, + "grad_norm": 9.91664981842041, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8714094758033752, + "num_tokens": 721391154.0, + "step": 18905 + }, + { + "epoch": 2.405037527032184, + "ewc_loss": 0.08288407325744629, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045530559145845473, + "grad_norm": 9.601279258728027, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.875246524810791, + "num_tokens": 721428443.0, + "step": 18906 + }, + { + "epoch": 2.4051647373107747, + "ewc_loss": 0.08140778541564941, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004429840773809701, + "grad_norm": 9.557209014892578, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8544541597366333, + "num_tokens": 721474545.0, + "step": 18907 + }, + { + "epoch": 2.4052919475893653, + "ewc_loss": 0.08119610697031021, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004408673266880214, + "grad_norm": 9.53154468536377, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8640996813774109, + "num_tokens": 721513153.0, + "step": 18908 + }, + { + "epoch": 2.405419157867956, + "ewc_loss": 0.08004187047481537, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00042932495125569403, + "grad_norm": 9.415657997131348, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8628090023994446, + "num_tokens": 721556287.0, + "step": 18909 + }, + { + "epoch": 2.4055463681465463, + "ewc_loss": 0.08019597828388214, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004284245951566845, + "grad_norm": 9.367676734924316, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8714977502822876, + "num_tokens": 721590219.0, + "step": 18910 + }, + { + "epoch": 2.405673578425137, + "ewc_loss": 0.07919332385063171, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00042083949665538967, + "grad_norm": 9.239541053771973, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.863310694694519, + "num_tokens": 721627308.0, + "step": 18911 + }, + { + "epoch": 2.4058007887037274, + "ewc_loss": 0.07956400513648987, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000422104902099818, + "grad_norm": 9.358541488647461, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8599804639816284, + "num_tokens": 721664218.0, + "step": 18912 + }, + { + "epoch": 2.405927998982318, + "ewc_loss": 0.0789245218038559, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004157100338488817, + "grad_norm": 9.181238174438477, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8742654919624329, + "num_tokens": 721703511.0, + "step": 18913 + }, + { + "epoch": 2.4060552092609084, + "ewc_loss": 0.0790134072303772, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041904032696038485, + "grad_norm": 9.212711334228516, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8777567744255066, + "num_tokens": 721741512.0, + "step": 18914 + }, + { + "epoch": 2.406182419539499, + "ewc_loss": 0.07833682000637054, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004122744721826166, + "grad_norm": 9.140852928161621, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.868915319442749, + "num_tokens": 721778443.0, + "step": 18915 + }, + { + "epoch": 2.4063096298180895, + "ewc_loss": 0.07852529734373093, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004141592071391642, + "grad_norm": 9.192228317260742, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.861204206943512, + "num_tokens": 721817226.0, + "step": 18916 + }, + { + "epoch": 2.4064368400966796, + "ewc_loss": 0.07829873263835907, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041189356124959886, + "grad_norm": 9.171082496643066, + "learning_rate": 1e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.849250316619873, + "num_tokens": 721859419.0, + "step": 18917 + }, + { + "epoch": 2.4065640503752705, + "ewc_loss": 0.07807596027851105, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040966583765111864, + "grad_norm": 9.070834159851074, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8771421909332275, + "num_tokens": 721895183.0, + "step": 18918 + }, + { + "epoch": 2.4066912606538606, + "ewc_loss": 0.07815098762512207, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041041613440029323, + "grad_norm": 9.202998161315918, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8501110076904297, + "num_tokens": 721933566.0, + "step": 18919 + }, + { + "epoch": 2.406818470932451, + "ewc_loss": 0.07781078666448593, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040701410034671426, + "grad_norm": 9.080893516540527, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8675756454467773, + "num_tokens": 721968827.0, + "step": 18920 + }, + { + "epoch": 2.4069456812110417, + "ewc_loss": 0.07812260091304779, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004101322847418487, + "grad_norm": 9.140676498413086, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8683173656463623, + "num_tokens": 722004989.0, + "step": 18921 + }, + { + "epoch": 2.407072891489632, + "ewc_loss": 0.07794149219989777, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004083211824763566, + "grad_norm": 9.075631141662598, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8687057495117188, + "num_tokens": 722043681.0, + "step": 18922 + }, + { + "epoch": 2.4072001017682227, + "ewc_loss": 0.07807540893554688, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040966036613099277, + "grad_norm": 9.10318374633789, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8651389479637146, + "num_tokens": 722086541.0, + "step": 18923 + }, + { + "epoch": 2.4073273120468133, + "ewc_loss": 0.07792702317237854, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004081764491274953, + "grad_norm": 9.109386444091797, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.866377592086792, + "num_tokens": 722125519.0, + "step": 18924 + }, + { + "epoch": 2.407454522325404, + "ewc_loss": 0.07810597121715546, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004075246106367558, + "grad_norm": 9.08391284942627, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8606742024421692, + "num_tokens": 722157665.0, + "step": 18925 + }, + { + "epoch": 2.4075817326039943, + "ewc_loss": 0.07799844443798065, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040889065712690353, + "grad_norm": 9.12177562713623, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8777403831481934, + "num_tokens": 722191819.0, + "step": 18926 + }, + { + "epoch": 2.407708942882585, + "ewc_loss": 0.07807955145835876, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004072604060638696, + "grad_norm": 9.056708335876465, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8551967740058899, + "num_tokens": 722229755.0, + "step": 18927 + }, + { + "epoch": 2.4078361531611754, + "ewc_loss": 0.07786855101585388, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004075917531736195, + "grad_norm": 9.166394233703613, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8540112972259521, + "num_tokens": 722267440.0, + "step": 18928 + }, + { + "epoch": 2.407963363439766, + "ewc_loss": 0.07753173261880875, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004042235668748617, + "grad_norm": 8.966569900512695, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8621397018432617, + "num_tokens": 722310102.0, + "step": 18929 + }, + { + "epoch": 2.4080905737183564, + "ewc_loss": 0.07831664383411407, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041207263711839914, + "grad_norm": 9.224628448486328, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8585355281829834, + "num_tokens": 722346713.0, + "step": 18930 + }, + { + "epoch": 2.408217783996947, + "ewc_loss": 0.07730606943368912, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040196694317273796, + "grad_norm": 8.96255874633789, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8598467111587524, + "num_tokens": 722387803.0, + "step": 18931 + }, + { + "epoch": 2.4083449942755375, + "ewc_loss": 0.07849916815757751, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041389797115698457, + "grad_norm": 9.23536491394043, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.856385350227356, + "num_tokens": 722428943.0, + "step": 18932 + }, + { + "epoch": 2.408472204554128, + "ewc_loss": 0.0774218738079071, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040312495548278093, + "grad_norm": 9.027922630310059, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8578163385391235, + "num_tokens": 722467860.0, + "step": 18933 + }, + { + "epoch": 2.4085994148327186, + "ewc_loss": 0.07842947542667389, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004132009926252067, + "grad_norm": 9.2785005569458, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.861352801322937, + "num_tokens": 722503067.0, + "step": 18934 + }, + { + "epoch": 2.408726625111309, + "ewc_loss": 0.07770827412605286, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040354757220484316, + "grad_norm": 9.016998291015625, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8668261170387268, + "num_tokens": 722537842.0, + "step": 18935 + }, + { + "epoch": 2.4088538353898996, + "ewc_loss": 0.07869318127632141, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004133966867811978, + "grad_norm": 9.274845123291016, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.870283842086792, + "num_tokens": 722574036.0, + "step": 18936 + }, + { + "epoch": 2.40898104566849, + "ewc_loss": 0.07759809494018555, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040244575939141214, + "grad_norm": 9.060620307922363, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.864043116569519, + "num_tokens": 722606087.0, + "step": 18937 + }, + { + "epoch": 2.4091082559470807, + "ewc_loss": 0.07846646010875702, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004111294692847878, + "grad_norm": 9.18457317352295, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.863805890083313, + "num_tokens": 722648745.0, + "step": 18938 + }, + { + "epoch": 2.409235466225671, + "ewc_loss": 0.07785612344741821, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004050260758958757, + "grad_norm": 9.103800773620605, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8554301857948303, + "num_tokens": 722688589.0, + "step": 18939 + }, + { + "epoch": 2.4093626765042617, + "ewc_loss": 0.07805975526571274, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040706241270527244, + "grad_norm": 9.13796329498291, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8831909894943237, + "num_tokens": 722723941.0, + "step": 18940 + }, + { + "epoch": 2.4094898867828523, + "ewc_loss": 0.0779990702867508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040645559784024954, + "grad_norm": 9.123472213745117, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8485487699508667, + "num_tokens": 722768242.0, + "step": 18941 + }, + { + "epoch": 2.4096170970614423, + "ewc_loss": 0.07798004150390625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040626528789289296, + "grad_norm": 9.110347747802734, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8646233081817627, + "num_tokens": 722800970.0, + "step": 18942 + }, + { + "epoch": 2.4097443073400333, + "ewc_loss": 0.07812270522117615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040769187035039067, + "grad_norm": 9.147892951965332, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8554859161376953, + "num_tokens": 722839168.0, + "step": 18943 + }, + { + "epoch": 2.4098715176186234, + "ewc_loss": 0.07790960371494293, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040800226270221174, + "grad_norm": 9.084538459777832, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8758901953697205, + "num_tokens": 722883451.0, + "step": 18944 + }, + { + "epoch": 2.409998727897214, + "ewc_loss": 0.0781276524066925, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004077413468621671, + "grad_norm": 9.111515045166016, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8650224208831787, + "num_tokens": 722928581.0, + "step": 18945 + }, + { + "epoch": 2.4101259381758044, + "ewc_loss": 0.07807136327028275, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004071784787811339, + "grad_norm": 9.167679786682129, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8753050565719604, + "num_tokens": 722967141.0, + "step": 18946 + }, + { + "epoch": 2.410253148454395, + "ewc_loss": 0.07811325788497925, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040759745752438903, + "grad_norm": 9.130733489990234, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8598994612693787, + "num_tokens": 723006263.0, + "step": 18947 + }, + { + "epoch": 2.4103803587329855, + "ewc_loss": 0.0781686082482338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040815092506818473, + "grad_norm": 9.233380317687988, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8671941161155701, + "num_tokens": 723050843.0, + "step": 18948 + }, + { + "epoch": 2.410507569011576, + "ewc_loss": 0.0778111070394516, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040457595605403185, + "grad_norm": 8.997909545898438, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8646968007087708, + "num_tokens": 723092124.0, + "step": 18949 + }, + { + "epoch": 2.4106347792901666, + "ewc_loss": 0.07848790287971497, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004113438480999321, + "grad_norm": 9.226900100708008, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8754188418388367, + "num_tokens": 723122478.0, + "step": 18950 + }, + { + "epoch": 2.410761989568757, + "ewc_loss": 0.07759702205657959, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004024350200779736, + "grad_norm": 8.968932151794434, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8582360744476318, + "num_tokens": 723163054.0, + "step": 18951 + }, + { + "epoch": 2.4108891998473476, + "ewc_loss": 0.07852891832590103, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041419544140808284, + "grad_norm": 9.20737361907959, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.86625736951828, + "num_tokens": 723201518.0, + "step": 18952 + }, + { + "epoch": 2.411016410125938, + "ewc_loss": 0.0776624083518982, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040308889583684504, + "grad_norm": 9.007929801940918, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8655190467834473, + "num_tokens": 723235685.0, + "step": 18953 + }, + { + "epoch": 2.4111436204045287, + "ewc_loss": 0.07876278460025787, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041409270488657057, + "grad_norm": 9.252182960510254, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8638529777526855, + "num_tokens": 723266510.0, + "step": 18954 + }, + { + "epoch": 2.411270830683119, + "ewc_loss": 0.07779531180858612, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004044179222546518, + "grad_norm": 8.997008323669434, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8686528205871582, + "num_tokens": 723302800.0, + "step": 18955 + }, + { + "epoch": 2.4113980409617097, + "ewc_loss": 0.07876287400722504, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004140936362091452, + "grad_norm": 9.203425407409668, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8669766783714294, + "num_tokens": 723340669.0, + "step": 18956 + }, + { + "epoch": 2.4115252512403003, + "ewc_loss": 0.07789040356874466, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004053688608109951, + "grad_norm": 8.984733581542969, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8683347702026367, + "num_tokens": 723377556.0, + "step": 18957 + }, + { + "epoch": 2.411652461518891, + "ewc_loss": 0.07889308035373688, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004153956542722881, + "grad_norm": 9.296394348144531, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8691281080245972, + "num_tokens": 723410942.0, + "step": 18958 + }, + { + "epoch": 2.4117796717974813, + "ewc_loss": 0.07766029983758926, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004030678537674248, + "grad_norm": 8.915956497192383, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8758623003959656, + "num_tokens": 723451131.0, + "step": 18959 + }, + { + "epoch": 2.411906882076072, + "ewc_loss": 0.07880507409572601, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041695701656863093, + "grad_norm": 9.373587608337402, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8677959442138672, + "num_tokens": 723485876.0, + "step": 18960 + }, + { + "epoch": 2.4120340923546624, + "ewc_loss": 0.07753446698188782, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000401809491449967, + "grad_norm": 9.000885009765625, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8719397783279419, + "num_tokens": 723521252.0, + "step": 18961 + }, + { + "epoch": 2.412161302633253, + "ewc_loss": 0.07885652035474777, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041747145587578416, + "grad_norm": 9.329931259155273, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.848996102809906, + "num_tokens": 723561094.0, + "step": 18962 + }, + { + "epoch": 2.4122885129118434, + "ewc_loss": 0.07755521684885025, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000402017030864954, + "grad_norm": 8.984872817993164, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8630557656288147, + "num_tokens": 723597281.0, + "step": 18963 + }, + { + "epoch": 2.412415723190434, + "ewc_loss": 0.0786818265914917, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004157245275564492, + "grad_norm": 9.259305953979492, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8875213861465454, + "num_tokens": 723637690.0, + "step": 18964 + }, + { + "epoch": 2.4125429334690245, + "ewc_loss": 0.07747600972652435, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040366637404076755, + "grad_norm": 9.005675315856934, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8643196821212769, + "num_tokens": 723676250.0, + "step": 18965 + }, + { + "epoch": 2.412670143747615, + "ewc_loss": 0.07886086404323578, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041507347486913204, + "grad_norm": 9.310586929321289, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8724713325500488, + "num_tokens": 723713157.0, + "step": 18966 + }, + { + "epoch": 2.412797354026205, + "ewc_loss": 0.07763660699129105, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040283091948367655, + "grad_norm": 9.009696006774902, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8701995611190796, + "num_tokens": 723745877.0, + "step": 18967 + }, + { + "epoch": 2.412924564304796, + "ewc_loss": 0.07888343930244446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041529929148964584, + "grad_norm": 9.20542049407959, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8656512498855591, + "num_tokens": 723784285.0, + "step": 18968 + }, + { + "epoch": 2.413051774583386, + "ewc_loss": 0.07749851793050766, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040389143396168947, + "grad_norm": 9.06485366821289, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8540678024291992, + "num_tokens": 723817085.0, + "step": 18969 + }, + { + "epoch": 2.4131789848619767, + "ewc_loss": 0.07839618623256683, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004104267281945795, + "grad_norm": 9.189678192138672, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8595846891403198, + "num_tokens": 723858466.0, + "step": 18970 + }, + { + "epoch": 2.413306195140567, + "ewc_loss": 0.07780507206916809, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004045155656058341, + "grad_norm": 9.002389907836914, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8641985058784485, + "num_tokens": 723896627.0, + "step": 18971 + }, + { + "epoch": 2.4134334054191577, + "ewc_loss": 0.07849016785621643, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004113665490876883, + "grad_norm": 9.163002967834473, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8593968152999878, + "num_tokens": 723930727.0, + "step": 18972 + }, + { + "epoch": 2.4135606156977483, + "ewc_loss": 0.07790268212556839, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004054916789755225, + "grad_norm": 8.992433547973633, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8536368608474731, + "num_tokens": 723965996.0, + "step": 18973 + }, + { + "epoch": 2.413687825976339, + "ewc_loss": 0.07842682301998138, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004107330460101366, + "grad_norm": 9.159323692321777, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.859075129032135, + "num_tokens": 724002441.0, + "step": 18974 + }, + { + "epoch": 2.4138150362549293, + "ewc_loss": 0.07790227234363556, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040548754623159766, + "grad_norm": 9.014666557312012, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8701109290122986, + "num_tokens": 724039238.0, + "step": 18975 + }, + { + "epoch": 2.41394224653352, + "ewc_loss": 0.07861435413360596, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004126084386371076, + "grad_norm": 9.203398704528809, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8703418374061584, + "num_tokens": 724077430.0, + "step": 18976 + }, + { + "epoch": 2.4140694568121104, + "ewc_loss": 0.07753637433052063, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040427001658827066, + "grad_norm": 8.977214813232422, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8632221817970276, + "num_tokens": 724113175.0, + "step": 18977 + }, + { + "epoch": 2.414196667090701, + "ewc_loss": 0.07832477986812592, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041215401142835617, + "grad_norm": 9.135249137878418, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8593733310699463, + "num_tokens": 724156409.0, + "step": 18978 + }, + { + "epoch": 2.4143238773692914, + "ewc_loss": 0.07782754302024841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040474027628079057, + "grad_norm": 8.955885887145996, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8648670315742493, + "num_tokens": 724199879.0, + "step": 18979 + }, + { + "epoch": 2.414451087647882, + "ewc_loss": 0.07857275009155273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041219231206923723, + "grad_norm": 9.134673118591309, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8796848058700562, + "num_tokens": 724238065.0, + "step": 18980 + }, + { + "epoch": 2.4145782979264725, + "ewc_loss": 0.07753539085388184, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040426015038974583, + "grad_norm": 9.020491600036621, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8568965196609497, + "num_tokens": 724272336.0, + "step": 18981 + }, + { + "epoch": 2.414705508205063, + "ewc_loss": 0.07838886976242065, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041279493598267436, + "grad_norm": 9.184015274047852, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8551331758499146, + "num_tokens": 724305200.0, + "step": 18982 + }, + { + "epoch": 2.4148327184836536, + "ewc_loss": 0.07766993343830109, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040560562047176063, + "grad_norm": 8.962830543518066, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8693150877952576, + "num_tokens": 724341295.0, + "step": 18983 + }, + { + "epoch": 2.414959928762244, + "ewc_loss": 0.07824699580669403, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041137615335173905, + "grad_norm": 9.199613571166992, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8677505254745483, + "num_tokens": 724384336.0, + "step": 18984 + }, + { + "epoch": 2.4150871390408346, + "ewc_loss": 0.07756878435611725, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004045940877404064, + "grad_norm": 9.073542594909668, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8610025644302368, + "num_tokens": 724417279.0, + "step": 18985 + }, + { + "epoch": 2.415214349319425, + "ewc_loss": 0.07850435376167297, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004115084302611649, + "grad_norm": 9.198272705078125, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8646286129951477, + "num_tokens": 724456509.0, + "step": 18986 + }, + { + "epoch": 2.4153415595980157, + "ewc_loss": 0.07769322395324707, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040339710540138185, + "grad_norm": 8.940764427185059, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8757970333099365, + "num_tokens": 724498812.0, + "step": 18987 + }, + { + "epoch": 2.415468769876606, + "ewc_loss": 0.07862850278615952, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041274988325312734, + "grad_norm": 9.175134658813477, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8532819151878357, + "num_tokens": 724538174.0, + "step": 18988 + }, + { + "epoch": 2.4155959801551967, + "ewc_loss": 0.07792733609676361, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040573824662715197, + "grad_norm": 8.975367546081543, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8694340586662292, + "num_tokens": 724577171.0, + "step": 18989 + }, + { + "epoch": 2.415723190433787, + "ewc_loss": 0.07861712574958801, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004126361454837024, + "grad_norm": 9.204219818115234, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8855291604995728, + "num_tokens": 724610626.0, + "step": 18990 + }, + { + "epoch": 2.4158504007123778, + "ewc_loss": 0.07771726697683334, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040363750304095447, + "grad_norm": 8.933283805847168, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8766431212425232, + "num_tokens": 724647613.0, + "step": 18991 + }, + { + "epoch": 2.415977610990968, + "ewc_loss": 0.07878828048706055, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041434765444137156, + "grad_norm": 9.185038566589355, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8666937351226807, + "num_tokens": 724684898.0, + "step": 18992 + }, + { + "epoch": 2.4161048212695584, + "ewc_loss": 0.07773911207914352, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004038559563923627, + "grad_norm": 8.988694190979004, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8629477024078369, + "num_tokens": 724725566.0, + "step": 18993 + }, + { + "epoch": 2.416232031548149, + "ewc_loss": 0.07856288552284241, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004120937373954803, + "grad_norm": 9.16571044921875, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8554252982139587, + "num_tokens": 724763701.0, + "step": 18994 + }, + { + "epoch": 2.4163592418267394, + "ewc_loss": 0.07781688123941422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004046336398459971, + "grad_norm": 8.98335075378418, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8628270030021667, + "num_tokens": 724798108.0, + "step": 18995 + }, + { + "epoch": 2.41648645210533, + "ewc_loss": 0.07848802208900452, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004113450413569808, + "grad_norm": 9.102931022644043, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8434855341911316, + "num_tokens": 724842430.0, + "step": 18996 + }, + { + "epoch": 2.4166136623839205, + "ewc_loss": 0.0781189501285553, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004076543264091015, + "grad_norm": 9.050822257995605, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8602808713912964, + "num_tokens": 724884585.0, + "step": 18997 + }, + { + "epoch": 2.416740872662511, + "ewc_loss": 0.078249990940094, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040896478458307683, + "grad_norm": 9.13275146484375, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8764870166778564, + "num_tokens": 724919338.0, + "step": 18998 + }, + { + "epoch": 2.4168680829411016, + "ewc_loss": 0.07785674929618835, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040503230411559343, + "grad_norm": 8.983052253723145, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8875397443771362, + "num_tokens": 724956007.0, + "step": 18999 + }, + { + "epoch": 2.416995293219692, + "ewc_loss": 0.07855185866355896, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041198343387804925, + "grad_norm": 9.139713287353516, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8811502456665039, + "num_tokens": 724996994.0, + "step": 19000 + }, + { + "epoch": 2.4171225034982826, + "ewc_loss": 0.07783256471157074, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040479048038832843, + "grad_norm": 8.990466117858887, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8581332564353943, + "num_tokens": 725035607.0, + "step": 19001 + }, + { + "epoch": 2.417249713776873, + "ewc_loss": 0.0785415768623352, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041188063914887607, + "grad_norm": 9.127445220947266, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.868807852268219, + "num_tokens": 725075370.0, + "step": 19002 + }, + { + "epoch": 2.4173769240554637, + "ewc_loss": 0.07814077287912369, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040787257603369653, + "grad_norm": 9.12160587310791, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8639893531799316, + "num_tokens": 725119536.0, + "step": 19003 + }, + { + "epoch": 2.417504134334054, + "ewc_loss": 0.07823631912469864, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040882802568376064, + "grad_norm": 9.055573463439941, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8650475144386292, + "num_tokens": 725158048.0, + "step": 19004 + }, + { + "epoch": 2.4176313446126447, + "ewc_loss": 0.07836991548538208, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041016394970938563, + "grad_norm": 9.388025283813477, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8744480013847351, + "num_tokens": 725196343.0, + "step": 19005 + }, + { + "epoch": 2.4177585548912353, + "ewc_loss": 0.0773703008890152, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040016788989305496, + "grad_norm": 8.848063468933105, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8685027360916138, + "num_tokens": 725234613.0, + "step": 19006 + }, + { + "epoch": 2.417885765169826, + "ewc_loss": 0.07909896969795227, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041745457565411925, + "grad_norm": 9.378767967224121, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8634257912635803, + "num_tokens": 725272129.0, + "step": 19007 + }, + { + "epoch": 2.4180129754484163, + "ewc_loss": 0.07709421217441559, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003974069550167769, + "grad_norm": 8.924513816833496, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8559019565582275, + "num_tokens": 725305740.0, + "step": 19008 + }, + { + "epoch": 2.418140185727007, + "ewc_loss": 0.07934773713350296, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041994222556240857, + "grad_norm": 9.453948020935059, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8611919283866882, + "num_tokens": 725340579.0, + "step": 19009 + }, + { + "epoch": 2.4182673960055974, + "ewc_loss": 0.07711319625377655, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039759682840667665, + "grad_norm": 8.808281898498535, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8668555021286011, + "num_tokens": 725380492.0, + "step": 19010 + }, + { + "epoch": 2.418394606284188, + "ewc_loss": 0.07978599518537521, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042432479676790535, + "grad_norm": 9.56871223449707, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8648906946182251, + "num_tokens": 725418043.0, + "step": 19011 + }, + { + "epoch": 2.4185218165627784, + "ewc_loss": 0.07708962261676788, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00039736111648380756, + "grad_norm": 8.883538246154785, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8567527532577515, + "num_tokens": 725456260.0, + "step": 19012 + }, + { + "epoch": 2.418649026841369, + "ewc_loss": 0.07999522984027863, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004264171002432704, + "grad_norm": 9.568465232849121, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8616529703140259, + "num_tokens": 725496605.0, + "step": 19013 + }, + { + "epoch": 2.4187762371199595, + "ewc_loss": 0.07745771110057831, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040104196523316205, + "grad_norm": 8.94942569732666, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8654723167419434, + "num_tokens": 725532395.0, + "step": 19014 + }, + { + "epoch": 2.4189034473985496, + "ewc_loss": 0.07987982779741287, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004252631333656609, + "grad_norm": 9.495229721069336, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8596787452697754, + "num_tokens": 725575335.0, + "step": 19015 + }, + { + "epoch": 2.4190306576771405, + "ewc_loss": 0.07753147184848785, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004017795145045966, + "grad_norm": 8.986811637878418, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8722288608551025, + "num_tokens": 725612345.0, + "step": 19016 + }, + { + "epoch": 2.4191578679557306, + "ewc_loss": 0.07955954968929291, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004220603150315583, + "grad_norm": 9.440289497375488, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8684390187263489, + "num_tokens": 725645566.0, + "step": 19017 + }, + { + "epoch": 2.419285078234321, + "ewc_loss": 0.07771814614534378, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040364632150158286, + "grad_norm": 9.02805233001709, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8597934246063232, + "num_tokens": 725684218.0, + "step": 19018 + }, + { + "epoch": 2.4194122885129117, + "ewc_loss": 0.07923708111047745, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041883563972078264, + "grad_norm": 9.376660346984863, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8704041838645935, + "num_tokens": 725718478.0, + "step": 19019 + }, + { + "epoch": 2.419539498791502, + "ewc_loss": 0.07769101858139038, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004033750738017261, + "grad_norm": 9.00555419921875, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8586252927780151, + "num_tokens": 725756208.0, + "step": 19020 + }, + { + "epoch": 2.4196667090700927, + "ewc_loss": 0.07906673848628998, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041713219252415, + "grad_norm": 9.430632591247559, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8596853613853455, + "num_tokens": 725791884.0, + "step": 19021 + }, + { + "epoch": 2.4197939193486833, + "ewc_loss": 0.07759904861450195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004024552763439715, + "grad_norm": 9.010357856750488, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.869015634059906, + "num_tokens": 725829138.0, + "step": 19022 + }, + { + "epoch": 2.419921129627274, + "ewc_loss": 0.0790109783411026, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004165746213402599, + "grad_norm": 9.287235260009766, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8662552833557129, + "num_tokens": 725868301.0, + "step": 19023 + }, + { + "epoch": 2.4200483399058643, + "ewc_loss": 0.07783103734254837, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040477520087733865, + "grad_norm": 9.074515342712402, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8530576229095459, + "num_tokens": 725906186.0, + "step": 19024 + }, + { + "epoch": 2.420175550184455, + "ewc_loss": 0.07857298851013184, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004121947567909956, + "grad_norm": 9.23555850982666, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8540158271789551, + "num_tokens": 725947006.0, + "step": 19025 + }, + { + "epoch": 2.4203027604630454, + "ewc_loss": 0.07800979912281036, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004065628454554826, + "grad_norm": 9.096953392028809, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8678682446479797, + "num_tokens": 725987363.0, + "step": 19026 + }, + { + "epoch": 2.420429970741636, + "ewc_loss": 0.07847017049789429, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041116654756478965, + "grad_norm": 9.185516357421875, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8551021218299866, + "num_tokens": 726026176.0, + "step": 19027 + }, + { + "epoch": 2.4205571810202264, + "ewc_loss": 0.07787452638149261, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004052101285196841, + "grad_norm": 9.06846809387207, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8633152842521667, + "num_tokens": 726068124.0, + "step": 19028 + }, + { + "epoch": 2.420684391298817, + "ewc_loss": 0.07841748744249344, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041063971002586186, + "grad_norm": 9.1730375289917, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8571617603302002, + "num_tokens": 726110270.0, + "step": 19029 + }, + { + "epoch": 2.4208116015774075, + "ewc_loss": 0.07795918732881546, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040605670074000955, + "grad_norm": 9.107942581176758, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.879867434501648, + "num_tokens": 726143908.0, + "step": 19030 + }, + { + "epoch": 2.420938811855998, + "ewc_loss": 0.0782531350851059, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040899624582380056, + "grad_norm": 9.186071395874023, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8630306720733643, + "num_tokens": 726179082.0, + "step": 19031 + }, + { + "epoch": 2.4210660221345885, + "ewc_loss": 0.07812121510505676, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004076769691891968, + "grad_norm": 9.119485855102539, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8572320342063904, + "num_tokens": 726216387.0, + "step": 19032 + }, + { + "epoch": 2.421193232413179, + "ewc_loss": 0.07812494039535522, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040771422209218144, + "grad_norm": 9.16813850402832, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8748621940612793, + "num_tokens": 726258831.0, + "step": 19033 + }, + { + "epoch": 2.4213204426917696, + "ewc_loss": 0.07789400219917297, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004054048331454396, + "grad_norm": 9.074676513671875, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8666924238204956, + "num_tokens": 726301929.0, + "step": 19034 + }, + { + "epoch": 2.42144765297036, + "ewc_loss": 0.07825256884098053, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004089905123692006, + "grad_norm": 9.178166389465332, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8667699098587036, + "num_tokens": 726339938.0, + "step": 19035 + }, + { + "epoch": 2.4215748632489507, + "ewc_loss": 0.07791534066200256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004056182224303484, + "grad_norm": 9.1549711227417, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8669611215591431, + "num_tokens": 726377197.0, + "step": 19036 + }, + { + "epoch": 2.421702073527541, + "ewc_loss": 0.07808774709701538, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040734236245043576, + "grad_norm": 9.188097953796387, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8686503767967224, + "num_tokens": 726417591.0, + "step": 19037 + }, + { + "epoch": 2.4218292838061317, + "ewc_loss": 0.07779263705015182, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004043912049382925, + "grad_norm": 9.062849998474121, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8549433350563049, + "num_tokens": 726456094.0, + "step": 19038 + }, + { + "epoch": 2.4219564940847222, + "ewc_loss": 0.07829469442367554, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004094117903150618, + "grad_norm": 9.21467113494873, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.873883843421936, + "num_tokens": 726491158.0, + "step": 19039 + }, + { + "epoch": 2.4220837043633123, + "ewc_loss": 0.07772284746170044, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040369329508394003, + "grad_norm": 9.053945541381836, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8502585291862488, + "num_tokens": 726531334.0, + "step": 19040 + }, + { + "epoch": 2.4222109146419033, + "ewc_loss": 0.07825417816638947, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040900660678744316, + "grad_norm": 9.219717025756836, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8576531410217285, + "num_tokens": 726571999.0, + "step": 19041 + }, + { + "epoch": 2.4223381249204934, + "ewc_loss": 0.0777382105588913, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040384690510109067, + "grad_norm": 9.12358570098877, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8556067943572998, + "num_tokens": 726604945.0, + "step": 19042 + }, + { + "epoch": 2.422465335199084, + "ewc_loss": 0.07824283093214035, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004088931600563228, + "grad_norm": 9.207002639770508, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8574960827827454, + "num_tokens": 726645023.0, + "step": 19043 + }, + { + "epoch": 2.4225925454776744, + "ewc_loss": 0.07748113572597504, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004037176549900323, + "grad_norm": 9.115906715393066, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.865065336227417, + "num_tokens": 726684297.0, + "step": 19044 + }, + { + "epoch": 2.422719755756265, + "ewc_loss": 0.07808913290500641, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040735621587373316, + "grad_norm": 9.190443992614746, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8543295860290527, + "num_tokens": 726726950.0, + "step": 19045 + }, + { + "epoch": 2.4228469660348555, + "ewc_loss": 0.07766538113355637, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004031186399515718, + "grad_norm": 9.083700180053711, + "learning_rate": 1e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8459920883178711, + "num_tokens": 726761944.0, + "step": 19046 + }, + { + "epoch": 2.422974176313446, + "ewc_loss": 0.07818116992712021, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000408276537200436, + "grad_norm": 9.125441551208496, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.87430340051651, + "num_tokens": 726799041.0, + "step": 19047 + }, + { + "epoch": 2.4231013865920366, + "ewc_loss": 0.07809045165777206, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004073693708050996, + "grad_norm": 9.12582015991211, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8584478497505188, + "num_tokens": 726838160.0, + "step": 19048 + }, + { + "epoch": 2.423228596870627, + "ewc_loss": 0.0779855027794838, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004063198866788298, + "grad_norm": 9.105224609375, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8751102685928345, + "num_tokens": 726873150.0, + "step": 19049 + }, + { + "epoch": 2.4233558071492176, + "ewc_loss": 0.07819443196058273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040840916335582733, + "grad_norm": 9.13830280303955, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8693428039550781, + "num_tokens": 726905253.0, + "step": 19050 + }, + { + "epoch": 2.423483017427808, + "ewc_loss": 0.07809150218963623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004073798772878945, + "grad_norm": 9.186296463012695, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8760900497436523, + "num_tokens": 726942842.0, + "step": 19051 + }, + { + "epoch": 2.4236102277063987, + "ewc_loss": 0.0780707374215126, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004071722214575857, + "grad_norm": 9.027674674987793, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8694973587989807, + "num_tokens": 726987974.0, + "step": 19052 + }, + { + "epoch": 2.423737437984989, + "ewc_loss": 0.07850302755832672, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004114951298106462, + "grad_norm": 9.202698707580566, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.883429765701294, + "num_tokens": 727021614.0, + "step": 19053 + }, + { + "epoch": 2.4238646482635797, + "ewc_loss": 0.07773250341415405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040378986159339547, + "grad_norm": 9.034808158874512, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.862785816192627, + "num_tokens": 727061773.0, + "step": 19054 + }, + { + "epoch": 2.4239918585421703, + "ewc_loss": 0.07882452011108398, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004147100553382188, + "grad_norm": 9.222216606140137, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8585519790649414, + "num_tokens": 727095877.0, + "step": 19055 + }, + { + "epoch": 2.424119068820761, + "ewc_loss": 0.07788985967636108, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040536338929086924, + "grad_norm": 9.142562866210938, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8679887056350708, + "num_tokens": 727128676.0, + "step": 19056 + }, + { + "epoch": 2.4242462790993513, + "ewc_loss": 0.07837831974029541, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041024808888323605, + "grad_norm": 9.144144058227539, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8738908171653748, + "num_tokens": 727164528.0, + "step": 19057 + }, + { + "epoch": 2.424373489377942, + "ewc_loss": 0.07806207239627838, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040708560845814645, + "grad_norm": 9.027508735656738, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8721929788589478, + "num_tokens": 727202403.0, + "step": 19058 + }, + { + "epoch": 2.4245006996565324, + "ewc_loss": 0.07845008373260498, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004109657311346382, + "grad_norm": 9.15982723236084, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8672107458114624, + "num_tokens": 727242231.0, + "step": 19059 + }, + { + "epoch": 2.424627909935123, + "ewc_loss": 0.07794903218746185, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040595518657937646, + "grad_norm": 9.047210693359375, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8506091833114624, + "num_tokens": 727276371.0, + "step": 19060 + }, + { + "epoch": 2.4247551202137134, + "ewc_loss": 0.07848356664180756, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041130054160021245, + "grad_norm": 9.195930480957031, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8679894208908081, + "num_tokens": 727320101.0, + "step": 19061 + }, + { + "epoch": 2.424882330492304, + "ewc_loss": 0.07779309153556824, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040683720726519823, + "grad_norm": 9.05145263671875, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8559985756874084, + "num_tokens": 727363687.0, + "step": 19062 + }, + { + "epoch": 2.4250095407708945, + "ewc_loss": 0.07855980098247528, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041206280002370477, + "grad_norm": 9.110841751098633, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8701171875, + "num_tokens": 727399772.0, + "step": 19063 + }, + { + "epoch": 2.425136751049485, + "ewc_loss": 0.07834020256996155, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004098669160157442, + "grad_norm": 9.12173843383789, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8738852739334106, + "num_tokens": 727442944.0, + "step": 19064 + }, + { + "epoch": 2.425263961328075, + "ewc_loss": 0.07839448750019073, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004104097024537623, + "grad_norm": 9.11980152130127, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8735308647155762, + "num_tokens": 727483283.0, + "step": 19065 + }, + { + "epoch": 2.425391171606666, + "ewc_loss": 0.07836361974477768, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004101010272279382, + "grad_norm": 9.195334434509277, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8632960915565491, + "num_tokens": 727526316.0, + "step": 19066 + }, + { + "epoch": 2.425518381885256, + "ewc_loss": 0.07812357693910599, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004077006014995277, + "grad_norm": 9.086255073547363, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8725998401641846, + "num_tokens": 727563218.0, + "step": 19067 + }, + { + "epoch": 2.4256455921638467, + "ewc_loss": 0.07855670154094696, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041203180444426835, + "grad_norm": 9.215025901794434, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8779919147491455, + "num_tokens": 727594877.0, + "step": 19068 + }, + { + "epoch": 2.425772802442437, + "ewc_loss": 0.07801632583141327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040662806713953614, + "grad_norm": 9.080034255981445, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.877661406993866, + "num_tokens": 727627561.0, + "step": 19069 + }, + { + "epoch": 2.4259000127210277, + "ewc_loss": 0.07876205444335938, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004140853707212955, + "grad_norm": 9.210251808166504, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8565719723701477, + "num_tokens": 727664384.0, + "step": 19070 + }, + { + "epoch": 2.4260272229996183, + "ewc_loss": 0.07792916893959045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040575649472884834, + "grad_norm": 9.073802947998047, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8615884184837341, + "num_tokens": 727700866.0, + "step": 19071 + }, + { + "epoch": 2.426154433278209, + "ewc_loss": 0.07867126166820526, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041317741852253675, + "grad_norm": 9.239731788635254, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8842329978942871, + "num_tokens": 727741539.0, + "step": 19072 + }, + { + "epoch": 2.4262816435567993, + "ewc_loss": 0.07799030840396881, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040636793710291386, + "grad_norm": 9.08296012878418, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.865835964679718, + "num_tokens": 727777828.0, + "step": 19073 + }, + { + "epoch": 2.42640885383539, + "ewc_loss": 0.07852432131767273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004117080825380981, + "grad_norm": 9.136983871459961, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8734886646270752, + "num_tokens": 727818979.0, + "step": 19074 + }, + { + "epoch": 2.4265360641139804, + "ewc_loss": 0.07824768126010895, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004089416179340333, + "grad_norm": 9.155243873596191, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8650392293930054, + "num_tokens": 727856440.0, + "step": 19075 + }, + { + "epoch": 2.426663274392571, + "ewc_loss": 0.07794217765331268, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004083279927726835, + "grad_norm": 9.057571411132812, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.875805139541626, + "num_tokens": 727895036.0, + "step": 19076 + }, + { + "epoch": 2.4267904846711614, + "ewc_loss": 0.07856589555740356, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004121238016523421, + "grad_norm": 9.224058151245117, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8534029722213745, + "num_tokens": 727932292.0, + "step": 19077 + }, + { + "epoch": 2.426917694949752, + "ewc_loss": 0.07806014269590378, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004070662835147232, + "grad_norm": 9.145890235900879, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8657270669937134, + "num_tokens": 727967005.0, + "step": 19078 + }, + { + "epoch": 2.4270449052283425, + "ewc_loss": 0.07852537930011749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041171861812472343, + "grad_norm": 9.17078685760498, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8557765483856201, + "num_tokens": 728004262.0, + "step": 19079 + }, + { + "epoch": 2.427172115506933, + "ewc_loss": 0.07817034423351288, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040816827095113695, + "grad_norm": 9.118461608886719, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8743138313293457, + "num_tokens": 728041451.0, + "step": 19080 + }, + { + "epoch": 2.4272993257855235, + "ewc_loss": 0.07831227779388428, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040958766476251185, + "grad_norm": 9.123751640319824, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8635916113853455, + "num_tokens": 728079217.0, + "step": 19081 + }, + { + "epoch": 2.427426536064114, + "ewc_loss": 0.07831971347332001, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004096620250493288, + "grad_norm": 9.158565521240234, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8817857503890991, + "num_tokens": 728116221.0, + "step": 19082 + }, + { + "epoch": 2.4275537463427046, + "ewc_loss": 0.0781073123216629, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040753796929493546, + "grad_norm": 9.158744812011719, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8563910722732544, + "num_tokens": 728154861.0, + "step": 19083 + }, + { + "epoch": 2.427680956621295, + "ewc_loss": 0.07834883779287338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004099532379768789, + "grad_norm": 9.239564895629883, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8533251881599426, + "num_tokens": 728188276.0, + "step": 19084 + }, + { + "epoch": 2.4278081668998857, + "ewc_loss": 0.07783028483390808, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004072090669069439, + "grad_norm": 9.314163208007812, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8765296339988708, + "num_tokens": 728229313.0, + "step": 19085 + }, + { + "epoch": 2.427935377178476, + "ewc_loss": 0.07815203070640564, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040798517875373363, + "grad_norm": 9.642138481140137, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8558197021484375, + "num_tokens": 728268706.0, + "step": 19086 + }, + { + "epoch": 2.4280625874570667, + "ewc_loss": 0.0771508663892746, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003979735483881086, + "grad_norm": 8.952984809875488, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8785516619682312, + "num_tokens": 728297490.0, + "step": 19087 + }, + { + "epoch": 2.428189797735657, + "ewc_loss": 0.07902659475803375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004167307633906603, + "grad_norm": 9.739143371582031, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8591046929359436, + "num_tokens": 728336041.0, + "step": 19088 + }, + { + "epoch": 2.4283170080142478, + "ewc_loss": 0.0767861157655716, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0003943259653169662, + "grad_norm": 8.98526668548584, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8732478618621826, + "num_tokens": 728372701.0, + "step": 19089 + }, + { + "epoch": 2.428444218292838, + "ewc_loss": 0.07921098172664642, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004210160986986011, + "grad_norm": 9.555373191833496, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8686131238937378, + "num_tokens": 728408138.0, + "step": 19090 + }, + { + "epoch": 2.4285714285714284, + "ewc_loss": 0.07671743631362915, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003960806643590331, + "grad_norm": 8.846508979797363, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8713603019714355, + "num_tokens": 728452649.0, + "step": 19091 + }, + { + "epoch": 2.428698638850019, + "ewc_loss": 0.07984337210655212, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004273400118108839, + "grad_norm": 9.666547775268555, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8561403751373291, + "num_tokens": 728490551.0, + "step": 19092 + }, + { + "epoch": 2.4288258491286094, + "ewc_loss": 0.07688496261835098, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0003977558808401227, + "grad_norm": 8.820733070373535, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8708951473236084, + "num_tokens": 728529558.0, + "step": 19093 + }, + { + "epoch": 2.4289530594072, + "ewc_loss": 0.08032781630754471, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004321844025980681, + "grad_norm": 9.728296279907227, + "learning_rate": 1e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8421023488044739, + "num_tokens": 728573495.0, + "step": 19094 + }, + { + "epoch": 2.4290802696857905, + "ewc_loss": 0.07741443067789078, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040060916217043996, + "grad_norm": 8.968064308166504, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8657336235046387, + "num_tokens": 728609166.0, + "step": 19095 + }, + { + "epoch": 2.429207479964381, + "ewc_loss": 0.08051016181707382, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043156647006981075, + "grad_norm": 9.548829078674316, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8735284805297852, + "num_tokens": 728653070.0, + "step": 19096 + }, + { + "epoch": 2.4293346902429716, + "ewc_loss": 0.07799273729324341, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004063921805936843, + "grad_norm": 9.151628494262695, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8673517107963562, + "num_tokens": 728689142.0, + "step": 19097 + }, + { + "epoch": 2.429461900521562, + "ewc_loss": 0.07944725453853607, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004233787767589092, + "grad_norm": 9.389467239379883, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8690515756607056, + "num_tokens": 728725615.0, + "step": 19098 + }, + { + "epoch": 2.4295891108001526, + "ewc_loss": 0.0781681090593338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004081459192093462, + "grad_norm": 9.202747344970703, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8558058142662048, + "num_tokens": 728762366.0, + "step": 19099 + }, + { + "epoch": 2.429716321078743, + "ewc_loss": 0.07871121168136597, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004160183889325708, + "grad_norm": 9.292819023132324, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8540815711021423, + "num_tokens": 728801951.0, + "step": 19100 + }, + { + "epoch": 2.4298435313573337, + "ewc_loss": 0.07841439545154572, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041060883086174726, + "grad_norm": 9.132186889648438, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8741710186004639, + "num_tokens": 728836403.0, + "step": 19101 + }, + { + "epoch": 2.429970741635924, + "ewc_loss": 0.07850365340709686, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004139427619520575, + "grad_norm": 9.23868179321289, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8726638555526733, + "num_tokens": 728878485.0, + "step": 19102 + }, + { + "epoch": 2.4300979519145147, + "ewc_loss": 0.07825937122106552, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040905855712480843, + "grad_norm": 9.119739532470703, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8750426769256592, + "num_tokens": 728915935.0, + "step": 19103 + }, + { + "epoch": 2.4302251621931052, + "ewc_loss": 0.07876379787921906, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041410280391573906, + "grad_norm": 9.213297843933105, + "learning_rate": 1e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8476549386978149, + "num_tokens": 728956847.0, + "step": 19104 + }, + { + "epoch": 2.4303523724716958, + "ewc_loss": 0.07812035828828812, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000407668441766873, + "grad_norm": 9.077342987060547, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8702359795570374, + "num_tokens": 728994865.0, + "step": 19105 + }, + { + "epoch": 2.4304795827502863, + "ewc_loss": 0.07862463593482971, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041271120426245034, + "grad_norm": 9.164275169372559, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8574925661087036, + "num_tokens": 729033030.0, + "step": 19106 + }, + { + "epoch": 2.430606793028877, + "ewc_loss": 0.07828924059867859, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000411798624554649, + "grad_norm": 9.168436050415039, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.876387357711792, + "num_tokens": 729070430.0, + "step": 19107 + }, + { + "epoch": 2.4307340033074674, + "ewc_loss": 0.078330859541893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004097734927199781, + "grad_norm": 9.101827621459961, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8774062395095825, + "num_tokens": 729102127.0, + "step": 19108 + }, + { + "epoch": 2.430861213586058, + "ewc_loss": 0.07866856455802917, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004131504974793643, + "grad_norm": 9.116086959838867, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8673828840255737, + "num_tokens": 729138128.0, + "step": 19109 + }, + { + "epoch": 2.4309884238646484, + "ewc_loss": 0.07861761748790741, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004126410640310496, + "grad_norm": 9.135126113891602, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8622962236404419, + "num_tokens": 729179756.0, + "step": 19110 + }, + { + "epoch": 2.431115634143239, + "ewc_loss": 0.07841428369283676, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000410607666708529, + "grad_norm": 9.05091667175293, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8540792465209961, + "num_tokens": 729221356.0, + "step": 19111 + }, + { + "epoch": 2.4312428444218295, + "ewc_loss": 0.07849854230880737, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.000413891626521945, + "grad_norm": 9.105342864990234, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8686906099319458, + "num_tokens": 729260456.0, + "step": 19112 + }, + { + "epoch": 2.4313700547004196, + "ewc_loss": 0.07841254770755768, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004130316956434399, + "grad_norm": 9.1361665725708, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8542264699935913, + "num_tokens": 729299654.0, + "step": 19113 + }, + { + "epoch": 2.4314972649790105, + "ewc_loss": 0.07843418419361115, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041080673690885305, + "grad_norm": 9.063619613647461, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8680153489112854, + "num_tokens": 729341296.0, + "step": 19114 + }, + { + "epoch": 2.4316244752576006, + "ewc_loss": 0.07893538475036621, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041581870755180717, + "grad_norm": 9.181947708129883, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8691091537475586, + "num_tokens": 729379632.0, + "step": 19115 + }, + { + "epoch": 2.431751685536191, + "ewc_loss": 0.07831864058971405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040965122752822936, + "grad_norm": 9.004683494567871, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8666903972625732, + "num_tokens": 729423202.0, + "step": 19116 + }, + { + "epoch": 2.4318788958147817, + "ewc_loss": 0.07896719872951508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004161368415225297, + "grad_norm": 9.189783096313477, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8854074478149414, + "num_tokens": 729457108.0, + "step": 19117 + }, + { + "epoch": 2.432006106093372, + "ewc_loss": 0.0781087726354599, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040999395423568785, + "grad_norm": 9.04261302947998, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8621850609779358, + "num_tokens": 729494065.0, + "step": 19118 + }, + { + "epoch": 2.4321333163719627, + "ewc_loss": 0.07878093421459198, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004167156293988228, + "grad_norm": 9.194718360900879, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8703262805938721, + "num_tokens": 729536621.0, + "step": 19119 + }, + { + "epoch": 2.4322605266505533, + "ewc_loss": 0.07807309925556183, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004096372576896101, + "grad_norm": 9.048853874206543, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8663122653961182, + "num_tokens": 729574423.0, + "step": 19120 + }, + { + "epoch": 2.432387736929144, + "ewc_loss": 0.07883064448833466, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004172126646153629, + "grad_norm": 9.184784889221191, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8606990575790405, + "num_tokens": 729614673.0, + "step": 19121 + }, + { + "epoch": 2.4325149472077343, + "ewc_loss": 0.07834498584270477, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040991470450535417, + "grad_norm": 9.030830383300781, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8648232817649841, + "num_tokens": 729653049.0, + "step": 19122 + }, + { + "epoch": 2.432642157486325, + "ewc_loss": 0.07884447276592255, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004173510242253542, + "grad_norm": 9.203283309936523, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8640718460083008, + "num_tokens": 729689667.0, + "step": 19123 + }, + { + "epoch": 2.4327693677649154, + "ewc_loss": 0.0781431570649147, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041033781599253416, + "grad_norm": 8.994584083557129, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8697089552879333, + "num_tokens": 729734747.0, + "step": 19124 + }, + { + "epoch": 2.432896578043506, + "ewc_loss": 0.07910405099391937, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004199467657599598, + "grad_norm": 9.337947845458984, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8607949614524841, + "num_tokens": 729770135.0, + "step": 19125 + }, + { + "epoch": 2.4330237883220964, + "ewc_loss": 0.0781528502702713, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040799329872243106, + "grad_norm": 8.996118545532227, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8647270202636719, + "num_tokens": 729809203.0, + "step": 19126 + }, + { + "epoch": 2.433150998600687, + "ewc_loss": 0.07938006520271301, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042026551091112196, + "grad_norm": 9.222344398498535, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8715373277664185, + "num_tokens": 729850164.0, + "step": 19127 + }, + { + "epoch": 2.4332782088792775, + "ewc_loss": 0.07824406027793884, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040890550008043647, + "grad_norm": 9.050337791442871, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8546919822692871, + "num_tokens": 729884337.0, + "step": 19128 + }, + { + "epoch": 2.433405419157868, + "ewc_loss": 0.07909846305847168, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174494824837893, + "grad_norm": 9.172358512878418, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8782461285591125, + "num_tokens": 729924098.0, + "step": 19129 + }, + { + "epoch": 2.4335326294364585, + "ewc_loss": 0.07832138240337372, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040967861423268914, + "grad_norm": 9.053221702575684, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8845734596252441, + "num_tokens": 729960671.0, + "step": 19130 + }, + { + "epoch": 2.433659839715049, + "ewc_loss": 0.07909363508224487, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041740122833289206, + "grad_norm": 9.208852767944336, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8672024607658386, + "num_tokens": 730000780.0, + "step": 19131 + }, + { + "epoch": 2.4337870499936396, + "ewc_loss": 0.078336201608181, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040982686914503574, + "grad_norm": 8.99804973602295, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8634722828865051, + "num_tokens": 730045198.0, + "step": 19132 + }, + { + "epoch": 2.43391426027223, + "ewc_loss": 0.07914210110902786, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041788583621382713, + "grad_norm": 9.177694320678711, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8603790998458862, + "num_tokens": 730085010.0, + "step": 19133 + }, + { + "epoch": 2.4340414705508207, + "ewc_loss": 0.07842430472373962, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004107078420929611, + "grad_norm": 9.18107795715332, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8599686622619629, + "num_tokens": 730121512.0, + "step": 19134 + }, + { + "epoch": 2.434168680829411, + "ewc_loss": 0.07881779968738556, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041464288369752467, + "grad_norm": 9.087959289550781, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8649436831474304, + "num_tokens": 730164859.0, + "step": 19135 + }, + { + "epoch": 2.4342958911080017, + "ewc_loss": 0.07852619886398315, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004141682875342667, + "grad_norm": 9.087599754333496, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8674620389938354, + "num_tokens": 730202273.0, + "step": 19136 + }, + { + "epoch": 2.4344231013865922, + "ewc_loss": 0.07876676321029663, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004141325189266354, + "grad_norm": 9.176912307739258, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8580174446105957, + "num_tokens": 730248208.0, + "step": 19137 + }, + { + "epoch": 2.4345503116651823, + "ewc_loss": 0.07870036363601685, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000413468515034765, + "grad_norm": 9.097657203674316, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8738135099411011, + "num_tokens": 730291084.0, + "step": 19138 + }, + { + "epoch": 2.4346775219437733, + "ewc_loss": 0.07850503921508789, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041395664447918534, + "grad_norm": 9.123703956604004, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8700172305107117, + "num_tokens": 730334504.0, + "step": 19139 + }, + { + "epoch": 2.4348047322223634, + "ewc_loss": 0.07842709124088287, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041317721479572356, + "grad_norm": 9.14427661895752, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8698761463165283, + "num_tokens": 730373356.0, + "step": 19140 + }, + { + "epoch": 2.434931942500954, + "ewc_loss": 0.07851584255695343, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004140646487940103, + "grad_norm": 9.181417465209961, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8512144684791565, + "num_tokens": 730416229.0, + "step": 19141 + }, + { + "epoch": 2.4350591527795444, + "ewc_loss": 0.07842831313610077, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041318940930068493, + "grad_norm": 9.122827529907227, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8785967230796814, + "num_tokens": 730458922.0, + "step": 19142 + }, + { + "epoch": 2.435186363058135, + "ewc_loss": 0.07862178236246109, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004151240864302963, + "grad_norm": 9.194573402404785, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8642327785491943, + "num_tokens": 730494583.0, + "step": 19143 + }, + { + "epoch": 2.4353135733367255, + "ewc_loss": 0.07834136486053467, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041231990326195955, + "grad_norm": 9.126218795776367, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.866456151008606, + "num_tokens": 730529526.0, + "step": 19144 + }, + { + "epoch": 2.435440783615316, + "ewc_loss": 0.07876314222812653, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041653771768324077, + "grad_norm": 9.284765243530273, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.868604302406311, + "num_tokens": 730558584.0, + "step": 19145 + }, + { + "epoch": 2.4355679938939065, + "ewc_loss": 0.07811097800731659, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041001601493917406, + "grad_norm": 9.072200775146484, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.865088701248169, + "num_tokens": 730598677.0, + "step": 19146 + }, + { + "epoch": 2.435695204172497, + "ewc_loss": 0.07884937524795532, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004174000059720129, + "grad_norm": 9.31936264038086, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8749109506607056, + "num_tokens": 730636985.0, + "step": 19147 + }, + { + "epoch": 2.4358224144510876, + "ewc_loss": 0.07782728970050812, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004071791481692344, + "grad_norm": 9.003387451171875, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8487361073493958, + "num_tokens": 730675409.0, + "step": 19148 + }, + { + "epoch": 2.435949624729678, + "ewc_loss": 0.07901397347450256, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041904597310349345, + "grad_norm": 9.314602851867676, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8531675338745117, + "num_tokens": 730714739.0, + "step": 19149 + }, + { + "epoch": 2.4360768350082687, + "ewc_loss": 0.07777027785778046, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004066089750267565, + "grad_norm": 9.020694732666016, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8638748526573181, + "num_tokens": 730757059.0, + "step": 19150 + }, + { + "epoch": 2.436204045286859, + "ewc_loss": 0.07930061221122742, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004219123220536858, + "grad_norm": 9.359770774841309, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8638912439346313, + "num_tokens": 730793469.0, + "step": 19151 + }, + { + "epoch": 2.4363312555654497, + "ewc_loss": 0.07820725440979004, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004085373948328197, + "grad_norm": 9.046069145202637, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.870663046836853, + "num_tokens": 730829336.0, + "step": 19152 + }, + { + "epoch": 2.4364584658440402, + "ewc_loss": 0.07919374853372574, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00042084374581463635, + "grad_norm": 9.330340385437012, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8629564046859741, + "num_tokens": 730869304.0, + "step": 19153 + }, + { + "epoch": 2.4365856761226308, + "ewc_loss": 0.07814322412014008, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004103385435882956, + "grad_norm": 9.123164176940918, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8534575700759888, + "num_tokens": 730905315.0, + "step": 19154 + }, + { + "epoch": 2.4367128864012213, + "ewc_loss": 0.07922935485839844, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004211997729726136, + "grad_norm": 9.316986083984375, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8563731908798218, + "num_tokens": 730947346.0, + "step": 19155 + }, + { + "epoch": 2.436840096679812, + "ewc_loss": 0.07809874415397644, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00040989372064359486, + "grad_norm": 9.099113464355469, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8706322908401489, + "num_tokens": 730976907.0, + "step": 19156 + }, + { + "epoch": 2.4369673069584024, + "ewc_loss": 0.07903722673654556, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041927851270884275, + "grad_norm": 9.285298347473145, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8615137338638306, + "num_tokens": 731015940.0, + "step": 19157 + }, + { + "epoch": 2.437094517236993, + "ewc_loss": 0.07833527028560638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004098175559192896, + "grad_norm": 9.056999206542969, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8834783434867859, + "num_tokens": 731051345.0, + "step": 19158 + }, + { + "epoch": 2.4372217275155834, + "ewc_loss": 0.07919280230998993, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004183928540442139, + "grad_norm": 9.264132499694824, + "learning_rate": 1e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8323690891265869, + "num_tokens": 731089642.0, + "step": 19159 + }, + { + "epoch": 2.437348937794174, + "ewc_loss": 0.07822349667549133, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004086997942067683, + "grad_norm": 9.062668800354004, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8830109238624573, + "num_tokens": 731133196.0, + "step": 19160 + }, + { + "epoch": 2.4374761480727645, + "ewc_loss": 0.07920706272125244, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041853543370962143, + "grad_norm": 14.659322738647461, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8549348711967468, + "num_tokens": 731175510.0, + "step": 19161 + }, + { + "epoch": 2.437603358351355, + "ewc_loss": 0.08626508712768555, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004891157150268555, + "grad_norm": 9.89211368560791, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8728283047676086, + "num_tokens": 731217526.0, + "step": 19162 + }, + { + "epoch": 2.437730568629945, + "ewc_loss": 0.08143521845340729, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004408170352689922, + "grad_norm": 9.615361213684082, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8541549444198608, + "num_tokens": 731259525.0, + "step": 19163 + }, + { + "epoch": 2.437857778908536, + "ewc_loss": 0.07964566349983215, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004229214391671121, + "grad_norm": 9.342381477355957, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8703821897506714, + "num_tokens": 731301480.0, + "step": 19164 + }, + { + "epoch": 2.437984989187126, + "ewc_loss": 0.08186295628547668, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004450943961273879, + "grad_norm": 9.637296676635742, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.874491810798645, + "num_tokens": 731347385.0, + "step": 19165 + }, + { + "epoch": 2.4381121994657167, + "ewc_loss": 0.07926398515701294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041910470463335514, + "grad_norm": 9.266207695007324, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8695443868637085, + "num_tokens": 731383666.0, + "step": 19166 + }, + { + "epoch": 2.438239409744307, + "ewc_loss": 0.08054797351360321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004319445579312742, + "grad_norm": 9.474685668945312, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8653954267501831, + "num_tokens": 731420297.0, + "step": 19167 + }, + { + "epoch": 2.4383666200228977, + "ewc_loss": 0.0792480930685997, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004189457686152309, + "grad_norm": 9.292531967163086, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8608774542808533, + "num_tokens": 731455083.0, + "step": 19168 + }, + { + "epoch": 2.4384938303014883, + "ewc_loss": 0.08008940517902374, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042735887109301984, + "grad_norm": 9.48020076751709, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8736333847045898, + "num_tokens": 731482555.0, + "step": 19169 + }, + { + "epoch": 2.438621040580079, + "ewc_loss": 0.07882325351238251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041469736606813967, + "grad_norm": 9.218886375427246, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.877722978591919, + "num_tokens": 731519459.0, + "step": 19170 + }, + { + "epoch": 2.4387482508586693, + "ewc_loss": 0.07965452969074249, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004230100894346833, + "grad_norm": 9.344269752502441, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8694831728935242, + "num_tokens": 731554577.0, + "step": 19171 + }, + { + "epoch": 2.43887546113726, + "ewc_loss": 0.07889202237129211, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041538506047800183, + "grad_norm": 9.20667839050293, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.856422483921051, + "num_tokens": 731594993.0, + "step": 19172 + }, + { + "epoch": 2.4390026714158504, + "ewc_loss": 0.07926077395677567, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004190725740045309, + "grad_norm": 14.709431648254395, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8707321882247925, + "num_tokens": 731629182.0, + "step": 19173 + }, + { + "epoch": 2.439129881694441, + "ewc_loss": 0.08667370676994324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004932019510306418, + "grad_norm": 10.020941734313965, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8635549545288086, + "num_tokens": 731667661.0, + "step": 19174 + }, + { + "epoch": 2.4392570919730314, + "ewc_loss": 0.08075056225061417, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004339704755693674, + "grad_norm": 9.589468955993652, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8669131398200989, + "num_tokens": 731704911.0, + "step": 19175 + }, + { + "epoch": 2.439384302251622, + "ewc_loss": 0.07982024550437927, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042466731974855065, + "grad_norm": 9.550619125366211, + "learning_rate": 1e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8475853800773621, + "num_tokens": 731744990.0, + "step": 19176 + }, + { + "epoch": 2.4395115125302125, + "ewc_loss": 0.08065155893564224, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004329804505687207, + "grad_norm": 9.43735408782959, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8674194812774658, + "num_tokens": 731783335.0, + "step": 19177 + }, + { + "epoch": 2.439638722808803, + "ewc_loss": 0.07970435917377472, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004235084052197635, + "grad_norm": 9.480645179748535, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8678381443023682, + "num_tokens": 731819206.0, + "step": 19178 + }, + { + "epoch": 2.4397659330873935, + "ewc_loss": 0.07926733791828156, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004191382322460413, + "grad_norm": 9.287970542907715, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8627611994743347, + "num_tokens": 731855291.0, + "step": 19179 + }, + { + "epoch": 2.439893143365984, + "ewc_loss": 0.07973276078701019, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042379245860502124, + "grad_norm": 9.42805004119873, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8687648177146912, + "num_tokens": 731899392.0, + "step": 19180 + }, + { + "epoch": 2.4400203536445746, + "ewc_loss": 0.07876774668693542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041414235602132976, + "grad_norm": 9.198931694030762, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8661527633666992, + "num_tokens": 731937502.0, + "step": 19181 + }, + { + "epoch": 2.440147563923165, + "ewc_loss": 0.07927604019641876, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004216666566208005, + "grad_norm": 9.360902786254883, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8682332038879395, + "num_tokens": 731974946.0, + "step": 19182 + }, + { + "epoch": 2.4402747742017556, + "ewc_loss": 0.07862977683544159, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004127626307308674, + "grad_norm": 9.180061340332031, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.853693962097168, + "num_tokens": 732018732.0, + "step": 19183 + }, + { + "epoch": 2.440401984480346, + "ewc_loss": 0.07899202406406403, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041882647201418877, + "grad_norm": 9.328389167785645, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8642904162406921, + "num_tokens": 732052036.0, + "step": 19184 + }, + { + "epoch": 2.4405291947589367, + "ewc_loss": 0.0782843679189682, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041174987563863397, + "grad_norm": 9.172444343566895, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8614620566368103, + "num_tokens": 732090944.0, + "step": 19185 + }, + { + "epoch": 2.440656405037527, + "ewc_loss": 0.0789099857211113, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041800609324127436, + "grad_norm": 9.330145835876465, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8679535388946533, + "num_tokens": 732131406.0, + "step": 19186 + }, + { + "epoch": 2.4407836153161178, + "ewc_loss": 0.07845959067344666, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004110607551410794, + "grad_norm": 9.183839797973633, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8645949363708496, + "num_tokens": 732171701.0, + "step": 19187 + }, + { + "epoch": 2.440910825594708, + "ewc_loss": 0.07888131588697433, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041527801658958197, + "grad_norm": 9.216424942016602, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8751161694526672, + "num_tokens": 732209658.0, + "step": 19188 + }, + { + "epoch": 2.4410380358732984, + "ewc_loss": 0.07829320430755615, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041183826397173107, + "grad_norm": 9.154998779296875, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.854340136051178, + "num_tokens": 732251395.0, + "step": 19189 + }, + { + "epoch": 2.441165246151889, + "ewc_loss": 0.07871551066637039, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004136199422646314, + "grad_norm": 9.23926830291748, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8552806377410889, + "num_tokens": 732289306.0, + "step": 19190 + }, + { + "epoch": 2.4412924564304794, + "ewc_loss": 0.07845757901668549, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004110406443942338, + "grad_norm": 9.067483901977539, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8508816957473755, + "num_tokens": 732337916.0, + "step": 19191 + }, + { + "epoch": 2.44141966670907, + "ewc_loss": 0.07881419360637665, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004170481988694519, + "grad_norm": 9.25583267211914, + "learning_rate": 1e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8525829911231995, + "num_tokens": 732380358.0, + "step": 19192 + }, + { + "epoch": 2.4415468769876605, + "ewc_loss": 0.07830892503261566, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040955410804599524, + "grad_norm": 9.066778182983398, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8749812841415405, + "num_tokens": 732416534.0, + "step": 19193 + }, + { + "epoch": 2.441674087266251, + "ewc_loss": 0.07918764650821686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041834128205664456, + "grad_norm": 9.193480491638184, + "learning_rate": 1e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8448828458786011, + "num_tokens": 732458927.0, + "step": 19194 + }, + { + "epoch": 2.4418012975448415, + "ewc_loss": 0.07852363586425781, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004117012140341103, + "grad_norm": 9.12843132019043, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8572248220443726, + "num_tokens": 732501261.0, + "step": 19195 + }, + { + "epoch": 2.441928507823432, + "ewc_loss": 0.07893625646829605, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004182688135188073, + "grad_norm": 9.239089012145996, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8732374310493469, + "num_tokens": 732541162.0, + "step": 19196 + }, + { + "epoch": 2.4420557181020226, + "ewc_loss": 0.0783240795135498, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041214708471670747, + "grad_norm": 9.118249893188477, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8631775379180908, + "num_tokens": 732578237.0, + "step": 19197 + }, + { + "epoch": 2.442182928380613, + "ewc_loss": 0.0789186954498291, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004180932592134923, + "grad_norm": 9.229665756225586, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8544608354568481, + "num_tokens": 732618624.0, + "step": 19198 + }, + { + "epoch": 2.4423101386592037, + "ewc_loss": 0.07871849089860916, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004136497445870191, + "grad_norm": 9.107260704040527, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8622499704360962, + "num_tokens": 732661747.0, + "step": 19199 + }, + { + "epoch": 2.442437348937794, + "ewc_loss": 0.07920214533805847, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041848630644381046, + "grad_norm": 9.197355270385742, + "learning_rate": 1e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.8445335030555725, + "num_tokens": 732703511.0, + "step": 19200 + }, + { + "epoch": 2.4425645592163847, + "ewc_loss": 0.07858921587467194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041235698154196143, + "grad_norm": 9.124444961547852, + "learning_rate": 1e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.84520423412323, + "num_tokens": 732746749.0, + "step": 19201 + }, + { + "epoch": 2.4426917694949752, + "ewc_loss": 0.07913516461849213, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041781648178584874, + "grad_norm": 9.208633422851562, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8659220933914185, + "num_tokens": 732783184.0, + "step": 19202 + }, + { + "epoch": 2.4428189797735658, + "ewc_loss": 0.07850642502307892, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004139704687986523, + "grad_norm": 9.12537956237793, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.858881950378418, + "num_tokens": 732818664.0, + "step": 19203 + }, + { + "epoch": 2.4429461900521563, + "ewc_loss": 0.07909242063760757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041738906293176115, + "grad_norm": 9.233439445495605, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8561382293701172, + "num_tokens": 732854286.0, + "step": 19204 + }, + { + "epoch": 2.443073400330747, + "ewc_loss": 0.0784682035446167, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004111469315830618, + "grad_norm": 9.092589378356934, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8706439137458801, + "num_tokens": 732893032.0, + "step": 19205 + }, + { + "epoch": 2.4432006106093374, + "ewc_loss": 0.07916629314422607, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041812777635641396, + "grad_norm": 9.24719524383545, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8641239404678345, + "num_tokens": 732929942.0, + "step": 19206 + }, + { + "epoch": 2.443327820887928, + "ewc_loss": 0.07843908667564392, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041085571865551174, + "grad_norm": 9.094860076904297, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8599346876144409, + "num_tokens": 732968218.0, + "step": 19207 + }, + { + "epoch": 2.4434550311665184, + "ewc_loss": 0.0791889876127243, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004183546989224851, + "grad_norm": 9.272942543029785, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8882084488868713, + "num_tokens": 733001464.0, + "step": 19208 + }, + { + "epoch": 2.443582241445109, + "ewc_loss": 0.07833711802959442, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040983606595546007, + "grad_norm": 9.144808769226074, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8564348816871643, + "num_tokens": 733036311.0, + "step": 19209 + }, + { + "epoch": 2.4437094517236995, + "ewc_loss": 0.07906430959701538, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041710789082571864, + "grad_norm": 9.274992942810059, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8718810081481934, + "num_tokens": 733070487.0, + "step": 19210 + }, + { + "epoch": 2.4438366620022896, + "ewc_loss": 0.07834232598543167, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004098881036043167, + "grad_norm": 9.112634658813477, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8630146384239197, + "num_tokens": 733110357.0, + "step": 19211 + }, + { + "epoch": 2.4439638722808805, + "ewc_loss": 0.07913865149021149, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004178514063823968, + "grad_norm": 9.334470748901367, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8566877245903015, + "num_tokens": 733149070.0, + "step": 19212 + }, + { + "epoch": 2.4440910825594706, + "ewc_loss": 0.0782925933599472, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040939077734947205, + "grad_norm": 9.214727401733398, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8807233572006226, + "num_tokens": 733180848.0, + "step": 19213 + }, + { + "epoch": 2.444218292838061, + "ewc_loss": 0.07892149686813354, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004156797658652067, + "grad_norm": 9.325035095214844, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8649044036865234, + "num_tokens": 733219204.0, + "step": 19214 + }, + { + "epoch": 2.4443455031166517, + "ewc_loss": 0.07843442261219025, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004083676903974265, + "grad_norm": 9.095708847045898, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8539348840713501, + "num_tokens": 733255237.0, + "step": 19215 + }, + { + "epoch": 2.444472713395242, + "ewc_loss": 0.07932257652282715, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004172491608187556, + "grad_norm": 9.295244216918945, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8767189979553223, + "num_tokens": 733293243.0, + "step": 19216 + }, + { + "epoch": 2.4445999236738327, + "ewc_loss": 0.07816222310066223, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004080871294718236, + "grad_norm": 9.117233276367188, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8673721551895142, + "num_tokens": 733333530.0, + "step": 19217 + }, + { + "epoch": 2.4447271339524232, + "ewc_loss": 0.07907049357891083, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041716976556926966, + "grad_norm": 9.3624849319458, + "learning_rate": 1e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8481231331825256, + "num_tokens": 733373790.0, + "step": 19218 + }, + { + "epoch": 2.4448543442310138, + "ewc_loss": 0.0780540183186531, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040700501995161176, + "grad_norm": 9.055591583251953, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8666629791259766, + "num_tokens": 733416037.0, + "step": 19219 + }, + { + "epoch": 2.4449815545096043, + "ewc_loss": 0.07916629314422607, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004181277472525835, + "grad_norm": 9.305861473083496, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8608764410018921, + "num_tokens": 733453014.0, + "step": 19220 + }, + { + "epoch": 2.445108764788195, + "ewc_loss": 0.07814788818359375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004079437640029937, + "grad_norm": 9.133569717407227, + "learning_rate": 1e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8508007526397705, + "num_tokens": 733491197.0, + "step": 19221 + }, + { + "epoch": 2.4452359750667854, + "ewc_loss": 0.07954759150743484, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004170579486526549, + "grad_norm": 14.332831382751465, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8778241872787476, + "num_tokens": 733531142.0, + "step": 19222 + }, + { + "epoch": 2.445363185345376, + "ewc_loss": 0.08500993996858597, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004765642515849322, + "grad_norm": 9.818485260009766, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8581557869911194, + "num_tokens": 733573916.0, + "step": 19223 + }, + { + "epoch": 2.4454903956239664, + "ewc_loss": 0.08112741261720657, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004377389559522271, + "grad_norm": 9.714568138122559, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8588130474090576, + "num_tokens": 733613204.0, + "step": 19224 + }, + { + "epoch": 2.445617605902557, + "ewc_loss": 0.07917359471321106, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041820076876319945, + "grad_norm": 9.264606475830078, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8592681288719177, + "num_tokens": 733653618.0, + "step": 19225 + }, + { + "epoch": 2.4457448161811475, + "ewc_loss": 0.08169512450695038, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044341603643260896, + "grad_norm": 9.743697166442871, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8814213871955872, + "num_tokens": 733687795.0, + "step": 19226 + }, + { + "epoch": 2.445872026459738, + "ewc_loss": 0.07871806621551514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004136454954277724, + "grad_norm": 9.25331974029541, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8635295629501343, + "num_tokens": 733723574.0, + "step": 19227 + }, + { + "epoch": 2.4459992367383285, + "ewc_loss": 0.08042958378791809, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004307607305236161, + "grad_norm": 9.519976615905762, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8721186518669128, + "num_tokens": 733762051.0, + "step": 19228 + }, + { + "epoch": 2.446126447016919, + "ewc_loss": 0.07892376184463501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041570240864530206, + "grad_norm": 9.261449813842773, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8823000192642212, + "num_tokens": 733800640.0, + "step": 19229 + }, + { + "epoch": 2.4462536572955096, + "ewc_loss": 0.07966219633817673, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00042552821105346084, + "grad_norm": 9.415487289428711, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8531953692436218, + "num_tokens": 733841321.0, + "step": 19230 + }, + { + "epoch": 2.4463808675741, + "ewc_loss": 0.07858914881944656, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004123563412576914, + "grad_norm": 9.265727996826172, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.867193341255188, + "num_tokens": 733874104.0, + "step": 19231 + }, + { + "epoch": 2.4465080778526906, + "ewc_loss": 0.07940812408924103, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004205461300443858, + "grad_norm": 9.292720794677734, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8688421845436096, + "num_tokens": 733913127.0, + "step": 19232 + }, + { + "epoch": 2.446635288131281, + "ewc_loss": 0.07874805480241776, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004139453812967986, + "grad_norm": 9.23905086517334, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8818076848983765, + "num_tokens": 733951900.0, + "step": 19233 + }, + { + "epoch": 2.4467624984098717, + "ewc_loss": 0.07888078689575195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004152726905886084, + "grad_norm": 9.256025314331055, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8674187660217285, + "num_tokens": 733992866.0, + "step": 19234 + }, + { + "epoch": 2.4468897086884622, + "ewc_loss": 0.07874446362257004, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000413909467170015, + "grad_norm": 9.174178123474121, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8597612380981445, + "num_tokens": 734029895.0, + "step": 19235 + }, + { + "epoch": 2.4470169189670523, + "ewc_loss": 0.0788838118314743, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004153029585722834, + "grad_norm": 9.313150405883789, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8806259632110596, + "num_tokens": 734066954.0, + "step": 19236 + }, + { + "epoch": 2.4471441292456433, + "ewc_loss": 0.07853594422340393, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004118242650292814, + "grad_norm": 9.148916244506836, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8860985636711121, + "num_tokens": 734102412.0, + "step": 19237 + }, + { + "epoch": 2.4472713395242334, + "ewc_loss": 0.07897011190652847, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004161659744568169, + "grad_norm": 9.221736907958984, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8823902606964111, + "num_tokens": 734141400.0, + "step": 19238 + }, + { + "epoch": 2.447398549802824, + "ewc_loss": 0.07857201248407364, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041218497790396214, + "grad_norm": 9.274151802062988, + "learning_rate": 1e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.844376802444458, + "num_tokens": 734176628.0, + "step": 19239 + }, + { + "epoch": 2.4475257600814144, + "ewc_loss": 0.07873799651861191, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041384482756257057, + "grad_norm": 9.157445907592773, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8706668615341187, + "num_tokens": 734219756.0, + "step": 19240 + }, + { + "epoch": 2.447652970360005, + "ewc_loss": 0.07909133285284042, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041737817809917033, + "grad_norm": 9.365281105041504, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.863858699798584, + "num_tokens": 734250509.0, + "step": 19241 + }, + { + "epoch": 2.4477801806385955, + "ewc_loss": 0.07830102741718292, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004094750911463052, + "grad_norm": 9.107300758361816, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.871937096118927, + "num_tokens": 734287887.0, + "step": 19242 + }, + { + "epoch": 2.447907390917186, + "ewc_loss": 0.07946214079856873, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210862680338323, + "grad_norm": 9.307058334350586, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8708212375640869, + "num_tokens": 734330446.0, + "step": 19243 + }, + { + "epoch": 2.4480346011957765, + "ewc_loss": 0.07831105589866638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040957535384222865, + "grad_norm": 9.115817070007324, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8715918064117432, + "num_tokens": 734368427.0, + "step": 19244 + }, + { + "epoch": 2.448161811474367, + "ewc_loss": 0.07947153598070145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004211801860947162, + "grad_norm": 9.346034049987793, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8835736513137817, + "num_tokens": 734405103.0, + "step": 19245 + }, + { + "epoch": 2.4482890217529576, + "ewc_loss": 0.07824648171663284, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040892965625971556, + "grad_norm": 9.052264213562012, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8624051809310913, + "num_tokens": 734448069.0, + "step": 19246 + }, + { + "epoch": 2.448416232031548, + "ewc_loss": 0.07957887649536133, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000422253564465791, + "grad_norm": 9.338622093200684, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8516899943351746, + "num_tokens": 734486016.0, + "step": 19247 + }, + { + "epoch": 2.4485434423101387, + "ewc_loss": 0.07806681096553802, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004095743061043322, + "grad_norm": 9.063817977905273, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8535465598106384, + "num_tokens": 734526943.0, + "step": 19248 + }, + { + "epoch": 2.448670652588729, + "ewc_loss": 0.07949619740247726, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042142681195400655, + "grad_norm": 9.331047058105469, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8766155242919922, + "num_tokens": 734566137.0, + "step": 19249 + }, + { + "epoch": 2.4487978628673197, + "ewc_loss": 0.07833285629749298, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040979337063618004, + "grad_norm": 9.03926944732666, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8646225929260254, + "num_tokens": 734602366.0, + "step": 19250 + }, + { + "epoch": 2.4489250731459102, + "ewc_loss": 0.0798134133219719, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004221575800329447, + "grad_norm": 9.311210632324219, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.873874843120575, + "num_tokens": 734644251.0, + "step": 19251 + }, + { + "epoch": 2.4490522834245008, + "ewc_loss": 0.0784759372472763, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000411224173149094, + "grad_norm": 9.086383819580078, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8625366687774658, + "num_tokens": 734682938.0, + "step": 19252 + }, + { + "epoch": 2.4491794937030913, + "ewc_loss": 0.07921536266803741, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004210598999634385, + "grad_norm": 9.303682327270508, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8658773899078369, + "num_tokens": 734722975.0, + "step": 19253 + }, + { + "epoch": 2.449306703981682, + "ewc_loss": 0.07824388146400452, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004113450413569808, + "grad_norm": 9.120789527893066, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8674901127815247, + "num_tokens": 734761361.0, + "step": 19254 + }, + { + "epoch": 2.4494339142602723, + "ewc_loss": 0.0791197270154953, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00042010354809463024, + "grad_norm": 9.265341758728027, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8628894686698914, + "num_tokens": 734800597.0, + "step": 19255 + }, + { + "epoch": 2.449561124538863, + "ewc_loss": 0.07838810980319977, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.00041278733988292515, + "grad_norm": 9.139080047607422, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8533191680908203, + "num_tokens": 734840575.0, + "step": 19256 + }, + { + "epoch": 2.4496883348174534, + "ewc_loss": 0.07934477925300598, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004199126560706645, + "grad_norm": 9.32530689239502, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8557271957397461, + "num_tokens": 734876943.0, + "step": 19257 + }, + { + "epoch": 2.449815545096044, + "ewc_loss": 0.07864128053188324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041287767817266285, + "grad_norm": 9.102583885192871, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8809391856193542, + "num_tokens": 734916436.0, + "step": 19258 + }, + { + "epoch": 2.4499427553746345, + "ewc_loss": 0.07942546904087067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042071950156241655, + "grad_norm": 9.328036308288574, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8651005625724792, + "num_tokens": 734956336.0, + "step": 19259 + }, + { + "epoch": 2.450069965653225, + "ewc_loss": 0.07840191572904587, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041048400453291833, + "grad_norm": 9.128575325012207, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8633170127868652, + "num_tokens": 734997132.0, + "step": 19260 + }, + { + "epoch": 2.450197175931815, + "ewc_loss": 0.07951629161834717, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004216277156956494, + "grad_norm": 9.345483779907227, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8532638549804688, + "num_tokens": 735036192.0, + "step": 19261 + }, + { + "epoch": 2.450324386210406, + "ewc_loss": 0.0785825103521347, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004122899263165891, + "grad_norm": 9.066309928894043, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8788413405418396, + "num_tokens": 735079368.0, + "step": 19262 + }, + { + "epoch": 2.450451596488996, + "ewc_loss": 0.0795397087931633, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004218619433231652, + "grad_norm": 9.360536575317383, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8714224100112915, + "num_tokens": 735109714.0, + "step": 19263 + }, + { + "epoch": 2.4505788067675867, + "ewc_loss": 0.07852703332901001, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041173520730808377, + "grad_norm": 9.11941909790039, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8632646799087524, + "num_tokens": 735152712.0, + "step": 19264 + }, + { + "epoch": 2.450706017046177, + "ewc_loss": 0.07951761782169342, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042164104524999857, + "grad_norm": 9.333138465881348, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8609735369682312, + "num_tokens": 735193313.0, + "step": 19265 + }, + { + "epoch": 2.4508332273247677, + "ewc_loss": 0.07856708765029907, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004121357633266598, + "grad_norm": 9.14976692199707, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8800567388534546, + "num_tokens": 735235413.0, + "step": 19266 + }, + { + "epoch": 2.4509604376033582, + "ewc_loss": 0.07939667999744415, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000420431635575369, + "grad_norm": 9.246678352355957, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8661984205245972, + "num_tokens": 735270809.0, + "step": 19267 + }, + { + "epoch": 2.4510876478819488, + "ewc_loss": 0.07874235510826111, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004138883959967643, + "grad_norm": 9.13961124420166, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8571100234985352, + "num_tokens": 735309790.0, + "step": 19268 + }, + { + "epoch": 2.4512148581605393, + "ewc_loss": 0.07936355471611023, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042010037577711046, + "grad_norm": 9.329764366149902, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8664041757583618, + "num_tokens": 735341084.0, + "step": 19269 + }, + { + "epoch": 2.45134206843913, + "ewc_loss": 0.07874055206775665, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041387035162188113, + "grad_norm": 9.113367080688477, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8810837268829346, + "num_tokens": 735376964.0, + "step": 19270 + }, + { + "epoch": 2.4514692787177204, + "ewc_loss": 0.07958610355854034, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042232588748447597, + "grad_norm": 9.358564376831055, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8756375312805176, + "num_tokens": 735413296.0, + "step": 19271 + }, + { + "epoch": 2.451596488996311, + "ewc_loss": 0.07862412929534912, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004127061110921204, + "grad_norm": 9.108687400817871, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8698099851608276, + "num_tokens": 735454992.0, + "step": 19272 + }, + { + "epoch": 2.4517236992749014, + "ewc_loss": 0.0796513706445694, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004229785699862987, + "grad_norm": 9.436286926269531, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8676588535308838, + "num_tokens": 735493315.0, + "step": 19273 + }, + { + "epoch": 2.451850909553492, + "ewc_loss": 0.07837356626987457, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004102005041204393, + "grad_norm": 9.135838508605957, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8752094507217407, + "num_tokens": 735532666.0, + "step": 19274 + }, + { + "epoch": 2.4519781198320825, + "ewc_loss": 0.07938385009765625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042030331678688526, + "grad_norm": 9.319038391113281, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.863235592842102, + "num_tokens": 735571987.0, + "step": 19275 + }, + { + "epoch": 2.452105330110673, + "ewc_loss": 0.07846572995185852, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004111221933271736, + "grad_norm": 9.107458114624023, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8624180555343628, + "num_tokens": 735609409.0, + "step": 19276 + }, + { + "epoch": 2.4522325403892635, + "ewc_loss": 0.07927414774894714, + "ewc_loss_diag": 3.719329833984375e-05, + "ewc_loss_parallel": 0.0004216476809233427, + "grad_norm": 9.35358715057373, + "learning_rate": 1e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.838280975818634, + "num_tokens": 735647299.0, + "step": 19277 + }, + { + "epoch": 2.452359750667854, + "ewc_loss": 0.07845627516508102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004110276058781892, + "grad_norm": 9.127114295959473, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8729041814804077, + "num_tokens": 735677757.0, + "step": 19278 + }, + { + "epoch": 2.4524869609464446, + "ewc_loss": 0.07943134754896164, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207783204037696, + "grad_norm": 9.212627410888672, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8706544637680054, + "num_tokens": 735723924.0, + "step": 19279 + }, + { + "epoch": 2.452614171225035, + "ewc_loss": 0.07880009710788727, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041446578688919544, + "grad_norm": 9.185970306396484, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8492916822433472, + "num_tokens": 735758105.0, + "step": 19280 + }, + { + "epoch": 2.4527413815036256, + "ewc_loss": 0.07903863489627838, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004168511659372598, + "grad_norm": 9.156061172485352, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8619817495346069, + "num_tokens": 735797603.0, + "step": 19281 + }, + { + "epoch": 2.452868591782216, + "ewc_loss": 0.07927271723747253, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041919201612472534, + "grad_norm": 9.212143898010254, + "learning_rate": 1e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8385825157165527, + "num_tokens": 735838958.0, + "step": 19282 + }, + { + "epoch": 2.4529958020608067, + "ewc_loss": 0.07881353050470352, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004146001301705837, + "grad_norm": 9.127413749694824, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8585731387138367, + "num_tokens": 735878894.0, + "step": 19283 + }, + { + "epoch": 2.453123012339397, + "ewc_loss": 0.07937540113925934, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004202188865747303, + "grad_norm": 9.193632125854492, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8955029845237732, + "num_tokens": 735913222.0, + "step": 19284 + }, + { + "epoch": 2.4532502226179878, + "ewc_loss": 0.07896198332309723, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004160846583545208, + "grad_norm": 9.117199897766113, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8718887567520142, + "num_tokens": 735951884.0, + "step": 19285 + }, + { + "epoch": 2.453377432896578, + "ewc_loss": 0.0793592780828476, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042005759314633906, + "grad_norm": 9.179845809936523, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8603836297988892, + "num_tokens": 735998955.0, + "step": 19286 + }, + { + "epoch": 2.4535046431751684, + "ewc_loss": 0.07911691069602966, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041763397166505456, + "grad_norm": 9.171467781066895, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8646358251571655, + "num_tokens": 736039631.0, + "step": 19287 + }, + { + "epoch": 2.453631853453759, + "ewc_loss": 0.07934166491031647, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004198815440759063, + "grad_norm": 9.147261619567871, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8617467284202576, + "num_tokens": 736085100.0, + "step": 19288 + }, + { + "epoch": 2.4537590637323494, + "ewc_loss": 0.07942414283752441, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207062884233892, + "grad_norm": 9.257225036621094, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8814992904663086, + "num_tokens": 736119680.0, + "step": 19289 + }, + { + "epoch": 2.45388627401094, + "ewc_loss": 0.07904484868049622, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004169133317191154, + "grad_norm": 9.173212051391602, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8762977123260498, + "num_tokens": 736159769.0, + "step": 19290 + }, + { + "epoch": 2.4540134842895305, + "ewc_loss": 0.0793718695640564, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042018358362838626, + "grad_norm": 9.221122741699219, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8729051351547241, + "num_tokens": 736199566.0, + "step": 19291 + }, + { + "epoch": 2.454140694568121, + "ewc_loss": 0.07892201095819473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000415684946347028, + "grad_norm": 9.139452934265137, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8698461651802063, + "num_tokens": 736239238.0, + "step": 19292 + }, + { + "epoch": 2.4542679048467115, + "ewc_loss": 0.07929309457540512, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004193958011455834, + "grad_norm": 9.250789642333984, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8688685894012451, + "num_tokens": 736274576.0, + "step": 19293 + }, + { + "epoch": 2.454395115125302, + "ewc_loss": 0.07887554168701172, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041522024548612535, + "grad_norm": 9.098337173461914, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8577613830566406, + "num_tokens": 736307509.0, + "step": 19294 + }, + { + "epoch": 2.4545223254038926, + "ewc_loss": 0.07945817708969116, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210465704090893, + "grad_norm": 9.267496109008789, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8627274036407471, + "num_tokens": 736348313.0, + "step": 19295 + }, + { + "epoch": 2.454649535682483, + "ewc_loss": 0.0786634087562561, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041309892549179494, + "grad_norm": 9.124402046203613, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8618753552436829, + "num_tokens": 736380716.0, + "step": 19296 + }, + { + "epoch": 2.4547767459610736, + "ewc_loss": 0.0793764516711235, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004202293639536947, + "grad_norm": 9.182518005371094, + "learning_rate": 1e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8449110984802246, + "num_tokens": 736422669.0, + "step": 19297 + }, + { + "epoch": 2.454903956239664, + "ewc_loss": 0.07877308130264282, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004141956160310656, + "grad_norm": 9.14468002319336, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8737374544143677, + "num_tokens": 736452929.0, + "step": 19298 + }, + { + "epoch": 2.4550311665182547, + "ewc_loss": 0.07927580922842026, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041922295349650085, + "grad_norm": 9.141534805297852, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8679085373878479, + "num_tokens": 736486755.0, + "step": 19299 + }, + { + "epoch": 2.4551583767968452, + "ewc_loss": 0.079023577272892, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004167006118223071, + "grad_norm": 9.147913932800293, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8734713196754456, + "num_tokens": 736521542.0, + "step": 19300 + }, + { + "epoch": 2.4552855870754358, + "ewc_loss": 0.07891277968883514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004155925998929888, + "grad_norm": 9.134270668029785, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8660809993743896, + "num_tokens": 736557312.0, + "step": 19301 + }, + { + "epoch": 2.4554127973540263, + "ewc_loss": 0.07909205555915833, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004173853958491236, + "grad_norm": 9.13966178894043, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.860931396484375, + "num_tokens": 736596469.0, + "step": 19302 + }, + { + "epoch": 2.455540007632617, + "ewc_loss": 0.07902101427316666, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004166750004515052, + "grad_norm": 9.115889549255371, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8497092723846436, + "num_tokens": 736636078.0, + "step": 19303 + }, + { + "epoch": 2.4556672179112073, + "ewc_loss": 0.07917315512895584, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041819640318863094, + "grad_norm": 9.181312561035156, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8793574571609497, + "num_tokens": 736674117.0, + "step": 19304 + }, + { + "epoch": 2.455794428189798, + "ewc_loss": 0.07882846891880035, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041474957833997905, + "grad_norm": 9.08956527709961, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8612442016601562, + "num_tokens": 736709116.0, + "step": 19305 + }, + { + "epoch": 2.4559216384683884, + "ewc_loss": 0.07908604294061661, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004173252673354, + "grad_norm": 9.200368881225586, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8577810525894165, + "num_tokens": 736750383.0, + "step": 19306 + }, + { + "epoch": 2.456048848746979, + "ewc_loss": 0.07889141142368317, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041537891956977546, + "grad_norm": 9.11398696899414, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8538001775741577, + "num_tokens": 736790836.0, + "step": 19307 + }, + { + "epoch": 2.4561760590255695, + "ewc_loss": 0.0791940987110138, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004184057761449367, + "grad_norm": 9.2323579788208, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8625097274780273, + "num_tokens": 736826085.0, + "step": 19308 + }, + { + "epoch": 2.4563032693041595, + "ewc_loss": 0.0787380039691925, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041384485666640103, + "grad_norm": 9.090155601501465, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8651070594787598, + "num_tokens": 736860599.0, + "step": 19309 + }, + { + "epoch": 2.4564304795827505, + "ewc_loss": 0.07940022647380829, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004204670840408653, + "grad_norm": 9.254743576049805, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8827763795852661, + "num_tokens": 736893286.0, + "step": 19310 + }, + { + "epoch": 2.4565576898613406, + "ewc_loss": 0.07853176444768906, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004117824719287455, + "grad_norm": 9.019172668457031, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8751728534698486, + "num_tokens": 736931597.0, + "step": 19311 + }, + { + "epoch": 2.456684900139931, + "ewc_loss": 0.07957574725151062, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042222230695188046, + "grad_norm": 9.326926231384277, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8731915950775146, + "num_tokens": 736972183.0, + "step": 19312 + }, + { + "epoch": 2.4568121104185217, + "ewc_loss": 0.07819238305091858, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040838864515535533, + "grad_norm": 9.007902145385742, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8715168237686157, + "num_tokens": 737004184.0, + "step": 19313 + }, + { + "epoch": 2.456939320697112, + "ewc_loss": 0.07969304174184799, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042339524952694774, + "grad_norm": 9.350422859191895, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.865651547908783, + "num_tokens": 737042644.0, + "step": 19314 + }, + { + "epoch": 2.4570665309757027, + "ewc_loss": 0.07820011675357819, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004084659740328789, + "grad_norm": 9.07805061340332, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8555226922035217, + "num_tokens": 737074830.0, + "step": 19315 + }, + { + "epoch": 2.4571937412542932, + "ewc_loss": 0.07922843098640442, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041874911403283477, + "grad_norm": 9.195570945739746, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8732367157936096, + "num_tokens": 737107756.0, + "step": 19316 + }, + { + "epoch": 2.4573209515328838, + "ewc_loss": 0.07862211763858795, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041268597124144435, + "grad_norm": 9.157434463500977, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.874282956123352, + "num_tokens": 737143457.0, + "step": 19317 + }, + { + "epoch": 2.4574481618114743, + "ewc_loss": 0.07896529138088226, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000416117807617411, + "grad_norm": 9.181310653686523, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8612111806869507, + "num_tokens": 737177005.0, + "step": 19318 + }, + { + "epoch": 2.457575372090065, + "ewc_loss": 0.07870359718799591, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041350084939040244, + "grad_norm": 9.097285270690918, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8560706973075867, + "num_tokens": 737218265.0, + "step": 19319 + }, + { + "epoch": 2.4577025823686554, + "ewc_loss": 0.07887149602174759, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041517982026562095, + "grad_norm": 9.218016624450684, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8549203872680664, + "num_tokens": 737259495.0, + "step": 19320 + }, + { + "epoch": 2.457829792647246, + "ewc_loss": 0.07861283421516418, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004125931591261178, + "grad_norm": 9.077573776245117, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8583052754402161, + "num_tokens": 737300073.0, + "step": 19321 + }, + { + "epoch": 2.4579570029258364, + "ewc_loss": 0.0790099948644638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041656478424556553, + "grad_norm": 9.21341323852539, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8659599423408508, + "num_tokens": 737334716.0, + "step": 19322 + }, + { + "epoch": 2.458084213204427, + "ewc_loss": 0.07857635617256165, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004122283717151731, + "grad_norm": 9.12185001373291, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8719292879104614, + "num_tokens": 737377086.0, + "step": 19323 + }, + { + "epoch": 2.4582114234830175, + "ewc_loss": 0.07896135747432709, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041607837192714214, + "grad_norm": 9.193700790405273, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8552594184875488, + "num_tokens": 737418259.0, + "step": 19324 + }, + { + "epoch": 2.458338633761608, + "ewc_loss": 0.07864201068878174, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041288495413027704, + "grad_norm": 9.186237335205078, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8541618585586548, + "num_tokens": 737453150.0, + "step": 19325 + }, + { + "epoch": 2.4584658440401985, + "ewc_loss": 0.0787983387708664, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004144482663832605, + "grad_norm": 9.20447063446045, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8798525929450989, + "num_tokens": 737495552.0, + "step": 19326 + }, + { + "epoch": 2.458593054318789, + "ewc_loss": 0.07858818769454956, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004123467078898102, + "grad_norm": 9.077066421508789, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8700946569442749, + "num_tokens": 737537518.0, + "step": 19327 + }, + { + "epoch": 2.4587202645973796, + "ewc_loss": 0.07899105548858643, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000416375434724614, + "grad_norm": 9.22510814666748, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8616299033164978, + "num_tokens": 737577472.0, + "step": 19328 + }, + { + "epoch": 2.45884747487597, + "ewc_loss": 0.07857409119606018, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004122057871427387, + "grad_norm": 9.208797454833984, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.857589840888977, + "num_tokens": 737617250.0, + "step": 19329 + }, + { + "epoch": 2.4589746851545606, + "ewc_loss": 0.07904812693595886, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041450472781434655, + "grad_norm": 9.798611640930176, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8687660098075867, + "num_tokens": 737655544.0, + "step": 19330 + }, + { + "epoch": 2.459101895433151, + "ewc_loss": 0.07744736969470978, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040093858842737973, + "grad_norm": 8.944039344787598, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8693675994873047, + "num_tokens": 737688644.0, + "step": 19331 + }, + { + "epoch": 2.4592291057117417, + "ewc_loss": 0.07989723235368729, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004254371742717922, + "grad_norm": 9.411758422851562, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8689795136451721, + "num_tokens": 737730264.0, + "step": 19332 + }, + { + "epoch": 2.459356315990332, + "ewc_loss": 0.07750269770622253, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040149185224436224, + "grad_norm": 8.887943267822266, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8608033061027527, + "num_tokens": 737763578.0, + "step": 19333 + }, + { + "epoch": 2.4594835262689223, + "ewc_loss": 0.08017495274543762, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004282143490854651, + "grad_norm": 9.478777885437012, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8682441711425781, + "num_tokens": 737804062.0, + "step": 19334 + }, + { + "epoch": 2.4596107365475133, + "ewc_loss": 0.0775834247469902, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040229910518974066, + "grad_norm": 8.887173652648926, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8796738386154175, + "num_tokens": 737845608.0, + "step": 19335 + }, + { + "epoch": 2.4597379468261034, + "ewc_loss": 0.08031874895095825, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042965231114067137, + "grad_norm": 9.465766906738281, + "learning_rate": 1e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8476238250732422, + "num_tokens": 737881366.0, + "step": 19336 + }, + { + "epoch": 2.459865157104694, + "ewc_loss": 0.07806025445461273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040706738946028054, + "grad_norm": 9.040426254272461, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8579106330871582, + "num_tokens": 737922595.0, + "step": 19337 + }, + { + "epoch": 2.4599923673832844, + "ewc_loss": 0.08000348508358002, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042649972601793706, + "grad_norm": 9.429144859313965, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8718100786209106, + "num_tokens": 737956368.0, + "step": 19338 + }, + { + "epoch": 2.460119577661875, + "ewc_loss": 0.07829945534467697, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000409459404181689, + "grad_norm": 8.990806579589844, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8714333176612854, + "num_tokens": 738000167.0, + "step": 19339 + }, + { + "epoch": 2.4602467879404655, + "ewc_loss": 0.07997500151395798, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042621485772542655, + "grad_norm": 9.414216041564941, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8590211868286133, + "num_tokens": 738040850.0, + "step": 19340 + }, + { + "epoch": 2.460373998219056, + "ewc_loss": 0.07849916815757751, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000411456567235291, + "grad_norm": 9.025065422058105, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8632808327674866, + "num_tokens": 738080370.0, + "step": 19341 + }, + { + "epoch": 2.4605012084976465, + "ewc_loss": 0.08007624000310898, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042722723446786404, + "grad_norm": 9.346257209777832, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8700556755065918, + "num_tokens": 738116751.0, + "step": 19342 + }, + { + "epoch": 2.460628418776237, + "ewc_loss": 0.07859469950199127, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004124118131585419, + "grad_norm": 9.138800621032715, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8842229843139648, + "num_tokens": 738154597.0, + "step": 19343 + }, + { + "epoch": 2.4607556290548276, + "ewc_loss": 0.07948850840330124, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042134991963393986, + "grad_norm": 9.322636604309082, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8727129697799683, + "num_tokens": 738186484.0, + "step": 19344 + }, + { + "epoch": 2.460882839333418, + "ewc_loss": 0.07877446711063385, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041420955676585436, + "grad_norm": 9.05772876739502, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8704143762588501, + "num_tokens": 738217722.0, + "step": 19345 + }, + { + "epoch": 2.4610100496120086, + "ewc_loss": 0.07947241514921188, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004211890045553446, + "grad_norm": 9.293052673339844, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.878905177116394, + "num_tokens": 738255304.0, + "step": 19346 + }, + { + "epoch": 2.461137259890599, + "ewc_loss": 0.07853788137435913, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004118436190765351, + "grad_norm": 9.092665672302246, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8617267608642578, + "num_tokens": 738292162.0, + "step": 19347 + }, + { + "epoch": 2.4612644701691897, + "ewc_loss": 0.07940379530191422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004205027944408357, + "grad_norm": 9.227387428283691, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8787790536880493, + "num_tokens": 738332851.0, + "step": 19348 + }, + { + "epoch": 2.4613916804477802, + "ewc_loss": 0.0786154568195343, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004126194107811898, + "grad_norm": 9.060514450073242, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8759938478469849, + "num_tokens": 738375120.0, + "step": 19349 + }, + { + "epoch": 2.4615188907263708, + "ewc_loss": 0.07927504181861877, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041921527008526027, + "grad_norm": 9.267046928405762, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8570944666862488, + "num_tokens": 738413857.0, + "step": 19350 + }, + { + "epoch": 2.4616461010049613, + "ewc_loss": 0.07854312658309937, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004118960932828486, + "grad_norm": 9.036675453186035, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8596484661102295, + "num_tokens": 738450097.0, + "step": 19351 + }, + { + "epoch": 2.461773311283552, + "ewc_loss": 0.07963868975639343, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042285173549316823, + "grad_norm": 9.336669921875, + "learning_rate": 1e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8482968807220459, + "num_tokens": 738483784.0, + "step": 19352 + }, + { + "epoch": 2.4619005215621423, + "ewc_loss": 0.07842430472373962, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041070787119679153, + "grad_norm": 9.028504371643066, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8462227582931519, + "num_tokens": 738526820.0, + "step": 19353 + }, + { + "epoch": 2.462027731840733, + "ewc_loss": 0.07956751435995102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004221400013193488, + "grad_norm": 9.241868019104004, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8514227867126465, + "num_tokens": 738569049.0, + "step": 19354 + }, + { + "epoch": 2.4621549421193234, + "ewc_loss": 0.07850507646799088, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004115156189072877, + "grad_norm": 9.083501815795898, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8631181120872498, + "num_tokens": 738608320.0, + "step": 19355 + }, + { + "epoch": 2.462282152397914, + "ewc_loss": 0.07932017743587494, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004196666122879833, + "grad_norm": 9.199339866638184, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8798790574073792, + "num_tokens": 738645566.0, + "step": 19356 + }, + { + "epoch": 2.4624093626765045, + "ewc_loss": 0.0785277932882309, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000411742803407833, + "grad_norm": 9.10453987121582, + "learning_rate": 1e-06, + "loss": 0.5476, + "mean_token_accuracy": 0.8422633409500122, + "num_tokens": 738686184.0, + "step": 19357 + }, + { + "epoch": 2.462536572955095, + "ewc_loss": 0.07911300659179688, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004175949434284121, + "grad_norm": 9.232510566711426, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8505988121032715, + "num_tokens": 738728820.0, + "step": 19358 + }, + { + "epoch": 2.462663783233685, + "ewc_loss": 0.07849577069282532, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041142257396131754, + "grad_norm": 9.151871681213379, + "learning_rate": 1e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.8360351920127869, + "num_tokens": 738760180.0, + "step": 19359 + }, + { + "epoch": 2.462790993512276, + "ewc_loss": 0.07894086837768555, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041587348096072674, + "grad_norm": 9.182376861572266, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8727284669876099, + "num_tokens": 738795501.0, + "step": 19360 + }, + { + "epoch": 2.462918203790866, + "ewc_loss": 0.07880683243274689, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004145332204643637, + "grad_norm": 9.137897491455078, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8559138774871826, + "num_tokens": 738831027.0, + "step": 19361 + }, + { + "epoch": 2.4630454140694567, + "ewc_loss": 0.07879480719566345, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000414412934333086, + "grad_norm": 9.17094612121582, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8474158644676208, + "num_tokens": 738870383.0, + "step": 19362 + }, + { + "epoch": 2.463172624348047, + "ewc_loss": 0.07880143821239471, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004144792619626969, + "grad_norm": 9.128606796264648, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8703640103340149, + "num_tokens": 738908231.0, + "step": 19363 + }, + { + "epoch": 2.4632998346266377, + "ewc_loss": 0.07900657504796982, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041653058724477887, + "grad_norm": 9.167558670043945, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8561668395996094, + "num_tokens": 738950927.0, + "step": 19364 + }, + { + "epoch": 2.4634270449052282, + "ewc_loss": 0.07870398461818695, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041106328717432916, + "grad_norm": 14.545271873474121, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8772879838943481, + "num_tokens": 738989857.0, + "step": 19365 + }, + { + "epoch": 2.4635542551838188, + "ewc_loss": 0.08545979857444763, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000478621426736936, + "grad_norm": 9.706110000610352, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8787654638290405, + "num_tokens": 739021281.0, + "step": 19366 + }, + { + "epoch": 2.4636814654624093, + "ewc_loss": 0.08138536661863327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004403185157570988, + "grad_norm": 9.59170913696289, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8725444078445435, + "num_tokens": 739063475.0, + "step": 19367 + }, + { + "epoch": 2.463808675741, + "ewc_loss": 0.07903456687927246, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004168104787822813, + "grad_norm": 9.177408218383789, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8613429069519043, + "num_tokens": 739107083.0, + "step": 19368 + }, + { + "epoch": 2.4639358860195903, + "ewc_loss": 0.08206811547279358, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004471459542401135, + "grad_norm": 9.617876052856445, + "learning_rate": 1e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.8426957726478577, + "num_tokens": 739152002.0, + "step": 19369 + }, + { + "epoch": 2.464063096298181, + "ewc_loss": 0.07895546406507492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004160194657742977, + "grad_norm": 9.250844955444336, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8620157241821289, + "num_tokens": 739191781.0, + "step": 19370 + }, + { + "epoch": 2.4641903065767714, + "ewc_loss": 0.08068901300430298, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043335495865903795, + "grad_norm": 9.440402030944824, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8873971104621887, + "num_tokens": 739230374.0, + "step": 19371 + }, + { + "epoch": 2.464317516855362, + "ewc_loss": 0.07910296320915222, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000417494447901845, + "grad_norm": 9.203784942626953, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8603754043579102, + "num_tokens": 739267310.0, + "step": 19372 + }, + { + "epoch": 2.4644447271339525, + "ewc_loss": 0.08007340878248215, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004271989455446601, + "grad_norm": 9.385276794433594, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.868391215801239, + "num_tokens": 739305074.0, + "step": 19373 + }, + { + "epoch": 2.464571937412543, + "ewc_loss": 0.07897023111581802, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004161671386100352, + "grad_norm": 9.16748332977295, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.856887698173523, + "num_tokens": 739343028.0, + "step": 19374 + }, + { + "epoch": 2.4646991476911335, + "ewc_loss": 0.07983049750328064, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004247698525432497, + "grad_norm": 9.412313461303711, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8646485805511475, + "num_tokens": 739384364.0, + "step": 19375 + }, + { + "epoch": 2.464826357969724, + "ewc_loss": 0.08018215000629425, + "ewc_loss_diag": 3.886222839355469e-05, + "ewc_loss_parallel": 0.00041363792843185365, + "grad_norm": 11.384085655212402, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8803062438964844, + "num_tokens": 739423564.0, + "step": 19376 + }, + { + "epoch": 2.4649535682483146, + "ewc_loss": 0.07830055803060532, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040947040542960167, + "grad_norm": 8.899852752685547, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8751317262649536, + "num_tokens": 739457609.0, + "step": 19377 + }, + { + "epoch": 2.465080778526905, + "ewc_loss": 0.08265014737844467, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045296631287783384, + "grad_norm": 9.873651504516602, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8659605979919434, + "num_tokens": 739497204.0, + "step": 19378 + }, + { + "epoch": 2.4652079888054956, + "ewc_loss": 0.07754917442798615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040195658220909536, + "grad_norm": 8.807699203491211, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8599771857261658, + "num_tokens": 739543688.0, + "step": 19379 + }, + { + "epoch": 2.465335199084086, + "ewc_loss": 0.08330106735229492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004594754718709737, + "grad_norm": 9.942678451538086, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8761346340179443, + "num_tokens": 739578221.0, + "step": 19380 + }, + { + "epoch": 2.4654624093626767, + "ewc_loss": 0.0784769356250763, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004112341848667711, + "grad_norm": 8.959424018859863, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8701774477958679, + "num_tokens": 739615892.0, + "step": 19381 + }, + { + "epoch": 2.4655896196412668, + "ewc_loss": 0.08274059742689133, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004538708017207682, + "grad_norm": 9.760342597961426, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8508107662200928, + "num_tokens": 739657697.0, + "step": 19382 + }, + { + "epoch": 2.4657168299198577, + "ewc_loss": 0.07925237715244293, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004189886385574937, + "grad_norm": 9.246333122253418, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8748142719268799, + "num_tokens": 739694173.0, + "step": 19383 + }, + { + "epoch": 2.465844040198448, + "ewc_loss": 0.08117188513278961, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004381837497930974, + "grad_norm": 9.532532691955566, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8624362945556641, + "num_tokens": 739734111.0, + "step": 19384 + }, + { + "epoch": 2.4659712504770384, + "ewc_loss": 0.07941091805696487, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042057401151396334, + "grad_norm": 9.243877410888672, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8664556741714478, + "num_tokens": 739771409.0, + "step": 19385 + }, + { + "epoch": 2.466098460755629, + "ewc_loss": 0.08038613200187683, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004278847191017121, + "grad_norm": 9.410365104675293, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8511312007904053, + "num_tokens": 739807185.0, + "step": 19386 + }, + { + "epoch": 2.4662256710342194, + "ewc_loss": 0.0791424959897995, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004178898234385997, + "grad_norm": 9.173623085021973, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.875766396522522, + "num_tokens": 739837750.0, + "step": 19387 + }, + { + "epoch": 2.46635288131281, + "ewc_loss": 0.08011884987354279, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004252118815202266, + "grad_norm": 14.695813179016113, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8646986484527588, + "num_tokens": 739879516.0, + "step": 19388 + }, + { + "epoch": 2.4664800915914005, + "ewc_loss": 0.08781659603118896, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005046307924203575, + "grad_norm": 10.141387939453125, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8719453811645508, + "num_tokens": 739918688.0, + "step": 19389 + }, + { + "epoch": 2.466607301869991, + "ewc_loss": 0.08047233521938324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043118814937770367, + "grad_norm": 9.347091674804688, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8693451285362244, + "num_tokens": 739958978.0, + "step": 19390 + }, + { + "epoch": 2.4667345121485815, + "ewc_loss": 0.08065886795520782, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000433053559390828, + "grad_norm": 9.5374174118042, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8690406084060669, + "num_tokens": 740002428.0, + "step": 19391 + }, + { + "epoch": 2.466861722427172, + "ewc_loss": 0.08029382675886154, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042940309504047036, + "grad_norm": 9.227100372314453, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8865138292312622, + "num_tokens": 740045856.0, + "step": 19392 + }, + { + "epoch": 2.4669889327057626, + "ewc_loss": 0.08105412125587463, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043700606329366565, + "grad_norm": 9.544476509094238, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8631871342658997, + "num_tokens": 740084512.0, + "step": 19393 + }, + { + "epoch": 2.467116142984353, + "ewc_loss": 0.07906467467546463, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041711158701218665, + "grad_norm": 9.063155174255371, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8768806457519531, + "num_tokens": 740129624.0, + "step": 19394 + }, + { + "epoch": 2.4672433532629436, + "ewc_loss": 0.08136482536792755, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004401130718179047, + "grad_norm": 9.625831604003906, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8486475944519043, + "num_tokens": 740168368.0, + "step": 19395 + }, + { + "epoch": 2.467370563541534, + "ewc_loss": 0.07884907722473145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004149555752519518, + "grad_norm": 9.115585327148438, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8584380745887756, + "num_tokens": 740206670.0, + "step": 19396 + }, + { + "epoch": 2.4674977738201247, + "ewc_loss": 0.08111713081598282, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004351947281975299, + "grad_norm": 9.520346641540527, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8425893187522888, + "num_tokens": 740246422.0, + "step": 19397 + }, + { + "epoch": 2.4676249840987152, + "ewc_loss": 0.07894477248191833, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004159125965088606, + "grad_norm": 9.116070747375488, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8632433414459229, + "num_tokens": 740284450.0, + "step": 19398 + }, + { + "epoch": 2.4677521943773058, + "ewc_loss": 0.0805155336856842, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043162022484466434, + "grad_norm": 9.40892219543457, + "learning_rate": 1e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.8421276211738586, + "num_tokens": 740319062.0, + "step": 19399 + }, + { + "epoch": 2.4678794046558963, + "ewc_loss": 0.0789332389831543, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041579725802876055, + "grad_norm": 9.168621063232422, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8664214611053467, + "num_tokens": 740353996.0, + "step": 19400 + }, + { + "epoch": 2.468006614934487, + "ewc_loss": 0.08002258837223053, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004266906762495637, + "grad_norm": 9.32304573059082, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8639494180679321, + "num_tokens": 740394694.0, + "step": 19401 + }, + { + "epoch": 2.4681338252130773, + "ewc_loss": 0.07918751239776611, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000418339972384274, + "grad_norm": 9.160219192504883, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8863131999969482, + "num_tokens": 740436567.0, + "step": 19402 + }, + { + "epoch": 2.468261035491668, + "ewc_loss": 0.07962748408317566, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004227396857459098, + "grad_norm": 9.361416816711426, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8557385206222534, + "num_tokens": 740476458.0, + "step": 19403 + }, + { + "epoch": 2.4683882457702584, + "ewc_loss": 0.07900156080722809, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041648049955256283, + "grad_norm": 9.084016799926758, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8748939633369446, + "num_tokens": 740514703.0, + "step": 19404 + }, + { + "epoch": 2.468515456048849, + "ewc_loss": 0.07985232770442963, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004249880730640143, + "grad_norm": 9.342873573303223, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8760548830032349, + "num_tokens": 740552328.0, + "step": 19405 + }, + { + "epoch": 2.4686426663274394, + "ewc_loss": 0.07887306809425354, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041519547812640667, + "grad_norm": 9.12711238861084, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8481407165527344, + "num_tokens": 740593141.0, + "step": 19406 + }, + { + "epoch": 2.4687698766060295, + "ewc_loss": 0.07988978922367096, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004253626975696534, + "grad_norm": 9.243181228637695, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8747079968452454, + "num_tokens": 740634097.0, + "step": 19407 + }, + { + "epoch": 2.4688970868846205, + "ewc_loss": 0.0790460854768753, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004169257008470595, + "grad_norm": 9.164451599121094, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.871976375579834, + "num_tokens": 740672917.0, + "step": 19408 + }, + { + "epoch": 2.4690242971632106, + "ewc_loss": 0.07950779050588608, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004215427325107157, + "grad_norm": 9.263986587524414, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8596044778823853, + "num_tokens": 740709351.0, + "step": 19409 + }, + { + "epoch": 2.469151507441801, + "ewc_loss": 0.07910284399986267, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174932837486267, + "grad_norm": 9.1826810836792, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8561927676200867, + "num_tokens": 740744049.0, + "step": 19410 + }, + { + "epoch": 2.4692787177203916, + "ewc_loss": 0.07955287396907806, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000421993579948321, + "grad_norm": 9.251605033874512, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.867608904838562, + "num_tokens": 740784765.0, + "step": 19411 + }, + { + "epoch": 2.469405927998982, + "ewc_loss": 0.07909544557332993, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174193018116057, + "grad_norm": 9.173637390136719, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8589348793029785, + "num_tokens": 740815859.0, + "step": 19412 + }, + { + "epoch": 2.4695331382775727, + "ewc_loss": 0.0795263797044754, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004217285895720124, + "grad_norm": 9.279367446899414, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8837010860443115, + "num_tokens": 740854300.0, + "step": 19413 + }, + { + "epoch": 2.4696603485561632, + "ewc_loss": 0.07906495034694672, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004171142936684191, + "grad_norm": 9.218633651733398, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8495454788208008, + "num_tokens": 740891836.0, + "step": 19414 + }, + { + "epoch": 2.4697875588347538, + "ewc_loss": 0.07929421216249466, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004194069770164788, + "grad_norm": 9.151233673095703, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.873967707157135, + "num_tokens": 740927434.0, + "step": 19415 + }, + { + "epoch": 2.4699147691133443, + "ewc_loss": 0.07919216901063919, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004183865385130048, + "grad_norm": 9.158327102661133, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8684854507446289, + "num_tokens": 740973707.0, + "step": 19416 + }, + { + "epoch": 2.470041979391935, + "ewc_loss": 0.07908269762992859, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041729179793037474, + "grad_norm": 9.191314697265625, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.871513843536377, + "num_tokens": 741015953.0, + "step": 19417 + }, + { + "epoch": 2.4701691896705253, + "ewc_loss": 0.07927880436182022, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041925290133804083, + "grad_norm": 9.19151782989502, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8728703856468201, + "num_tokens": 741049634.0, + "step": 19418 + }, + { + "epoch": 2.470296399949116, + "ewc_loss": 0.07911200076341629, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004175848443992436, + "grad_norm": 9.167122840881348, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8742367029190063, + "num_tokens": 741084267.0, + "step": 19419 + }, + { + "epoch": 2.4704236102277064, + "ewc_loss": 0.07921339571475983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041859885095618665, + "grad_norm": 9.172554969787598, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8730508089065552, + "num_tokens": 741122237.0, + "step": 19420 + }, + { + "epoch": 2.470550820506297, + "ewc_loss": 0.07927197217941284, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041918453644029796, + "grad_norm": 9.187626838684082, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8522780537605286, + "num_tokens": 741163593.0, + "step": 19421 + }, + { + "epoch": 2.4706780307848875, + "ewc_loss": 0.07916752994060516, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041814017458818853, + "grad_norm": 9.168549537658691, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8553300499916077, + "num_tokens": 741202328.0, + "step": 19422 + }, + { + "epoch": 2.470805241063478, + "ewc_loss": 0.0796150416135788, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004201738629490137, + "grad_norm": 9.221009254455566, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8839155435562134, + "num_tokens": 741240095.0, + "step": 19423 + }, + { + "epoch": 2.4709324513420685, + "ewc_loss": 0.0791412740945816, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041787754162214696, + "grad_norm": 9.161227226257324, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8670046925544739, + "num_tokens": 741278509.0, + "step": 19424 + }, + { + "epoch": 2.471059661620659, + "ewc_loss": 0.07927517592906952, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041921655065380037, + "grad_norm": 9.21274471282959, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8596047163009644, + "num_tokens": 741321446.0, + "step": 19425 + }, + { + "epoch": 2.4711868718992496, + "ewc_loss": 0.07916411757469177, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041810606489889324, + "grad_norm": 9.257308959960938, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8620758652687073, + "num_tokens": 741360469.0, + "step": 19426 + }, + { + "epoch": 2.47131408217784, + "ewc_loss": 0.07924427837133408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041890761349350214, + "grad_norm": 9.213529586791992, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8539804220199585, + "num_tokens": 741393833.0, + "step": 19427 + }, + { + "epoch": 2.4714412924564306, + "ewc_loss": 0.0790572315454483, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004170371685177088, + "grad_norm": 9.104107856750488, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8684796094894409, + "num_tokens": 741426956.0, + "step": 19428 + }, + { + "epoch": 2.471568502735021, + "ewc_loss": 0.0794219896197319, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004206847515888512, + "grad_norm": 9.247800827026367, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8654102087020874, + "num_tokens": 741464820.0, + "step": 19429 + }, + { + "epoch": 2.4716957130136117, + "ewc_loss": 0.07892805337905884, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041574539500288665, + "grad_norm": 9.129020690917969, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8725001811981201, + "num_tokens": 741499519.0, + "step": 19430 + }, + { + "epoch": 2.471822923292202, + "ewc_loss": 0.07936443388462067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201092233415693, + "grad_norm": 9.248671531677246, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8713135123252869, + "num_tokens": 741533443.0, + "step": 19431 + }, + { + "epoch": 2.4719501335707923, + "ewc_loss": 0.07897816598415375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004162465047556907, + "grad_norm": 9.133174896240234, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8629708290100098, + "num_tokens": 741576849.0, + "step": 19432 + }, + { + "epoch": 2.4720773438493833, + "ewc_loss": 0.07922343909740448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041869920096360147, + "grad_norm": 9.220048904418945, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8694717884063721, + "num_tokens": 741614653.0, + "step": 19433 + }, + { + "epoch": 2.4722045541279734, + "ewc_loss": 0.07917001843452454, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041572359623387456, + "grad_norm": 9.110427856445312, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8603736162185669, + "num_tokens": 741649354.0, + "step": 19434 + }, + { + "epoch": 2.472331764406564, + "ewc_loss": 0.07945813238620758, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210461920592934, + "grad_norm": 9.198870658874512, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8596444129943848, + "num_tokens": 741692257.0, + "step": 19435 + }, + { + "epoch": 2.4724589746851544, + "ewc_loss": 0.07900362461805344, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004165011050645262, + "grad_norm": 9.127853393554688, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8621853590011597, + "num_tokens": 741729739.0, + "step": 19436 + }, + { + "epoch": 2.472586184963745, + "ewc_loss": 0.07942450791597366, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207099264021963, + "grad_norm": 9.193800926208496, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8586493134498596, + "num_tokens": 741763409.0, + "step": 19437 + }, + { + "epoch": 2.4727133952423355, + "ewc_loss": 0.07922537624835968, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041627715108916163, + "grad_norm": 9.163005828857422, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8616300225257874, + "num_tokens": 741798746.0, + "step": 19438 + }, + { + "epoch": 2.472840605520926, + "ewc_loss": 0.07923392951488495, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041880409116856754, + "grad_norm": 9.16345500946045, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8743895888328552, + "num_tokens": 741840488.0, + "step": 19439 + }, + { + "epoch": 2.4729678157995165, + "ewc_loss": 0.07915566116571426, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004180214600637555, + "grad_norm": 9.120146751403809, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8833884000778198, + "num_tokens": 741880839.0, + "step": 19440 + }, + { + "epoch": 2.473095026078107, + "ewc_loss": 0.079170361161232, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004181684635113925, + "grad_norm": 9.191765785217285, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8594030737876892, + "num_tokens": 741919317.0, + "step": 19441 + }, + { + "epoch": 2.4732222363566976, + "ewc_loss": 0.07891878485679626, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041565269930288196, + "grad_norm": 9.144301414489746, + "learning_rate": 1e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8443945646286011, + "num_tokens": 741955782.0, + "step": 19442 + }, + { + "epoch": 2.473349446635288, + "ewc_loss": 0.07944092154502869, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041843263898044825, + "grad_norm": 9.202813148498535, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8593845963478088, + "num_tokens": 741990410.0, + "step": 19443 + }, + { + "epoch": 2.4734766569138786, + "ewc_loss": 0.07910826057195663, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004151060420554131, + "grad_norm": 9.119355201721191, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8590110540390015, + "num_tokens": 742031536.0, + "step": 19444 + }, + { + "epoch": 2.473603867192469, + "ewc_loss": 0.07950875163078308, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004191109328530729, + "grad_norm": 9.193109512329102, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8531067371368408, + "num_tokens": 742067222.0, + "step": 19445 + }, + { + "epoch": 2.4737310774710597, + "ewc_loss": 0.07912084460258484, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004152318579144776, + "grad_norm": 9.027999877929688, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8770721554756165, + "num_tokens": 742114740.0, + "step": 19446 + }, + { + "epoch": 2.47385828774965, + "ewc_loss": 0.07940329611301422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004204977594781667, + "grad_norm": 9.235739707946777, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8717498183250427, + "num_tokens": 742149344.0, + "step": 19447 + }, + { + "epoch": 2.4739854980282407, + "ewc_loss": 0.0786997377872467, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004134622577112168, + "grad_norm": 9.126097679138184, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.866336464881897, + "num_tokens": 742191089.0, + "step": 19448 + }, + { + "epoch": 2.4741127083068313, + "ewc_loss": 0.0793132334947586, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000419597199652344, + "grad_norm": 9.221603393554688, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8658415675163269, + "num_tokens": 742224944.0, + "step": 19449 + }, + { + "epoch": 2.474239918585422, + "ewc_loss": 0.07916027307510376, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041562618571333587, + "grad_norm": 9.10168170928955, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8550767302513123, + "num_tokens": 742263674.0, + "step": 19450 + }, + { + "epoch": 2.4743671288640123, + "ewc_loss": 0.07924290001392365, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041889381827786565, + "grad_norm": 9.231558799743652, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8612553477287292, + "num_tokens": 742300424.0, + "step": 19451 + }, + { + "epoch": 2.474494339142603, + "ewc_loss": 0.07912413030862808, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004152647452428937, + "grad_norm": 9.094292640686035, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8752813935279846, + "num_tokens": 742333854.0, + "step": 19452 + }, + { + "epoch": 2.4746215494211934, + "ewc_loss": 0.07950814068317413, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004215461958665401, + "grad_norm": 9.206133842468262, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8733073472976685, + "num_tokens": 742377126.0, + "step": 19453 + }, + { + "epoch": 2.474748759699784, + "ewc_loss": 0.07907843589782715, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041480775689706206, + "grad_norm": 9.089102745056152, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8627546429634094, + "num_tokens": 742421284.0, + "step": 19454 + }, + { + "epoch": 2.4748759699783744, + "ewc_loss": 0.07945343852043152, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042099927668459713, + "grad_norm": 9.24344253540039, + "learning_rate": 1e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8472352623939514, + "num_tokens": 742459124.0, + "step": 19455 + }, + { + "epoch": 2.475003180256965, + "ewc_loss": 0.07894851267337799, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041350856190547347, + "grad_norm": 9.08911418914795, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8578615784645081, + "num_tokens": 742493137.0, + "step": 19456 + }, + { + "epoch": 2.475130390535555, + "ewc_loss": 0.07976612448692322, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004216846718918532, + "grad_norm": 9.297831535339355, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8665914535522461, + "num_tokens": 742529805.0, + "step": 19457 + }, + { + "epoch": 2.475257600814146, + "ewc_loss": 0.07896824926137924, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041370594408363104, + "grad_norm": 9.113729476928711, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8780860304832458, + "num_tokens": 742560859.0, + "step": 19458 + }, + { + "epoch": 2.475384811092736, + "ewc_loss": 0.07963362336158752, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042280106572434306, + "grad_norm": 9.252435684204102, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.87751305103302, + "num_tokens": 742601465.0, + "step": 19459 + }, + { + "epoch": 2.4755120213713266, + "ewc_loss": 0.07867582142353058, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041322308243252337, + "grad_norm": 9.087468147277832, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8587971329689026, + "num_tokens": 742639242.0, + "step": 19460 + }, + { + "epoch": 2.475639231649917, + "ewc_loss": 0.07982692122459412, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004222926509100944, + "grad_norm": 9.339334487915039, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8644404411315918, + "num_tokens": 742676574.0, + "step": 19461 + }, + { + "epoch": 2.4757664419285077, + "ewc_loss": 0.0788193941116333, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041221739957109094, + "grad_norm": 9.138554573059082, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8559341430664062, + "num_tokens": 742713326.0, + "step": 19462 + }, + { + "epoch": 2.4758936522070982, + "ewc_loss": 0.07969610393047333, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004209844919387251, + "grad_norm": 9.282256126403809, + "learning_rate": 1e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8432849645614624, + "num_tokens": 742756883.0, + "step": 19463 + }, + { + "epoch": 2.4760208624856888, + "ewc_loss": 0.07893174886703491, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041334095294587314, + "grad_norm": 9.193704605102539, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8603589534759521, + "num_tokens": 742794301.0, + "step": 19464 + }, + { + "epoch": 2.4761480727642793, + "ewc_loss": 0.07926632463932037, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041668672929517925, + "grad_norm": 9.161982536315918, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8675894737243652, + "num_tokens": 742832966.0, + "step": 19465 + }, + { + "epoch": 2.47627528304287, + "ewc_loss": 0.07923757284879684, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004163991834502667, + "grad_norm": 9.179655075073242, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8617377281188965, + "num_tokens": 742872926.0, + "step": 19466 + }, + { + "epoch": 2.4764024933214603, + "ewc_loss": 0.07909698784351349, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041499329381622374, + "grad_norm": 9.232840538024902, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8707153797149658, + "num_tokens": 742910838.0, + "step": 19467 + }, + { + "epoch": 2.476529703600051, + "ewc_loss": 0.0788208469748497, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004146733263041824, + "grad_norm": 9.207062721252441, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8722934722900391, + "num_tokens": 742950098.0, + "step": 19468 + }, + { + "epoch": 2.4766569138786414, + "ewc_loss": 0.0789366066455841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000415830931160599, + "grad_norm": 9.168468475341797, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8754966259002686, + "num_tokens": 742983872.0, + "step": 19469 + }, + { + "epoch": 2.476784124157232, + "ewc_loss": 0.07916824519634247, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041570590110495687, + "grad_norm": 9.284189224243164, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8533416986465454, + "num_tokens": 743029816.0, + "step": 19470 + }, + { + "epoch": 2.4769113344358225, + "ewc_loss": 0.07875321805477142, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004115556366741657, + "grad_norm": 9.182923316955566, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8561972975730896, + "num_tokens": 743068565.0, + "step": 19471 + }, + { + "epoch": 2.477038544714413, + "ewc_loss": 0.07917602360248566, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041578366653993726, + "grad_norm": 9.310076713562012, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8670450448989868, + "num_tokens": 743112137.0, + "step": 19472 + }, + { + "epoch": 2.4771657549930035, + "ewc_loss": 0.07851238548755646, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00040914732380770147, + "grad_norm": 9.096723556518555, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8793072700500488, + "num_tokens": 743145067.0, + "step": 19473 + }, + { + "epoch": 2.477292965271594, + "ewc_loss": 0.07934801280498505, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004175035282969475, + "grad_norm": 9.359993934631348, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8714334964752197, + "num_tokens": 743177017.0, + "step": 19474 + }, + { + "epoch": 2.4774201755501846, + "ewc_loss": 0.07798537611961365, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004063186061102897, + "grad_norm": 8.984350204467773, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8656038045883179, + "num_tokens": 743218121.0, + "step": 19475 + }, + { + "epoch": 2.477547385828775, + "ewc_loss": 0.07985364645719528, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042255991138517857, + "grad_norm": 9.580570220947266, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8704110980033875, + "num_tokens": 743256635.0, + "step": 19476 + }, + { + "epoch": 2.4776745961073656, + "ewc_loss": 0.07771390676498413, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040360394632443786, + "grad_norm": 8.923276901245117, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8626044392585754, + "num_tokens": 743293801.0, + "step": 19477 + }, + { + "epoch": 2.477801806385956, + "ewc_loss": 0.08044570684432983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004309219366405159, + "grad_norm": 9.600459098815918, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8798941969871521, + "num_tokens": 743331582.0, + "step": 19478 + }, + { + "epoch": 2.4779290166645467, + "ewc_loss": 0.07792621850967407, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00040328563773073256, + "grad_norm": 8.894393920898438, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8674603700637817, + "num_tokens": 743369901.0, + "step": 19479 + }, + { + "epoch": 2.4780562269431368, + "ewc_loss": 0.0808153823018074, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043217724305577576, + "grad_norm": 9.578442573547363, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8591939210891724, + "num_tokens": 743405808.0, + "step": 19480 + }, + { + "epoch": 2.4781834372217277, + "ewc_loss": 0.0782596617937088, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00040662000537849963, + "grad_norm": 9.009049415588379, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8679147958755493, + "num_tokens": 743447236.0, + "step": 19481 + }, + { + "epoch": 2.478310647500318, + "ewc_loss": 0.0807342678308487, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043136614840477705, + "grad_norm": 9.412410736083984, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8716748952865601, + "num_tokens": 743486494.0, + "step": 19482 + }, + { + "epoch": 2.4784378577789083, + "ewc_loss": 0.07851596176624298, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00040918306331150234, + "grad_norm": 9.13138484954834, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.862034261226654, + "num_tokens": 743523941.0, + "step": 19483 + }, + { + "epoch": 2.478565068057499, + "ewc_loss": 0.079769566655159, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241605056449771, + "grad_norm": 9.439850807189941, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8683207035064697, + "num_tokens": 743554169.0, + "step": 19484 + }, + { + "epoch": 2.4786922783360894, + "ewc_loss": 0.07831589877605438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004096238117199391, + "grad_norm": 9.074645042419434, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8498588800430298, + "num_tokens": 743594656.0, + "step": 19485 + }, + { + "epoch": 2.47881948861468, + "ewc_loss": 0.07965786755084991, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004230435297358781, + "grad_norm": 9.395004272460938, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8722963333129883, + "num_tokens": 743628638.0, + "step": 19486 + }, + { + "epoch": 2.4789466988932705, + "ewc_loss": 0.07841790467500687, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041064390097744763, + "grad_norm": 9.10523796081543, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.85723477602005, + "num_tokens": 743665382.0, + "step": 19487 + }, + { + "epoch": 2.479073909171861, + "ewc_loss": 0.07944287359714508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004208935424685478, + "grad_norm": 9.285521507263184, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8789429068565369, + "num_tokens": 743701712.0, + "step": 19488 + }, + { + "epoch": 2.4792011194504515, + "ewc_loss": 0.07851698994636536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004116347699891776, + "grad_norm": 9.126276016235352, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8652716875076294, + "num_tokens": 743740347.0, + "step": 19489 + }, + { + "epoch": 2.479328329729042, + "ewc_loss": 0.0791984349489212, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004184491408523172, + "grad_norm": 9.245546340942383, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8662256002426147, + "num_tokens": 743775434.0, + "step": 19490 + }, + { + "epoch": 2.4794555400076326, + "ewc_loss": 0.07865776121616364, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041304249316453934, + "grad_norm": 9.074847221374512, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8641669154167175, + "num_tokens": 743812475.0, + "step": 19491 + }, + { + "epoch": 2.479582750286223, + "ewc_loss": 0.07951822131872177, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041920566582120955, + "grad_norm": 9.265969276428223, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8567053079605103, + "num_tokens": 743851350.0, + "step": 19492 + }, + { + "epoch": 2.4797099605648136, + "ewc_loss": 0.07863099873065948, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004127747961319983, + "grad_norm": 9.129866600036621, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8603293299674988, + "num_tokens": 743885086.0, + "step": 19493 + }, + { + "epoch": 2.479837170843404, + "ewc_loss": 0.07914206385612488, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004178854578640312, + "grad_norm": 9.123700141906738, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8679810762405396, + "num_tokens": 743931747.0, + "step": 19494 + }, + { + "epoch": 2.4799643811219947, + "ewc_loss": 0.07913975417613983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041786240763030946, + "grad_norm": 9.18543815612793, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8777909278869629, + "num_tokens": 743971806.0, + "step": 19495 + }, + { + "epoch": 2.480091591400585, + "ewc_loss": 0.07898038625717163, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004162687109783292, + "grad_norm": 9.209650039672852, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8751853704452515, + "num_tokens": 744005304.0, + "step": 19496 + }, + { + "epoch": 2.4802188016791757, + "ewc_loss": 0.078955739736557, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041602220153436065, + "grad_norm": 9.112235069274902, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8576946258544922, + "num_tokens": 744051159.0, + "step": 19497 + }, + { + "epoch": 2.4803460119577663, + "ewc_loss": 0.07914671301841736, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004179319366812706, + "grad_norm": 9.231592178344727, + "learning_rate": 1e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8487706184387207, + "num_tokens": 744089856.0, + "step": 19498 + }, + { + "epoch": 2.480473222236357, + "ewc_loss": 0.07884270697832108, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004148918960709125, + "grad_norm": 9.084208488464355, + "learning_rate": 1e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8453445434570312, + "num_tokens": 744136355.0, + "step": 19499 + }, + { + "epoch": 2.4806004325149473, + "ewc_loss": 0.07967664301395416, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004207898164168, + "grad_norm": 9.259320259094238, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8539844751358032, + "num_tokens": 744177682.0, + "step": 19500 + }, + { + "epoch": 2.480727642793538, + "ewc_loss": 0.07858335971832275, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004122984828427434, + "grad_norm": 9.03818130493164, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8670424222946167, + "num_tokens": 744214209.0, + "step": 19501 + }, + { + "epoch": 2.4808548530721284, + "ewc_loss": 0.07961425930261612, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004226074379403144, + "grad_norm": 9.33391284942627, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.847819447517395, + "num_tokens": 744257923.0, + "step": 19502 + }, + { + "epoch": 2.480982063350719, + "ewc_loss": 0.07848984003067017, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004113632603548467, + "grad_norm": 9.053289413452148, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8710217475891113, + "num_tokens": 744293370.0, + "step": 19503 + }, + { + "epoch": 2.4811092736293094, + "ewc_loss": 0.07976211607456207, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042408599983900785, + "grad_norm": 9.295914649963379, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8730709552764893, + "num_tokens": 744328572.0, + "step": 19504 + }, + { + "epoch": 2.4812364839078995, + "ewc_loss": 0.07860197126865387, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004124846018385142, + "grad_norm": 9.155790328979492, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8480775356292725, + "num_tokens": 744364124.0, + "step": 19505 + }, + { + "epoch": 2.4813636941864905, + "ewc_loss": 0.07943785935640335, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004208434256725013, + "grad_norm": 9.25825309753418, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8688138723373413, + "num_tokens": 744403679.0, + "step": 19506 + }, + { + "epoch": 2.4814909044650806, + "ewc_loss": 0.07892324775457382, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041569731547497213, + "grad_norm": 9.136775016784668, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.864660382270813, + "num_tokens": 744440922.0, + "step": 19507 + }, + { + "epoch": 2.481618114743671, + "ewc_loss": 0.07938328385353088, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004202976415399462, + "grad_norm": 9.248751640319824, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8619894981384277, + "num_tokens": 744479025.0, + "step": 19508 + }, + { + "epoch": 2.4817453250222616, + "ewc_loss": 0.07896038889884949, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041606868035160005, + "grad_norm": 9.200897216796875, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8660098314285278, + "num_tokens": 744514165.0, + "step": 19509 + }, + { + "epoch": 2.481872535300852, + "ewc_loss": 0.07920829951763153, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041854780283756554, + "grad_norm": 9.165058135986328, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8601522445678711, + "num_tokens": 744547210.0, + "step": 19510 + }, + { + "epoch": 2.4819997455794427, + "ewc_loss": 0.07902760803699493, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041674089152365923, + "grad_norm": 9.209692001342773, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8649004101753235, + "num_tokens": 744585132.0, + "step": 19511 + }, + { + "epoch": 2.4821269558580332, + "ewc_loss": 0.07878285646438599, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004142934631090611, + "grad_norm": 9.108535766601562, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8770290613174438, + "num_tokens": 744617921.0, + "step": 19512 + }, + { + "epoch": 2.4822541661366238, + "ewc_loss": 0.07918889075517654, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041835373849608004, + "grad_norm": 9.225250244140625, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8617542386054993, + "num_tokens": 744653095.0, + "step": 19513 + }, + { + "epoch": 2.4823813764152143, + "ewc_loss": 0.07883523404598236, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041481718653813004, + "grad_norm": 9.09583568572998, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8710222244262695, + "num_tokens": 744688258.0, + "step": 19514 + }, + { + "epoch": 2.482508586693805, + "ewc_loss": 0.07942981272935867, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042076295358128846, + "grad_norm": 9.262327194213867, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8647568225860596, + "num_tokens": 744723500.0, + "step": 19515 + }, + { + "epoch": 2.4826357969723953, + "ewc_loss": 0.07884249836206436, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041488982969895005, + "grad_norm": 9.121527671813965, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8657512664794922, + "num_tokens": 744757133.0, + "step": 19516 + }, + { + "epoch": 2.482763007250986, + "ewc_loss": 0.07935871183872223, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042005200521089137, + "grad_norm": 9.211636543273926, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8761919736862183, + "num_tokens": 744793814.0, + "step": 19517 + }, + { + "epoch": 2.4828902175295764, + "ewc_loss": 0.07916663587093353, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041568977758288383, + "grad_norm": 9.120138168334961, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.868605375289917, + "num_tokens": 744828680.0, + "step": 19518 + }, + { + "epoch": 2.483017427808167, + "ewc_loss": 0.07922769337892532, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041874177986755967, + "grad_norm": 9.178462982177734, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8609952330589294, + "num_tokens": 744861642.0, + "step": 19519 + }, + { + "epoch": 2.4831446380867574, + "ewc_loss": 0.07896240800619125, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041608893661759794, + "grad_norm": 9.155921936035156, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8763009309768677, + "num_tokens": 744900345.0, + "step": 19520 + }, + { + "epoch": 2.483271848365348, + "ewc_loss": 0.07916838675737381, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041814870201051235, + "grad_norm": 9.19094467163086, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8723716735839844, + "num_tokens": 744938589.0, + "step": 19521 + }, + { + "epoch": 2.4833990586439385, + "ewc_loss": 0.07893635332584381, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004158283700235188, + "grad_norm": 9.140174865722656, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8549177646636963, + "num_tokens": 744975385.0, + "step": 19522 + }, + { + "epoch": 2.483526268922529, + "ewc_loss": 0.07935203611850739, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041754383710213006, + "grad_norm": 9.246641159057617, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.854456901550293, + "num_tokens": 745010860.0, + "step": 19523 + }, + { + "epoch": 2.4836534792011196, + "ewc_loss": 0.07880877703428268, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041455263271927834, + "grad_norm": 9.161839485168457, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8567118048667908, + "num_tokens": 745047183.0, + "step": 19524 + }, + { + "epoch": 2.48378068947971, + "ewc_loss": 0.07922494411468506, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041871427674777806, + "grad_norm": 9.235188484191895, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8621499538421631, + "num_tokens": 745081906.0, + "step": 19525 + }, + { + "epoch": 2.4839078997583006, + "ewc_loss": 0.07881979644298553, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041466279071755707, + "grad_norm": 9.093586921691895, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8709291815757751, + "num_tokens": 745127285.0, + "step": 19526 + }, + { + "epoch": 2.484035110036891, + "ewc_loss": 0.07935109734535217, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041997581138275564, + "grad_norm": 9.293977737426758, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8807109594345093, + "num_tokens": 745171564.0, + "step": 19527 + }, + { + "epoch": 2.4841623203154817, + "ewc_loss": 0.07852485030889511, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041171335033141077, + "grad_norm": 9.07618236541748, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8766269683837891, + "num_tokens": 745214252.0, + "step": 19528 + }, + { + "epoch": 2.484289530594072, + "ewc_loss": 0.0793679803609848, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042014464270323515, + "grad_norm": 9.266037940979004, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8674823641777039, + "num_tokens": 745251983.0, + "step": 19529 + }, + { + "epoch": 2.4844167408726623, + "ewc_loss": 0.07849240303039551, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041138887172564864, + "grad_norm": 9.086610794067383, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8671993017196655, + "num_tokens": 745287037.0, + "step": 19530 + }, + { + "epoch": 2.4845439511512533, + "ewc_loss": 0.07961155474185944, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004201389674562961, + "grad_norm": 9.263449668884277, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8623486757278442, + "num_tokens": 745327854.0, + "step": 19531 + }, + { + "epoch": 2.4846711614298433, + "ewc_loss": 0.07836504280567169, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000410115288104862, + "grad_norm": 9.075581550598145, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8528784513473511, + "num_tokens": 745370227.0, + "step": 19532 + }, + { + "epoch": 2.484798371708434, + "ewc_loss": 0.07943634688854218, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042082826257683337, + "grad_norm": 9.27077865600586, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8631863594055176, + "num_tokens": 745409416.0, + "step": 19533 + }, + { + "epoch": 2.4849255819870244, + "ewc_loss": 0.07863327860832214, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004127975844312459, + "grad_norm": 9.101511001586914, + "learning_rate": 1e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.8460767269134521, + "num_tokens": 745449908.0, + "step": 19534 + }, + { + "epoch": 2.485052792265615, + "ewc_loss": 0.0793246179819107, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041971105383709073, + "grad_norm": 9.26197624206543, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8540047407150269, + "num_tokens": 745486384.0, + "step": 19535 + }, + { + "epoch": 2.4851800025442055, + "ewc_loss": 0.07861179113388062, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004125827399548143, + "grad_norm": 9.169179916381836, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8545598387718201, + "num_tokens": 745523058.0, + "step": 19536 + }, + { + "epoch": 2.485307212822796, + "ewc_loss": 0.07920508086681366, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004185156722087413, + "grad_norm": 9.233539581298828, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8473312854766846, + "num_tokens": 745559276.0, + "step": 19537 + }, + { + "epoch": 2.4854344231013865, + "ewc_loss": 0.07869333028793335, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041339820018038154, + "grad_norm": 9.162250518798828, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8636285066604614, + "num_tokens": 745592726.0, + "step": 19538 + }, + { + "epoch": 2.485561633379977, + "ewc_loss": 0.07911045849323273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041756947757676244, + "grad_norm": 9.22360897064209, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8673763871192932, + "num_tokens": 745630117.0, + "step": 19539 + }, + { + "epoch": 2.4856888436585676, + "ewc_loss": 0.07883293181657791, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041479416540823877, + "grad_norm": 9.11776065826416, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8509753942489624, + "num_tokens": 745664968.0, + "step": 19540 + }, + { + "epoch": 2.485816053937158, + "ewc_loss": 0.07901141792535782, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004165790160186589, + "grad_norm": 9.172882080078125, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8659013509750366, + "num_tokens": 745701893.0, + "step": 19541 + }, + { + "epoch": 2.4859432642157486, + "ewc_loss": 0.07892556488513947, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041572045302018523, + "grad_norm": 9.175317764282227, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8606295585632324, + "num_tokens": 745736884.0, + "step": 19542 + }, + { + "epoch": 2.486070474494339, + "ewc_loss": 0.07890370488166809, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004155019414611161, + "grad_norm": 9.0884428024292, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8765471577644348, + "num_tokens": 745782381.0, + "step": 19543 + }, + { + "epoch": 2.4861976847729297, + "ewc_loss": 0.07907316088676453, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041719648288562894, + "grad_norm": 9.135363578796387, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.878983199596405, + "num_tokens": 745827630.0, + "step": 19544 + }, + { + "epoch": 2.48632489505152, + "ewc_loss": 0.078939288854599, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004158577066846192, + "grad_norm": 9.14985179901123, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8725946545600891, + "num_tokens": 745862223.0, + "step": 19545 + }, + { + "epoch": 2.4864521053301107, + "ewc_loss": 0.07908090949058533, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004172739863861352, + "grad_norm": 9.178019523620605, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8815299868583679, + "num_tokens": 745903077.0, + "step": 19546 + }, + { + "epoch": 2.4865793156087013, + "ewc_loss": 0.07894586026668549, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041592345223762095, + "grad_norm": 9.140334129333496, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8653318285942078, + "num_tokens": 745945534.0, + "step": 19547 + }, + { + "epoch": 2.486706525887292, + "ewc_loss": 0.07908430695533752, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041730792145244777, + "grad_norm": 9.185364723205566, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8480758666992188, + "num_tokens": 745986308.0, + "step": 19548 + }, + { + "epoch": 2.4868337361658823, + "ewc_loss": 0.07886159420013428, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004150808381382376, + "grad_norm": 9.208948135375977, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8555052876472473, + "num_tokens": 746022206.0, + "step": 19549 + }, + { + "epoch": 2.486960946444473, + "ewc_loss": 0.07904799282550812, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041694482206366956, + "grad_norm": 9.137624740600586, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8719421625137329, + "num_tokens": 746059854.0, + "step": 19550 + }, + { + "epoch": 2.4870881567230634, + "ewc_loss": 0.0791059285402298, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004175241047050804, + "grad_norm": 9.265226364135742, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8732857704162598, + "num_tokens": 746103994.0, + "step": 19551 + }, + { + "epoch": 2.487215367001654, + "ewc_loss": 0.07873362302780151, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041380105540156364, + "grad_norm": 9.111236572265625, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8556429743766785, + "num_tokens": 746138491.0, + "step": 19552 + }, + { + "epoch": 2.4873425772802444, + "ewc_loss": 0.07946173846721649, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210822517052293, + "grad_norm": 9.274645805358887, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8631511926651001, + "num_tokens": 746172598.0, + "step": 19553 + }, + { + "epoch": 2.487469787558835, + "ewc_loss": 0.07865588366985321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004130236920900643, + "grad_norm": 9.13522720336914, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.872580885887146, + "num_tokens": 746210385.0, + "step": 19554 + }, + { + "epoch": 2.487596997837425, + "ewc_loss": 0.07935021072626114, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041996693471446633, + "grad_norm": 9.189924240112305, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8671612739562988, + "num_tokens": 746252563.0, + "step": 19555 + }, + { + "epoch": 2.487724208116016, + "ewc_loss": 0.07882016897201538, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041466657421551645, + "grad_norm": 9.09228801727295, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8705730438232422, + "num_tokens": 746292356.0, + "step": 19556 + }, + { + "epoch": 2.487851418394606, + "ewc_loss": 0.07926150411367416, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041907987906597555, + "grad_norm": 9.25336742401123, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8533377647399902, + "num_tokens": 746325467.0, + "step": 19557 + }, + { + "epoch": 2.4879786286731966, + "ewc_loss": 0.07876837253570557, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004141485842410475, + "grad_norm": 9.13239860534668, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8604003190994263, + "num_tokens": 746362285.0, + "step": 19558 + }, + { + "epoch": 2.488105838951787, + "ewc_loss": 0.07942040264606476, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004206688900012523, + "grad_norm": 9.228901863098145, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8727735877037048, + "num_tokens": 746401540.0, + "step": 19559 + }, + { + "epoch": 2.4882330492303777, + "ewc_loss": 0.07866284251213074, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004130932502448559, + "grad_norm": 9.12452220916748, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8662843704223633, + "num_tokens": 746439282.0, + "step": 19560 + }, + { + "epoch": 2.488360259508968, + "ewc_loss": 0.07927317172288895, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004191965563222766, + "grad_norm": 9.195981979370117, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8769670724868774, + "num_tokens": 746480993.0, + "step": 19561 + }, + { + "epoch": 2.4884874697875587, + "ewc_loss": 0.07888676226139069, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041533249896019697, + "grad_norm": 9.183735847473145, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8619874119758606, + "num_tokens": 746519318.0, + "step": 19562 + }, + { + "epoch": 2.4886146800661493, + "ewc_loss": 0.07909733057022095, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174381901975721, + "grad_norm": 9.197985649108887, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8508325815200806, + "num_tokens": 746559761.0, + "step": 19563 + }, + { + "epoch": 2.48874189034474, + "ewc_loss": 0.07898753881454468, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041634024819359183, + "grad_norm": 9.173972129821777, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.872499942779541, + "num_tokens": 746591622.0, + "step": 19564 + }, + { + "epoch": 2.4888691006233303, + "ewc_loss": 0.07902391999959946, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000416704046074301, + "grad_norm": 9.155231475830078, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8618475794792175, + "num_tokens": 746627709.0, + "step": 19565 + }, + { + "epoch": 2.488996310901921, + "ewc_loss": 0.07916118949651718, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004180767573416233, + "grad_norm": 9.199020385742188, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8626794219017029, + "num_tokens": 746663445.0, + "step": 19566 + }, + { + "epoch": 2.4891235211805114, + "ewc_loss": 0.07908480614423752, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004173128982074559, + "grad_norm": 9.200093269348145, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8535359501838684, + "num_tokens": 746706869.0, + "step": 19567 + }, + { + "epoch": 2.489250731459102, + "ewc_loss": 0.0791940987110138, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004184057761449367, + "grad_norm": 9.2068452835083, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8767502903938293, + "num_tokens": 746749502.0, + "step": 19568 + }, + { + "epoch": 2.4893779417376924, + "ewc_loss": 0.07901238650083542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041658870759420097, + "grad_norm": 9.173523902893066, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8694005012512207, + "num_tokens": 746786510.0, + "step": 19569 + }, + { + "epoch": 2.489505152016283, + "ewc_loss": 0.07912581413984299, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004177230002824217, + "grad_norm": 9.199634552001953, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8691214323043823, + "num_tokens": 746819430.0, + "step": 19570 + }, + { + "epoch": 2.4896323622948735, + "ewc_loss": 0.07900959998369217, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004165608552284539, + "grad_norm": 9.163846015930176, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.868105947971344, + "num_tokens": 746856018.0, + "step": 19571 + }, + { + "epoch": 2.489759572573464, + "ewc_loss": 0.0791490226984024, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004179550160188228, + "grad_norm": 9.22728157043457, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8593800067901611, + "num_tokens": 746897350.0, + "step": 19572 + }, + { + "epoch": 2.4898867828520546, + "ewc_loss": 0.07890193164348602, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000415484159020707, + "grad_norm": 9.163553237915039, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.857371985912323, + "num_tokens": 746937163.0, + "step": 19573 + }, + { + "epoch": 2.490013993130645, + "ewc_loss": 0.07915839552879333, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041804875945672393, + "grad_norm": 9.222150802612305, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8746105432510376, + "num_tokens": 746976140.0, + "step": 19574 + }, + { + "epoch": 2.4901412034092356, + "ewc_loss": 0.07891640067100525, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041562883416190743, + "grad_norm": 9.210222244262695, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8645962476730347, + "num_tokens": 747012576.0, + "step": 19575 + }, + { + "epoch": 2.490268413687826, + "ewc_loss": 0.07899338752031326, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004163987177889794, + "grad_norm": 9.251670837402344, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8638099431991577, + "num_tokens": 747048179.0, + "step": 19576 + }, + { + "epoch": 2.4903956239664167, + "ewc_loss": 0.07881297171115875, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004145945713389665, + "grad_norm": 9.175806045532227, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8468834161758423, + "num_tokens": 747085291.0, + "step": 19577 + }, + { + "epoch": 2.4905228342450068, + "ewc_loss": 0.07905463874340057, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004170111787971109, + "grad_norm": 9.267313957214355, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8714768886566162, + "num_tokens": 747119085.0, + "step": 19578 + }, + { + "epoch": 2.4906500445235977, + "ewc_loss": 0.0786958783864975, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004134236369282007, + "grad_norm": 9.194467544555664, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8643505573272705, + "num_tokens": 747153463.0, + "step": 19579 + }, + { + "epoch": 2.490777254802188, + "ewc_loss": 0.07908927649259567, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004173576016910374, + "grad_norm": 9.266357421875, + "learning_rate": 1e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.840426504611969, + "num_tokens": 747194781.0, + "step": 19580 + }, + { + "epoch": 2.4909044650807783, + "ewc_loss": 0.07862789183855057, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004127437714487314, + "grad_norm": 9.16877555847168, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8670060634613037, + "num_tokens": 747235328.0, + "step": 19581 + }, + { + "epoch": 2.491031675359369, + "ewc_loss": 0.07918715476989746, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041833639261312783, + "grad_norm": 9.280899047851562, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8618025779724121, + "num_tokens": 747278481.0, + "step": 19582 + }, + { + "epoch": 2.4911588856379594, + "ewc_loss": 0.07866625487804413, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041312744724564254, + "grad_norm": 9.16640853881836, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8803291916847229, + "num_tokens": 747313077.0, + "step": 19583 + }, + { + "epoch": 2.49128609591655, + "ewc_loss": 0.07895462214946747, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004160110838711262, + "grad_norm": 9.281502723693848, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8710267543792725, + "num_tokens": 747352693.0, + "step": 19584 + }, + { + "epoch": 2.4914133061951405, + "ewc_loss": 0.07850516587495804, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004115165211260319, + "grad_norm": 9.068770408630371, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8565974235534668, + "num_tokens": 747397717.0, + "step": 19585 + }, + { + "epoch": 2.491540516473731, + "ewc_loss": 0.0794363021850586, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042082788422703743, + "grad_norm": 9.338109016418457, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8673526644706726, + "num_tokens": 747435457.0, + "step": 19586 + }, + { + "epoch": 2.4916677267523215, + "ewc_loss": 0.0783625990152359, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041009081178344786, + "grad_norm": 9.078725814819336, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8692888617515564, + "num_tokens": 747479017.0, + "step": 19587 + }, + { + "epoch": 2.491794937030912, + "ewc_loss": 0.07959704846143723, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042243534699082375, + "grad_norm": 9.347414016723633, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8640841245651245, + "num_tokens": 747520247.0, + "step": 19588 + }, + { + "epoch": 2.4919221473095026, + "ewc_loss": 0.07840532064437866, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041051802691072226, + "grad_norm": 9.057772636413574, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8504514098167419, + "num_tokens": 747559983.0, + "step": 19589 + }, + { + "epoch": 2.492049357588093, + "ewc_loss": 0.07981052249670029, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042457005474716425, + "grad_norm": 9.403851509094238, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8599071502685547, + "num_tokens": 747595746.0, + "step": 19590 + }, + { + "epoch": 2.4921765678666836, + "ewc_loss": 0.07841255515813828, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004105903790332377, + "grad_norm": 9.11400032043457, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8615756034851074, + "num_tokens": 747639261.0, + "step": 19591 + }, + { + "epoch": 2.492303778145274, + "ewc_loss": 0.07964376360177994, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004229024634696543, + "grad_norm": 9.31969928741455, + "learning_rate": 1e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8466360569000244, + "num_tokens": 747680665.0, + "step": 19592 + }, + { + "epoch": 2.4924309884238647, + "ewc_loss": 0.07861436903476715, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004126084968447685, + "grad_norm": 9.195889472961426, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8690813183784485, + "num_tokens": 747713320.0, + "step": 19593 + }, + { + "epoch": 2.492558198702455, + "ewc_loss": 0.07929901778697968, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041945496923290193, + "grad_norm": 9.333536148071289, + "learning_rate": 1e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8387175798416138, + "num_tokens": 747752167.0, + "step": 19594 + }, + { + "epoch": 2.4926854089810457, + "ewc_loss": 0.07865552604198456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041302008321508765, + "grad_norm": 9.11742115020752, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.869868278503418, + "num_tokens": 747787274.0, + "step": 19595 + }, + { + "epoch": 2.4928126192596363, + "ewc_loss": 0.07945913076400757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042105617467314005, + "grad_norm": 9.328937530517578, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8582735657691956, + "num_tokens": 747826647.0, + "step": 19596 + }, + { + "epoch": 2.492939829538227, + "ewc_loss": 0.07858685404062271, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000412333378335461, + "grad_norm": 9.068256378173828, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8613668084144592, + "num_tokens": 747866797.0, + "step": 19597 + }, + { + "epoch": 2.4930670398168173, + "ewc_loss": 0.07973212748765945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042378611396998167, + "grad_norm": 9.346028327941895, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8658764362335205, + "num_tokens": 747907589.0, + "step": 19598 + }, + { + "epoch": 2.493194250095408, + "ewc_loss": 0.078496053814888, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004114254261367023, + "grad_norm": 9.111080169677734, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8575344085693359, + "num_tokens": 747939865.0, + "step": 19599 + }, + { + "epoch": 2.4933214603739984, + "ewc_loss": 0.07974434643983841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004239083209540695, + "grad_norm": 9.369552612304688, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8624178767204285, + "num_tokens": 747976238.0, + "step": 19600 + }, + { + "epoch": 2.493448670652589, + "ewc_loss": 0.07848665118217468, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041133136255666614, + "grad_norm": 9.073400497436523, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8619866371154785, + "num_tokens": 748012076.0, + "step": 19601 + }, + { + "epoch": 2.4935758809311794, + "ewc_loss": 0.07985969632863998, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004250618221703917, + "grad_norm": 9.407745361328125, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8574647307395935, + "num_tokens": 748042518.0, + "step": 19602 + }, + { + "epoch": 2.4937030912097695, + "ewc_loss": 0.07844890654087067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004109539440833032, + "grad_norm": 9.069493293762207, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8721462488174438, + "num_tokens": 748077509.0, + "step": 19603 + }, + { + "epoch": 2.4938303014883605, + "ewc_loss": 0.07983125746250153, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042477744864299893, + "grad_norm": 9.430026054382324, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8703566193580627, + "num_tokens": 748118164.0, + "step": 19604 + }, + { + "epoch": 2.4939575117669506, + "ewc_loss": 0.07844306528568268, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041089547448791564, + "grad_norm": 9.075251579284668, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8560653328895569, + "num_tokens": 748153609.0, + "step": 19605 + }, + { + "epoch": 2.494084722045541, + "ewc_loss": 0.0798092633485794, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424557423684746, + "grad_norm": 9.331338882446289, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8633647561073303, + "num_tokens": 748189573.0, + "step": 19606 + }, + { + "epoch": 2.4942119323241316, + "ewc_loss": 0.07860483229160309, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004125132109038532, + "grad_norm": 9.143509864807129, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8592802286148071, + "num_tokens": 748231121.0, + "step": 19607 + }, + { + "epoch": 2.494339142602722, + "ewc_loss": 0.07959675788879395, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042243246571160853, + "grad_norm": 9.31924819946289, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.874854326248169, + "num_tokens": 748268744.0, + "step": 19608 + }, + { + "epoch": 2.4944663528813127, + "ewc_loss": 0.07862325012683868, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000412697292631492, + "grad_norm": 9.132025718688965, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8682870864868164, + "num_tokens": 748307441.0, + "step": 19609 + }, + { + "epoch": 2.494593563159903, + "ewc_loss": 0.07943737506866455, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042083862354047596, + "grad_norm": 9.301036834716797, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8637607097625732, + "num_tokens": 748343506.0, + "step": 19610 + }, + { + "epoch": 2.4947207734384937, + "ewc_loss": 0.07866659760475159, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004131308232899755, + "grad_norm": 9.135570526123047, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8662724494934082, + "num_tokens": 748382209.0, + "step": 19611 + }, + { + "epoch": 2.4948479837170843, + "ewc_loss": 0.07934354990720749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004199003451503813, + "grad_norm": 9.298704147338867, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8846507668495178, + "num_tokens": 748413660.0, + "step": 19612 + }, + { + "epoch": 2.494975193995675, + "ewc_loss": 0.07862333953380585, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004126981948502362, + "grad_norm": 9.087124824523926, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8651413321495056, + "num_tokens": 748457037.0, + "step": 19613 + }, + { + "epoch": 2.4951024042742653, + "ewc_loss": 0.07966198027133942, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042308459524065256, + "grad_norm": 9.419464111328125, + "learning_rate": 1e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8429734706878662, + "num_tokens": 748497227.0, + "step": 19614 + }, + { + "epoch": 2.495229614552856, + "ewc_loss": 0.07840010523796082, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004104658728465438, + "grad_norm": 9.079197883605957, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8613929748535156, + "num_tokens": 748529901.0, + "step": 19615 + }, + { + "epoch": 2.4953568248314464, + "ewc_loss": 0.07970328629016876, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042349775321781635, + "grad_norm": 9.334059715270996, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8517679572105408, + "num_tokens": 748568238.0, + "step": 19616 + }, + { + "epoch": 2.495484035110037, + "ewc_loss": 0.07852067053318024, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041167152812704444, + "grad_norm": 9.08843994140625, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8537310361862183, + "num_tokens": 748608153.0, + "step": 19617 + }, + { + "epoch": 2.4956112453886274, + "ewc_loss": 0.07957188785076141, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004221837443765253, + "grad_norm": 9.337379455566406, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8593509197235107, + "num_tokens": 748646977.0, + "step": 19618 + }, + { + "epoch": 2.495738455667218, + "ewc_loss": 0.07856254279613495, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004120902740396559, + "grad_norm": 9.113332748413086, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8543795943260193, + "num_tokens": 748689586.0, + "step": 19619 + }, + { + "epoch": 2.4958656659458085, + "ewc_loss": 0.07946689426898956, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004211338236927986, + "grad_norm": 9.300209045410156, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8626782298088074, + "num_tokens": 748730915.0, + "step": 19620 + }, + { + "epoch": 2.495992876224399, + "ewc_loss": 0.07881109416484833, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004145757993683219, + "grad_norm": 9.123533248901367, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8594129681587219, + "num_tokens": 748766996.0, + "step": 19621 + }, + { + "epoch": 2.4961200865029896, + "ewc_loss": 0.07929249852895737, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041938983486033976, + "grad_norm": 9.270174980163574, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8560726046562195, + "num_tokens": 748801225.0, + "step": 19622 + }, + { + "epoch": 2.49624729678158, + "ewc_loss": 0.07883105427026749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004147753934375942, + "grad_norm": 9.13386344909668, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.858820378780365, + "num_tokens": 748841355.0, + "step": 19623 + }, + { + "epoch": 2.4963745070601706, + "ewc_loss": 0.07940100133419037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004204748256597668, + "grad_norm": 9.280701637268066, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8631991744041443, + "num_tokens": 748879636.0, + "step": 19624 + }, + { + "epoch": 2.496501717338761, + "ewc_loss": 0.07878005504608154, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004142653488088399, + "grad_norm": 9.12514591217041, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8599821925163269, + "num_tokens": 748917795.0, + "step": 19625 + }, + { + "epoch": 2.4966289276173517, + "ewc_loss": 0.07935458421707153, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042001064866781235, + "grad_norm": 9.25078010559082, + "learning_rate": 1e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8352006673812866, + "num_tokens": 748956512.0, + "step": 19626 + }, + { + "epoch": 2.496756137895942, + "ewc_loss": 0.07880405336618423, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004145053680986166, + "grad_norm": 9.169984817504883, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8682876825332642, + "num_tokens": 748995577.0, + "step": 19627 + }, + { + "epoch": 2.4968833481745323, + "ewc_loss": 0.07916934788227081, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000418158364482224, + "grad_norm": 9.243829727172852, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8599317073822021, + "num_tokens": 749027672.0, + "step": 19628 + }, + { + "epoch": 2.4970105584531233, + "ewc_loss": 0.07895110547542572, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041597592644393444, + "grad_norm": 9.166448593139648, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8738076090812683, + "num_tokens": 749059101.0, + "step": 19629 + }, + { + "epoch": 2.4971377687317133, + "ewc_loss": 0.07903548330068588, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004168196755927056, + "grad_norm": 9.166141510009766, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8492200970649719, + "num_tokens": 749097034.0, + "step": 19630 + }, + { + "epoch": 2.497264979010304, + "ewc_loss": 0.0793333500623703, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041735696140676737, + "grad_norm": 9.162564277648926, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8617148995399475, + "num_tokens": 749137536.0, + "step": 19631 + }, + { + "epoch": 2.4973921892888944, + "ewc_loss": 0.0790151059627533, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041661594877950847, + "grad_norm": 9.125809669494629, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8630460500717163, + "num_tokens": 749171487.0, + "step": 19632 + }, + { + "epoch": 2.497519399567485, + "ewc_loss": 0.07947543263435364, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004187778104096651, + "grad_norm": 9.223051071166992, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8565462827682495, + "num_tokens": 749213234.0, + "step": 19633 + }, + { + "epoch": 2.4976466098460754, + "ewc_loss": 0.07887826859951019, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041524748667143285, + "grad_norm": 9.135266304016113, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8531302213668823, + "num_tokens": 749256184.0, + "step": 19634 + }, + { + "epoch": 2.497773820124666, + "ewc_loss": 0.07964830845594406, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004205065197311342, + "grad_norm": 9.272818565368652, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8607870936393738, + "num_tokens": 749292909.0, + "step": 19635 + }, + { + "epoch": 2.4979010304032565, + "ewc_loss": 0.07877074182033539, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041417230386286974, + "grad_norm": 9.192022323608398, + "learning_rate": 1e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8501179814338684, + "num_tokens": 749327947.0, + "step": 19636 + }, + { + "epoch": 2.498028240681847, + "ewc_loss": 0.0796828418970108, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004184104036539793, + "grad_norm": 14.76004409790039, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8719158172607422, + "num_tokens": 749364107.0, + "step": 19637 + }, + { + "epoch": 2.4981554509604376, + "ewc_loss": 0.08616335690021515, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00048809839063324034, + "grad_norm": 9.864556312561035, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8778993487358093, + "num_tokens": 749398055.0, + "step": 19638 + }, + { + "epoch": 2.498282661239028, + "ewc_loss": 0.08213771134614944, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044540053931996226, + "grad_norm": 9.755833625793457, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8623400926589966, + "num_tokens": 749434626.0, + "step": 19639 + }, + { + "epoch": 2.4984098715176186, + "ewc_loss": 0.0797695443034172, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004217188688926399, + "grad_norm": 9.27396297454834, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8569274544715881, + "num_tokens": 749470616.0, + "step": 19640 + }, + { + "epoch": 2.498537081796209, + "ewc_loss": 0.08271731436252594, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045119659625925124, + "grad_norm": 9.777480125427246, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8669151067733765, + "num_tokens": 749513072.0, + "step": 19641 + }, + { + "epoch": 2.4986642920747997, + "ewc_loss": 0.07932963967323303, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004197611997369677, + "grad_norm": 9.24899673461914, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8771626949310303, + "num_tokens": 749546988.0, + "step": 19642 + }, + { + "epoch": 2.49879150235339, + "ewc_loss": 0.08124186098575592, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004388834931887686, + "grad_norm": 9.630406379699707, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8557456731796265, + "num_tokens": 749585851.0, + "step": 19643 + }, + { + "epoch": 2.4989187126319807, + "ewc_loss": 0.07924892008304596, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004189540049992502, + "grad_norm": 9.251996994018555, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8604143857955933, + "num_tokens": 749619510.0, + "step": 19644 + }, + { + "epoch": 2.4990459229105713, + "ewc_loss": 0.08070934563875198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043355830712243915, + "grad_norm": 9.57238483428955, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8696802854537964, + "num_tokens": 749653660.0, + "step": 19645 + }, + { + "epoch": 2.499173133189162, + "ewc_loss": 0.07910075783729553, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174723871983588, + "grad_norm": 9.198322296142578, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8635026216506958, + "num_tokens": 749691321.0, + "step": 19646 + }, + { + "epoch": 2.4993003434677523, + "ewc_loss": 0.08042536675930023, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004307185299694538, + "grad_norm": 9.488460540771484, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8643766641616821, + "num_tokens": 749729305.0, + "step": 19647 + }, + { + "epoch": 2.499427553746343, + "ewc_loss": 0.07898396253585815, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041630450868979096, + "grad_norm": 9.1972017288208, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8698552846908569, + "num_tokens": 749766134.0, + "step": 19648 + }, + { + "epoch": 2.4995547640249334, + "ewc_loss": 0.08012228459119797, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042768768616952, + "grad_norm": 9.474703788757324, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8598856329917908, + "num_tokens": 749807715.0, + "step": 19649 + }, + { + "epoch": 2.499681974303524, + "ewc_loss": 0.07869688421487808, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004134336777497083, + "grad_norm": 9.124286651611328, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8642728328704834, + "num_tokens": 749838799.0, + "step": 19650 + }, + { + "epoch": 2.499809184582114, + "ewc_loss": 0.08016860485076904, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042815087363123894, + "grad_norm": 9.479120254516602, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8763407468795776, + "num_tokens": 749874283.0, + "step": 19651 + }, + { + "epoch": 2.499936394860705, + "ewc_loss": 0.07864413410425186, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041290617082268, + "grad_norm": 9.12509822845459, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8624573349952698, + "num_tokens": 749915912.0, + "step": 19652 + }, + { + "epoch": 2.500063605139295, + "ewc_loss": 0.08017677068710327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042823253897950053, + "grad_norm": 9.408683776855469, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8700805902481079, + "num_tokens": 749959133.0, + "step": 19653 + }, + { + "epoch": 2.500190815417886, + "ewc_loss": 0.0786857083439827, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041332191904075444, + "grad_norm": 9.150486946105957, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8616696000099182, + "num_tokens": 749997846.0, + "step": 19654 + }, + { + "epoch": 2.500318025696476, + "ewc_loss": 0.0798325389623642, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042479022522456944, + "grad_norm": 9.35669994354248, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8684324026107788, + "num_tokens": 750040973.0, + "step": 19655 + }, + { + "epoch": 2.5004452359750666, + "ewc_loss": 0.07887330651283264, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004151978646405041, + "grad_norm": 9.175081253051758, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8590737581253052, + "num_tokens": 750082171.0, + "step": 19656 + }, + { + "epoch": 2.500572446253657, + "ewc_loss": 0.07973788678646088, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042140233563259244, + "grad_norm": 9.332542419433594, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8744938373565674, + "num_tokens": 750123083.0, + "step": 19657 + }, + { + "epoch": 2.5006996565322477, + "ewc_loss": 0.07892020046710968, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004156668728683144, + "grad_norm": 9.186458587646484, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.849812388420105, + "num_tokens": 750162235.0, + "step": 19658 + }, + { + "epoch": 2.500826866810838, + "ewc_loss": 0.07944321632385254, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042089703492820263, + "grad_norm": 9.313366889953613, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8756897449493408, + "num_tokens": 750196501.0, + "step": 19659 + }, + { + "epoch": 2.5009540770894287, + "ewc_loss": 0.07880325615406036, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041449745185673237, + "grad_norm": 9.220064163208008, + "learning_rate": 1e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.8361045122146606, + "num_tokens": 750241178.0, + "step": 19660 + }, + { + "epoch": 2.5010812873680193, + "ewc_loss": 0.07934277504682541, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041989260353147984, + "grad_norm": 9.278000831604004, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8684185743331909, + "num_tokens": 750284104.0, + "step": 19661 + }, + { + "epoch": 2.50120849764661, + "ewc_loss": 0.07897870242595673, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041625191806815565, + "grad_norm": 9.23521614074707, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8698344230651855, + "num_tokens": 750318525.0, + "step": 19662 + }, + { + "epoch": 2.5013357079252003, + "ewc_loss": 0.07916758954524994, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041814075666479766, + "grad_norm": 9.257467269897461, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8716220855712891, + "num_tokens": 750358892.0, + "step": 19663 + }, + { + "epoch": 2.501462918203791, + "ewc_loss": 0.07910705357789993, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041753536788746715, + "grad_norm": 9.24905014038086, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8504236936569214, + "num_tokens": 750399325.0, + "step": 19664 + }, + { + "epoch": 2.5015901284823814, + "ewc_loss": 0.07911893725395203, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004176542570348829, + "grad_norm": 9.231001853942871, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.858417272567749, + "num_tokens": 750437493.0, + "step": 19665 + }, + { + "epoch": 2.501717338760972, + "ewc_loss": 0.079160675406456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000418071576859802, + "grad_norm": 9.24409294128418, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8708951473236084, + "num_tokens": 750472381.0, + "step": 19666 + }, + { + "epoch": 2.5018445490395624, + "ewc_loss": 0.07917214184999466, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041818624595180154, + "grad_norm": 9.203133583068848, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8455316424369812, + "num_tokens": 750511786.0, + "step": 19667 + }, + { + "epoch": 2.501971759318153, + "ewc_loss": 0.07939870655536652, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042045186273753643, + "grad_norm": 9.243696212768555, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8705506324768066, + "num_tokens": 750550596.0, + "step": 19668 + }, + { + "epoch": 2.5020989695967435, + "ewc_loss": 0.0792299285531044, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041876413160935044, + "grad_norm": 9.253450393676758, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8668453097343445, + "num_tokens": 750594790.0, + "step": 19669 + }, + { + "epoch": 2.502226179875334, + "ewc_loss": 0.0790521577000618, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041698641143739223, + "grad_norm": 9.170368194580078, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8745167851448059, + "num_tokens": 750636738.0, + "step": 19670 + }, + { + "epoch": 2.5023533901539246, + "ewc_loss": 0.07952694594860077, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004217343230266124, + "grad_norm": 9.40160846710205, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.875572681427002, + "num_tokens": 750672338.0, + "step": 19671 + }, + { + "epoch": 2.502480600432515, + "ewc_loss": 0.07875397801399231, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041400460759177804, + "grad_norm": 9.12905216217041, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.857184886932373, + "num_tokens": 750709149.0, + "step": 19672 + }, + { + "epoch": 2.5026078107111056, + "ewc_loss": 0.07987699657678604, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004252348153386265, + "grad_norm": 9.393821716308594, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8756839036941528, + "num_tokens": 750741727.0, + "step": 19673 + }, + { + "epoch": 2.5027350209896957, + "ewc_loss": 0.07855242490768433, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004119890509173274, + "grad_norm": 9.20016860961914, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8525272011756897, + "num_tokens": 750775903.0, + "step": 19674 + }, + { + "epoch": 2.5028622312682867, + "ewc_loss": 0.07974772900342941, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004239421396050602, + "grad_norm": 9.419787406921387, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8583700656890869, + "num_tokens": 750810502.0, + "step": 19675 + }, + { + "epoch": 2.5029894415468767, + "ewc_loss": 0.07871638238430023, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041362864430993795, + "grad_norm": 9.162811279296875, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8717443943023682, + "num_tokens": 750847762.0, + "step": 19676 + }, + { + "epoch": 2.5031166518254677, + "ewc_loss": 0.07949770987033844, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004214419168420136, + "grad_norm": 9.318175315856934, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8644651770591736, + "num_tokens": 750887201.0, + "step": 19677 + }, + { + "epoch": 2.503243862104058, + "ewc_loss": 0.07877746224403381, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041423950460739434, + "grad_norm": 9.121438026428223, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.862084150314331, + "num_tokens": 750927323.0, + "step": 19678 + }, + { + "epoch": 2.5033710723826488, + "ewc_loss": 0.07950447499752045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042150955414399505, + "grad_norm": 9.296100616455078, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8544974327087402, + "num_tokens": 750972501.0, + "step": 19679 + }, + { + "epoch": 2.503498282661239, + "ewc_loss": 0.07877104729413986, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004141753015574068, + "grad_norm": 9.13316535949707, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8732163310050964, + "num_tokens": 751010743.0, + "step": 19680 + }, + { + "epoch": 2.5036254929398294, + "ewc_loss": 0.07952842116355896, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004217490495648235, + "grad_norm": 9.283464431762695, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8584582209587097, + "num_tokens": 751050440.0, + "step": 19681 + }, + { + "epoch": 2.50375270321842, + "ewc_loss": 0.07883510738611221, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041481590596958995, + "grad_norm": 9.15859317779541, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8665730953216553, + "num_tokens": 751088319.0, + "step": 19682 + }, + { + "epoch": 2.5038799134970104, + "ewc_loss": 0.07945675402879715, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042103236773982644, + "grad_norm": 9.305071830749512, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8628073334693909, + "num_tokens": 751123766.0, + "step": 19683 + }, + { + "epoch": 2.504007123775601, + "ewc_loss": 0.07898666709661484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004163315170444548, + "grad_norm": 9.154401779174805, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8801390528678894, + "num_tokens": 751163375.0, + "step": 19684 + }, + { + "epoch": 2.5041343340541915, + "ewc_loss": 0.07972553372383118, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004212787898723036, + "grad_norm": 9.281344413757324, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.858132004737854, + "num_tokens": 751204526.0, + "step": 19685 + }, + { + "epoch": 2.504261544332782, + "ewc_loss": 0.07893547415733337, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004158196097705513, + "grad_norm": 9.259135246276855, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8681282997131348, + "num_tokens": 751245133.0, + "step": 19686 + }, + { + "epoch": 2.5043887546113726, + "ewc_loss": 0.07926148176193237, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004190797044429928, + "grad_norm": 9.262999534606934, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8524536490440369, + "num_tokens": 751284578.0, + "step": 19687 + }, + { + "epoch": 2.504515964889963, + "ewc_loss": 0.0792897567152977, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004193624190520495, + "grad_norm": 9.29368782043457, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8558684587478638, + "num_tokens": 751325794.0, + "step": 19688 + }, + { + "epoch": 2.5046431751685536, + "ewc_loss": 0.0790896937251091, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041736176353879273, + "grad_norm": 9.256421089172363, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8694557547569275, + "num_tokens": 751363777.0, + "step": 19689 + }, + { + "epoch": 2.504770385447144, + "ewc_loss": 0.07932935655117035, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004197584348730743, + "grad_norm": 9.359580039978027, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8549025058746338, + "num_tokens": 751400499.0, + "step": 19690 + }, + { + "epoch": 2.5048975957257347, + "ewc_loss": 0.07876373082399368, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041410213452763855, + "grad_norm": 9.244919776916504, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8603062629699707, + "num_tokens": 751434895.0, + "step": 19691 + }, + { + "epoch": 2.505024806004325, + "ewc_loss": 0.07929160445928574, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004193808708805591, + "grad_norm": 9.358453750610352, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8739835023880005, + "num_tokens": 751473984.0, + "step": 19692 + }, + { + "epoch": 2.5051520162829157, + "ewc_loss": 0.07884366810321808, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041490147123113275, + "grad_norm": 9.229873657226562, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8629144430160522, + "num_tokens": 751515030.0, + "step": 19693 + }, + { + "epoch": 2.5052792265615063, + "ewc_loss": 0.07933540642261505, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041981894173659384, + "grad_norm": 9.333160400390625, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8608791828155518, + "num_tokens": 751550023.0, + "step": 19694 + }, + { + "epoch": 2.505406436840097, + "ewc_loss": 0.07878734171390533, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004143382248003036, + "grad_norm": 9.186628341674805, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8638237714767456, + "num_tokens": 751584793.0, + "step": 19695 + }, + { + "epoch": 2.5055336471186873, + "ewc_loss": 0.07955674827098846, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004220323753543198, + "grad_norm": 9.37137508392334, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8440448045730591, + "num_tokens": 751619817.0, + "step": 19696 + }, + { + "epoch": 2.505660857397278, + "ewc_loss": 0.07879357784986496, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004144006234128028, + "grad_norm": 9.185712814331055, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8705753684043884, + "num_tokens": 751657097.0, + "step": 19697 + }, + { + "epoch": 2.5057880676758684, + "ewc_loss": 0.07945193350315094, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004209841717965901, + "grad_norm": 9.348569869995117, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8539080023765564, + "num_tokens": 751687042.0, + "step": 19698 + }, + { + "epoch": 2.5059152779544585, + "ewc_loss": 0.0787050873041153, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004135157505515963, + "grad_norm": 9.166540145874023, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8729242086410522, + "num_tokens": 751724281.0, + "step": 19699 + }, + { + "epoch": 2.5060424882330494, + "ewc_loss": 0.0798037201166153, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004220606351736933, + "grad_norm": 9.363240242004395, + "learning_rate": 1e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.84410560131073, + "num_tokens": 751765144.0, + "step": 19700 + }, + { + "epoch": 2.5061696985116395, + "ewc_loss": 0.078993059694767, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041395402513444424, + "grad_norm": 9.21186351776123, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8715590238571167, + "num_tokens": 751799602.0, + "step": 19701 + }, + { + "epoch": 2.5062969087902305, + "ewc_loss": 0.07937314361333847, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201962728984654, + "grad_norm": 9.301591873168945, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8716143369674683, + "num_tokens": 751836972.0, + "step": 19702 + }, + { + "epoch": 2.5064241190688206, + "ewc_loss": 0.07891063392162323, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041557123768143356, + "grad_norm": 9.198358535766602, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8642668724060059, + "num_tokens": 751869627.0, + "step": 19703 + }, + { + "epoch": 2.5065513293474115, + "ewc_loss": 0.0793289840221405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004197546513751149, + "grad_norm": 9.314350128173828, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8566121459007263, + "num_tokens": 751911103.0, + "step": 19704 + }, + { + "epoch": 2.5066785396260016, + "ewc_loss": 0.0791221410036087, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004152448964305222, + "grad_norm": 9.246905326843262, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8574291467666626, + "num_tokens": 751948956.0, + "step": 19705 + }, + { + "epoch": 2.506805749904592, + "ewc_loss": 0.0791252851486206, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041771764517761767, + "grad_norm": 9.246878623962402, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8665921688079834, + "num_tokens": 751992185.0, + "step": 19706 + }, + { + "epoch": 2.5069329601831827, + "ewc_loss": 0.07930924743413925, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004171159234829247, + "grad_norm": 9.252296447753906, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8594397902488708, + "num_tokens": 752032661.0, + "step": 19707 + }, + { + "epoch": 2.507060170461773, + "ewc_loss": 0.07932141423225403, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004147961444687098, + "grad_norm": 9.250381469726562, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8675228357315063, + "num_tokens": 752073946.0, + "step": 19708 + }, + { + "epoch": 2.5071873807403637, + "ewc_loss": 0.07901088893413544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004165736900176853, + "grad_norm": 9.201425552368164, + "learning_rate": 1e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8471987843513489, + "num_tokens": 752118766.0, + "step": 19709 + }, + { + "epoch": 2.5073145910189543, + "ewc_loss": 0.07931855320930481, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041965037235058844, + "grad_norm": 9.317333221435547, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8680779337882996, + "num_tokens": 752159669.0, + "step": 19710 + }, + { + "epoch": 2.507441801297545, + "ewc_loss": 0.07902617752552032, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004142852558288723, + "grad_norm": 9.222770690917969, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8665869832038879, + "num_tokens": 752198139.0, + "step": 19711 + }, + { + "epoch": 2.5075690115761353, + "ewc_loss": 0.07922548055648804, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041871960274875164, + "grad_norm": 9.234960556030273, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8655499815940857, + "num_tokens": 752239686.0, + "step": 19712 + }, + { + "epoch": 2.507696221854726, + "ewc_loss": 0.0789453387260437, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041591827175579965, + "grad_norm": 9.207619667053223, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8455580472946167, + "num_tokens": 752275740.0, + "step": 19713 + }, + { + "epoch": 2.5078234321333164, + "ewc_loss": 0.07901929318904877, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041665780008770525, + "grad_norm": 14.646557807922363, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.850435733795166, + "num_tokens": 752310554.0, + "step": 19714 + }, + { + "epoch": 2.507950642411907, + "ewc_loss": 0.08572973310947418, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00048376218182966113, + "grad_norm": 9.812531471252441, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8693370819091797, + "num_tokens": 752347761.0, + "step": 19715 + }, + { + "epoch": 2.5080778526904974, + "ewc_loss": 0.08218233287334442, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004482881922740489, + "grad_norm": 9.805891990661621, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8518643379211426, + "num_tokens": 752387128.0, + "step": 19716 + }, + { + "epoch": 2.508205062969088, + "ewc_loss": 0.07949443906545639, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004214092332404107, + "grad_norm": 9.261734008789062, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8611566424369812, + "num_tokens": 752428722.0, + "step": 19717 + }, + { + "epoch": 2.5083322732476785, + "ewc_loss": 0.08286473155021667, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004551121382974088, + "grad_norm": 9.823119163513184, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8616507649421692, + "num_tokens": 752468920.0, + "step": 19718 + }, + { + "epoch": 2.508459483526269, + "ewc_loss": 0.07958413660526276, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004223061550874263, + "grad_norm": 9.281572341918945, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8538816571235657, + "num_tokens": 752505226.0, + "step": 19719 + }, + { + "epoch": 2.5085866938048595, + "ewc_loss": 0.08151931315660477, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004416579904500395, + "grad_norm": 9.646613121032715, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8643314838409424, + "num_tokens": 752543693.0, + "step": 19720 + }, + { + "epoch": 2.50871390408345, + "ewc_loss": 0.07974094152450562, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004238742694724351, + "grad_norm": 9.348052024841309, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8480169177055359, + "num_tokens": 752578758.0, + "step": 19721 + }, + { + "epoch": 2.5088411143620406, + "ewc_loss": 0.0806926041841507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043339087278582156, + "grad_norm": 9.535085678100586, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.874899685382843, + "num_tokens": 752612115.0, + "step": 19722 + }, + { + "epoch": 2.508968324640631, + "ewc_loss": 0.07954388856887817, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004219037073198706, + "grad_norm": 9.344317436218262, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.869027853012085, + "num_tokens": 752644601.0, + "step": 19723 + }, + { + "epoch": 2.509095534919221, + "ewc_loss": 0.08007428050041199, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042720764758996665, + "grad_norm": 9.392866134643555, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8674099445343018, + "num_tokens": 752682333.0, + "step": 19724 + }, + { + "epoch": 2.509222745197812, + "ewc_loss": 0.07936538755893707, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201187693979591, + "grad_norm": 9.255240440368652, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8581480979919434, + "num_tokens": 752721802.0, + "step": 19725 + }, + { + "epoch": 2.5093499554764023, + "ewc_loss": 0.07985036820173264, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004249685152899474, + "grad_norm": 9.42833137512207, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8772523403167725, + "num_tokens": 752755673.0, + "step": 19726 + }, + { + "epoch": 2.5094771657549932, + "ewc_loss": 0.07945132255554199, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004185366560705006, + "grad_norm": 9.227900505065918, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.873734712600708, + "num_tokens": 752795575.0, + "step": 19727 + }, + { + "epoch": 2.5096043760335833, + "ewc_loss": 0.07974773645401001, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004239422269165516, + "grad_norm": 9.416444778442383, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8802753686904907, + "num_tokens": 752833172.0, + "step": 19728 + }, + { + "epoch": 2.5097315863121743, + "ewc_loss": 0.07922502607107162, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041627371683716774, + "grad_norm": 9.831949234008789, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8484406471252441, + "num_tokens": 752872857.0, + "step": 19729 + }, + { + "epoch": 2.5098587965907644, + "ewc_loss": 0.07836979627609253, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004101627564523369, + "grad_norm": 9.165202140808105, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8724368214607239, + "num_tokens": 752908553.0, + "step": 19730 + }, + { + "epoch": 2.509986006869355, + "ewc_loss": 0.08004462718963623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004269111668691039, + "grad_norm": 9.555006980895996, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8695403337478638, + "num_tokens": 752937762.0, + "step": 19731 + }, + { + "epoch": 2.5101132171479454, + "ewc_loss": 0.07802224904298782, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040668732253834605, + "grad_norm": 9.053030014038086, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8496075868606567, + "num_tokens": 752977184.0, + "step": 19732 + }, + { + "epoch": 2.510240427426536, + "ewc_loss": 0.0803675502538681, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004301402950659394, + "grad_norm": 9.511725425720215, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8581743836402893, + "num_tokens": 753021071.0, + "step": 19733 + }, + { + "epoch": 2.5103676377051265, + "ewc_loss": 0.07828228175640106, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040928766247816384, + "grad_norm": 9.106317520141602, + "learning_rate": 1e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8367947340011597, + "num_tokens": 753056597.0, + "step": 19734 + }, + { + "epoch": 2.510494847983717, + "ewc_loss": 0.08031222224235535, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004295870312489569, + "grad_norm": 9.49422836303711, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8657849431037903, + "num_tokens": 753096178.0, + "step": 19735 + }, + { + "epoch": 2.5106220582623076, + "ewc_loss": 0.07855331897735596, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004119980148971081, + "grad_norm": 9.130376815795898, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8660733699798584, + "num_tokens": 753130388.0, + "step": 19736 + }, + { + "epoch": 2.510749268540898, + "ewc_loss": 0.08027631044387817, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042922794818878174, + "grad_norm": 9.470512390136719, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.871971070766449, + "num_tokens": 753166585.0, + "step": 19737 + }, + { + "epoch": 2.5108764788194886, + "ewc_loss": 0.07879061996936798, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041437108302488923, + "grad_norm": 9.179879188537598, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8728135228157043, + "num_tokens": 753203403.0, + "step": 19738 + }, + { + "epoch": 2.511003689098079, + "ewc_loss": 0.07993195950984955, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004257844411768019, + "grad_norm": 9.37955379486084, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8704817295074463, + "num_tokens": 753241632.0, + "step": 19739 + }, + { + "epoch": 2.5111308993766697, + "ewc_loss": 0.07894425094127655, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041590729961171746, + "grad_norm": 9.223482131958008, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8771566152572632, + "num_tokens": 753279292.0, + "step": 19740 + }, + { + "epoch": 2.51125810965526, + "ewc_loss": 0.0796896368265152, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004233611689414829, + "grad_norm": 9.39660930633545, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8526337146759033, + "num_tokens": 753319121.0, + "step": 19741 + }, + { + "epoch": 2.5113853199338507, + "ewc_loss": 0.07879333198070526, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041439812048338354, + "grad_norm": 9.274703025817871, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8495641946792603, + "num_tokens": 753353074.0, + "step": 19742 + }, + { + "epoch": 2.5115125302124413, + "ewc_loss": 0.07932035624980927, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041966844582930207, + "grad_norm": 9.241548538208008, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8813704252243042, + "num_tokens": 753388994.0, + "step": 19743 + }, + { + "epoch": 2.511639740491032, + "ewc_loss": 0.0793505311012268, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004199701943434775, + "grad_norm": 9.26038932800293, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8702107667922974, + "num_tokens": 753430271.0, + "step": 19744 + }, + { + "epoch": 2.5117669507696223, + "ewc_loss": 0.07902562618255615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041672110091894865, + "grad_norm": 9.232584953308105, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8558882474899292, + "num_tokens": 753473820.0, + "step": 19745 + }, + { + "epoch": 2.511894161048213, + "ewc_loss": 0.07943718135356903, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042083667358383536, + "grad_norm": 9.29238510131836, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8592878580093384, + "num_tokens": 753516397.0, + "step": 19746 + }, + { + "epoch": 2.5120213713268034, + "ewc_loss": 0.07895605266094208, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041602534474805, + "grad_norm": 9.209285736083984, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8744379281997681, + "num_tokens": 753557589.0, + "step": 19747 + }, + { + "epoch": 2.512148581605394, + "ewc_loss": 0.07936467230319977, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201115225441754, + "grad_norm": 9.291986465454102, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8725643157958984, + "num_tokens": 753593764.0, + "step": 19748 + }, + { + "epoch": 2.512275791883984, + "ewc_loss": 0.07897721230983734, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004162370169069618, + "grad_norm": 9.182430267333984, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8503252267837524, + "num_tokens": 753635967.0, + "step": 19749 + }, + { + "epoch": 2.512403002162575, + "ewc_loss": 0.07940056920051575, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004204705765005201, + "grad_norm": 9.292830467224121, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8718011975288391, + "num_tokens": 753674595.0, + "step": 19750 + }, + { + "epoch": 2.512530212441165, + "ewc_loss": 0.07916112244129181, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004180760879535228, + "grad_norm": 9.242992401123047, + "learning_rate": 1e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8475040197372437, + "num_tokens": 753714118.0, + "step": 19751 + }, + { + "epoch": 2.512657422719756, + "ewc_loss": 0.07939890027046204, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004204538417980075, + "grad_norm": 9.284687995910645, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8497216105461121, + "num_tokens": 753758139.0, + "step": 19752 + }, + { + "epoch": 2.512784632998346, + "ewc_loss": 0.07946814596652985, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042114630923606455, + "grad_norm": 9.331310272216797, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8532239198684692, + "num_tokens": 753800373.0, + "step": 19753 + }, + { + "epoch": 2.5129118432769366, + "ewc_loss": 0.07927815616130829, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041924644028767943, + "grad_norm": 9.303686141967773, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8585795760154724, + "num_tokens": 753832366.0, + "step": 19754 + }, + { + "epoch": 2.513039053555527, + "ewc_loss": 0.0795239806175232, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004217046080157161, + "grad_norm": 9.289321899414062, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8826833963394165, + "num_tokens": 753866176.0, + "step": 19755 + }, + { + "epoch": 2.5131662638341177, + "ewc_loss": 0.07937419414520264, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004202067793812603, + "grad_norm": 9.278108596801758, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8624682426452637, + "num_tokens": 753911470.0, + "step": 19756 + }, + { + "epoch": 2.513293474112708, + "ewc_loss": 0.07945225387811661, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042098737321794033, + "grad_norm": 9.395350456237793, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8618969321250916, + "num_tokens": 753953027.0, + "step": 19757 + }, + { + "epoch": 2.5134206843912987, + "ewc_loss": 0.07908783853054047, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004173432243987918, + "grad_norm": 9.281181335449219, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8581147193908691, + "num_tokens": 753987992.0, + "step": 19758 + }, + { + "epoch": 2.5135478946698893, + "ewc_loss": 0.0794115886092186, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042058073449879885, + "grad_norm": 9.35888671875, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8677108287811279, + "num_tokens": 754026671.0, + "step": 19759 + }, + { + "epoch": 2.51367510494848, + "ewc_loss": 0.07897180318832397, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041618291288614273, + "grad_norm": 9.198554992675781, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8641812801361084, + "num_tokens": 754063526.0, + "step": 19760 + }, + { + "epoch": 2.5138023152270703, + "ewc_loss": 0.07954024523496628, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004218672984279692, + "grad_norm": 9.33824348449707, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8477423191070557, + "num_tokens": 754103361.0, + "step": 19761 + }, + { + "epoch": 2.513929525505661, + "ewc_loss": 0.07897259294986725, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041619077092036605, + "grad_norm": 9.205126762390137, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8659121990203857, + "num_tokens": 754141969.0, + "step": 19762 + }, + { + "epoch": 2.5140567357842514, + "ewc_loss": 0.07945539057254791, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210187471471727, + "grad_norm": 9.270312309265137, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.879173994064331, + "num_tokens": 754182691.0, + "step": 19763 + }, + { + "epoch": 2.514183946062842, + "ewc_loss": 0.07923801988363266, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041884504025802016, + "grad_norm": 9.230359077453613, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8633130192756653, + "num_tokens": 754219956.0, + "step": 19764 + }, + { + "epoch": 2.5143111563414324, + "ewc_loss": 0.07940490543842316, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042051388300023973, + "grad_norm": 9.270686149597168, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8581894636154175, + "num_tokens": 754259467.0, + "step": 19765 + }, + { + "epoch": 2.514438366620023, + "ewc_loss": 0.0794472023844719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004209368780720979, + "grad_norm": 9.284198760986328, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8661868572235107, + "num_tokens": 754300344.0, + "step": 19766 + }, + { + "epoch": 2.5145655768986135, + "ewc_loss": 0.07933977991342545, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004198626265861094, + "grad_norm": 9.305221557617188, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8650482892990112, + "num_tokens": 754342606.0, + "step": 19767 + }, + { + "epoch": 2.514692787177204, + "ewc_loss": 0.07959156483411789, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041993908234871924, + "grad_norm": 9.215556144714355, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8597270250320435, + "num_tokens": 754379946.0, + "step": 19768 + }, + { + "epoch": 2.5148199974557945, + "ewc_loss": 0.07954490929841995, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042191395186819136, + "grad_norm": 9.309173583984375, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8771839141845703, + "num_tokens": 754414353.0, + "step": 19769 + }, + { + "epoch": 2.514947207734385, + "ewc_loss": 0.07909870147705078, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174518398940563, + "grad_norm": 9.211277961730957, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8490321636199951, + "num_tokens": 754455506.0, + "step": 19770 + }, + { + "epoch": 2.5150744180129756, + "ewc_loss": 0.07959647476673126, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042242961353622377, + "grad_norm": 9.272872924804688, + "learning_rate": 1e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8467311263084412, + "num_tokens": 754498316.0, + "step": 19771 + }, + { + "epoch": 2.5152016282915657, + "ewc_loss": 0.07922552525997162, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041872006841003895, + "grad_norm": 9.305975914001465, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8517361879348755, + "num_tokens": 754534943.0, + "step": 19772 + }, + { + "epoch": 2.5153288385701567, + "ewc_loss": 0.07936741411685944, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042013899656012654, + "grad_norm": 9.26913833618164, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.861223578453064, + "num_tokens": 754572775.0, + "step": 19773 + }, + { + "epoch": 2.5154560488487467, + "ewc_loss": 0.07937490940093994, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042021393892355263, + "grad_norm": 9.21759033203125, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.872874915599823, + "num_tokens": 754610473.0, + "step": 19774 + }, + { + "epoch": 2.5155832591273377, + "ewc_loss": 0.07958324998617172, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004222973366267979, + "grad_norm": 9.361369132995605, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8641617298126221, + "num_tokens": 754649527.0, + "step": 19775 + }, + { + "epoch": 2.515710469405928, + "ewc_loss": 0.0790012925863266, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004164778220001608, + "grad_norm": 9.272003173828125, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8653976917266846, + "num_tokens": 754691513.0, + "step": 19776 + }, + { + "epoch": 2.5158376796845188, + "ewc_loss": 0.07943864166736603, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042085122549906373, + "grad_norm": 9.298515319824219, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8509266972541809, + "num_tokens": 754727932.0, + "step": 19777 + }, + { + "epoch": 2.515964889963109, + "ewc_loss": 0.07907734811306, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041723830508999527, + "grad_norm": 9.244034767150879, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8691229820251465, + "num_tokens": 754764571.0, + "step": 19778 + }, + { + "epoch": 2.5160921002416994, + "ewc_loss": 0.07948169112205505, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004212817584630102, + "grad_norm": 9.218402862548828, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8776875734329224, + "num_tokens": 754805846.0, + "step": 19779 + }, + { + "epoch": 2.51621931052029, + "ewc_loss": 0.07937085628509521, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201734554953873, + "grad_norm": 9.265824317932129, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8724104762077332, + "num_tokens": 754844016.0, + "step": 19780 + }, + { + "epoch": 2.5163465207988804, + "ewc_loss": 0.07950283586978912, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004214932559989393, + "grad_norm": 9.26806926727295, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8629125952720642, + "num_tokens": 754886493.0, + "step": 19781 + }, + { + "epoch": 2.516473731077471, + "ewc_loss": 0.0793142318725586, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041960718226619065, + "grad_norm": 9.24764347076416, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8666820526123047, + "num_tokens": 754928195.0, + "step": 19782 + }, + { + "epoch": 2.5166009413560615, + "ewc_loss": 0.07959174364805222, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004223822907079011, + "grad_norm": 9.364191055297852, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8731176853179932, + "num_tokens": 754958399.0, + "step": 19783 + }, + { + "epoch": 2.516728151634652, + "ewc_loss": 0.07914569973945618, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000417921895859763, + "grad_norm": 9.144340515136719, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8681257367134094, + "num_tokens": 754997926.0, + "step": 19784 + }, + { + "epoch": 2.5168553619132426, + "ewc_loss": 0.07996479421854019, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004261127905920148, + "grad_norm": 9.375197410583496, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8705215454101562, + "num_tokens": 755040577.0, + "step": 19785 + }, + { + "epoch": 2.516982572191833, + "ewc_loss": 0.07899445295333862, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004164093697909266, + "grad_norm": 9.19321060180664, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8604499101638794, + "num_tokens": 755070458.0, + "step": 19786 + }, + { + "epoch": 2.5171097824704236, + "ewc_loss": 0.07998350262641907, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004262998700141907, + "grad_norm": 9.394023895263672, + "learning_rate": 1e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8430041074752808, + "num_tokens": 755117205.0, + "step": 19787 + }, + { + "epoch": 2.517236992749014, + "ewc_loss": 0.07881477475166321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041461261571384966, + "grad_norm": 9.13224983215332, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.866464376449585, + "num_tokens": 755162098.0, + "step": 19788 + }, + { + "epoch": 2.5173642030276047, + "ewc_loss": 0.08004368841648102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004269017663318664, + "grad_norm": 9.421003341674805, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8604937791824341, + "num_tokens": 755199655.0, + "step": 19789 + }, + { + "epoch": 2.517491413306195, + "ewc_loss": 0.07888332009315491, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041529806912876666, + "grad_norm": 9.164634704589844, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8777530789375305, + "num_tokens": 755234165.0, + "step": 19790 + }, + { + "epoch": 2.5176186235847857, + "ewc_loss": 0.07997780293226242, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042624285561032593, + "grad_norm": 9.35053539276123, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8648121953010559, + "num_tokens": 755271933.0, + "step": 19791 + }, + { + "epoch": 2.5177458338633762, + "ewc_loss": 0.07894017547369003, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004158665833529085, + "grad_norm": 9.168267250061035, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8732411861419678, + "num_tokens": 755306903.0, + "step": 19792 + }, + { + "epoch": 2.5178730441419668, + "ewc_loss": 0.08044258505105972, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042600787128321826, + "grad_norm": 9.65323543548584, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8721731901168823, + "num_tokens": 755347613.0, + "step": 19793 + }, + { + "epoch": 2.5180002544205573, + "ewc_loss": 0.07877861708402634, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004118095966987312, + "grad_norm": 9.022672653198242, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8833770155906677, + "num_tokens": 755380541.0, + "step": 19794 + }, + { + "epoch": 2.518127464699148, + "ewc_loss": 0.08071434497833252, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004336082492955029, + "grad_norm": 9.521156311035156, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8627185225486755, + "num_tokens": 755422502.0, + "step": 19795 + }, + { + "epoch": 2.5182546749777384, + "ewc_loss": 0.07861365377902985, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041260133730247617, + "grad_norm": 9.094438552856445, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8651915788650513, + "num_tokens": 755463975.0, + "step": 19796 + }, + { + "epoch": 2.5183818852563284, + "ewc_loss": 0.08065368235111237, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004330016381572932, + "grad_norm": 9.468308448791504, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8644050359725952, + "num_tokens": 755504873.0, + "step": 19797 + }, + { + "epoch": 2.5185090955349194, + "ewc_loss": 0.07881021499633789, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004145670391153544, + "grad_norm": 9.113216400146484, + "learning_rate": 1e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8389075994491577, + "num_tokens": 755545646.0, + "step": 19798 + }, + { + "epoch": 2.5186363058135095, + "ewc_loss": 0.08040357381105423, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043050057138316333, + "grad_norm": 9.52199935913086, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8514649271965027, + "num_tokens": 755577228.0, + "step": 19799 + }, + { + "epoch": 2.5187635160921005, + "ewc_loss": 0.0787171870470047, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041363670607097447, + "grad_norm": 9.09683609008789, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8731399774551392, + "num_tokens": 755621437.0, + "step": 19800 + }, + { + "epoch": 2.5188907263706906, + "ewc_loss": 0.08063594996929169, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004328243958298117, + "grad_norm": 9.480657577514648, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8636919856071472, + "num_tokens": 755658266.0, + "step": 19801 + }, + { + "epoch": 2.5190179366492815, + "ewc_loss": 0.07876487821340561, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041411363054066896, + "grad_norm": 9.136617660522461, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8517581224441528, + "num_tokens": 755696755.0, + "step": 19802 + }, + { + "epoch": 2.5191451469278716, + "ewc_loss": 0.08037495613098145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043021439341828227, + "grad_norm": 9.474802017211914, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8786758184432983, + "num_tokens": 755732434.0, + "step": 19803 + }, + { + "epoch": 2.519272357206462, + "ewc_loss": 0.0789460614323616, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041592546040192246, + "grad_norm": 9.137449264526367, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8681203126907349, + "num_tokens": 755768488.0, + "step": 19804 + }, + { + "epoch": 2.5193995674850527, + "ewc_loss": 0.08053542673587799, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004293776582926512, + "grad_norm": 9.520586013793945, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8688218593597412, + "num_tokens": 755799637.0, + "step": 19805 + }, + { + "epoch": 2.519526777763643, + "ewc_loss": 0.07870366424322128, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004135014896746725, + "grad_norm": 9.16772747039795, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.849573016166687, + "num_tokens": 755834229.0, + "step": 19806 + }, + { + "epoch": 2.5196539880422337, + "ewc_loss": 0.08028681576251984, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004293329839129001, + "grad_norm": 9.456564903259277, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8684854507446289, + "num_tokens": 755871776.0, + "step": 19807 + }, + { + "epoch": 2.5197811983208243, + "ewc_loss": 0.0789143368601799, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041560822864994407, + "grad_norm": 9.197677612304688, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8746711015701294, + "num_tokens": 755911125.0, + "step": 19808 + }, + { + "epoch": 2.519908408599415, + "ewc_loss": 0.07996723055839539, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042613717960193753, + "grad_norm": 9.386165618896484, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.871952474117279, + "num_tokens": 755952204.0, + "step": 19809 + }, + { + "epoch": 2.5200356188780053, + "ewc_loss": 0.07904611527919769, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004169260209891945, + "grad_norm": 9.27917766571045, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8541327714920044, + "num_tokens": 755996702.0, + "step": 19810 + }, + { + "epoch": 2.520162829156596, + "ewc_loss": 0.0796172246336937, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004226370947435498, + "grad_norm": 9.285177230834961, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8822425007820129, + "num_tokens": 756033285.0, + "step": 19811 + }, + { + "epoch": 2.5202900394351864, + "ewc_loss": 0.07920288294553757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000418493669712916, + "grad_norm": 9.34925365447998, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8638497591018677, + "num_tokens": 756068851.0, + "step": 19812 + }, + { + "epoch": 2.520417249713777, + "ewc_loss": 0.07913684844970703, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041783327469602227, + "grad_norm": 9.235386848449707, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8561157584190369, + "num_tokens": 756111676.0, + "step": 19813 + }, + { + "epoch": 2.5205444599923674, + "ewc_loss": 0.07964259386062622, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042289079283364117, + "grad_norm": 9.606036186218262, + "learning_rate": 1e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8475064635276794, + "num_tokens": 756146575.0, + "step": 19814 + }, + { + "epoch": 2.520671670270958, + "ewc_loss": 0.07830716669559479, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040953652933239937, + "grad_norm": 9.080020904541016, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8540134429931641, + "num_tokens": 756190006.0, + "step": 19815 + }, + { + "epoch": 2.5207988805495485, + "ewc_loss": 0.08033210039138794, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042978583951480687, + "grad_norm": 9.591371536254883, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8762322664260864, + "num_tokens": 756232828.0, + "step": 19816 + }, + { + "epoch": 2.520926090828139, + "ewc_loss": 0.07819485664367676, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000408413412515074, + "grad_norm": 9.064789772033691, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8699029684066772, + "num_tokens": 756273077.0, + "step": 19817 + }, + { + "epoch": 2.5210533011067295, + "ewc_loss": 0.08063210546970367, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043278589146211743, + "grad_norm": 9.638141632080078, + "learning_rate": 1e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8483215570449829, + "num_tokens": 756313421.0, + "step": 19818 + }, + { + "epoch": 2.52118051138532, + "ewc_loss": 0.0782080739736557, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040854557300917804, + "grad_norm": 9.095732688903809, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.859358549118042, + "num_tokens": 756356461.0, + "step": 19819 + }, + { + "epoch": 2.5213077216639106, + "ewc_loss": 0.08105827867984772, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004321648448240012, + "grad_norm": 9.63390827178955, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8589745759963989, + "num_tokens": 756388309.0, + "step": 19820 + }, + { + "epoch": 2.521434931942501, + "ewc_loss": 0.07843266427516937, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041079145739786327, + "grad_norm": 9.173075675964355, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8753929734230042, + "num_tokens": 756426655.0, + "step": 19821 + }, + { + "epoch": 2.521562142221091, + "ewc_loss": 0.08028712868690491, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004293361271265894, + "grad_norm": 9.559444427490234, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8597415685653687, + "num_tokens": 756464961.0, + "step": 19822 + }, + { + "epoch": 2.521689352499682, + "ewc_loss": 0.07883447408676147, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004123681574128568, + "grad_norm": 9.191946029663086, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8786217570304871, + "num_tokens": 756497338.0, + "step": 19823 + }, + { + "epoch": 2.5218165627782723, + "ewc_loss": 0.08022040128707886, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004262274887878448, + "grad_norm": 9.434237480163574, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8703787326812744, + "num_tokens": 756535324.0, + "step": 19824 + }, + { + "epoch": 2.5219437730568632, + "ewc_loss": 0.07901670783758163, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041419052286073565, + "grad_norm": 9.170427322387695, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8732136487960815, + "num_tokens": 756574138.0, + "step": 19825 + }, + { + "epoch": 2.5220709833354533, + "ewc_loss": 0.08011359721422195, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004251594073139131, + "grad_norm": 9.436284065246582, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8751932382583618, + "num_tokens": 756608967.0, + "step": 19826 + }, + { + "epoch": 2.522198193614044, + "ewc_loss": 0.07913925498723984, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004154159687459469, + "grad_norm": 9.2208890914917, + "learning_rate": 1e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8427673578262329, + "num_tokens": 756653882.0, + "step": 19827 + }, + { + "epoch": 2.5223254038926344, + "ewc_loss": 0.08002109825611115, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042423440027050674, + "grad_norm": 9.357966423034668, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8578506708145142, + "num_tokens": 756695638.0, + "step": 19828 + }, + { + "epoch": 2.522452614171225, + "ewc_loss": 0.07926109433174133, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041663434240035713, + "grad_norm": 9.217187881469727, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8549243211746216, + "num_tokens": 756734964.0, + "step": 19829 + }, + { + "epoch": 2.5225798244498154, + "ewc_loss": 0.08004026859998703, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042442610720172524, + "grad_norm": 9.377411842346191, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8676744699478149, + "num_tokens": 756768454.0, + "step": 19830 + }, + { + "epoch": 2.522707034728406, + "ewc_loss": 0.07936479151248932, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041767131187953055, + "grad_norm": 9.219332695007324, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8528201580047607, + "num_tokens": 756801696.0, + "step": 19831 + }, + { + "epoch": 2.5228342450069965, + "ewc_loss": 0.0800119936466217, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004241433634888381, + "grad_norm": 9.345772743225098, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.867184042930603, + "num_tokens": 756839874.0, + "step": 19832 + }, + { + "epoch": 2.522961455285587, + "ewc_loss": 0.0793486088514328, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004175094945821911, + "grad_norm": 9.202117919921875, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8535580635070801, + "num_tokens": 756884179.0, + "step": 19833 + }, + { + "epoch": 2.5230886655641775, + "ewc_loss": 0.0798742026090622, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042276541353203356, + "grad_norm": 9.320176124572754, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8656542301177979, + "num_tokens": 756922725.0, + "step": 19834 + }, + { + "epoch": 2.523215875842768, + "ewc_loss": 0.07938767969608307, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041790021350607276, + "grad_norm": 9.20945930480957, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8601592779159546, + "num_tokens": 756956080.0, + "step": 19835 + }, + { + "epoch": 2.5233430861213586, + "ewc_loss": 0.07996438443660736, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004236672830302268, + "grad_norm": 9.35007095336914, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8539857864379883, + "num_tokens": 756995137.0, + "step": 19836 + }, + { + "epoch": 2.523470296399949, + "ewc_loss": 0.07944218069314957, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000418445240939036, + "grad_norm": 9.169780731201172, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8676029443740845, + "num_tokens": 757035300.0, + "step": 19837 + }, + { + "epoch": 2.5235975066785397, + "ewc_loss": 0.0800635814666748, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004246591997798532, + "grad_norm": 9.367169380187988, + "learning_rate": 1e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8390752673149109, + "num_tokens": 757068789.0, + "step": 19838 + }, + { + "epoch": 2.52372471695713, + "ewc_loss": 0.07925339043140411, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004165573918726295, + "grad_norm": 9.203842163085938, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8571445941925049, + "num_tokens": 757102913.0, + "step": 19839 + }, + { + "epoch": 2.5238519272357207, + "ewc_loss": 0.08014561235904694, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042547957855276763, + "grad_norm": 9.313121795654297, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.865715742111206, + "num_tokens": 757138932.0, + "step": 19840 + }, + { + "epoch": 2.5239791375143112, + "ewc_loss": 0.07942728698253632, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041829628753475845, + "grad_norm": 9.152039527893066, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8661510944366455, + "num_tokens": 757179181.0, + "step": 19841 + }, + { + "epoch": 2.5241063477929018, + "ewc_loss": 0.08015662431716919, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042558967834338546, + "grad_norm": 9.365265846252441, + "learning_rate": 1e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8328605890274048, + "num_tokens": 757224209.0, + "step": 19842 + }, + { + "epoch": 2.5242335580714923, + "ewc_loss": 0.07941002398729324, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041812367271631956, + "grad_norm": 9.165771484375, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8774609565734863, + "num_tokens": 757262398.0, + "step": 19843 + }, + { + "epoch": 2.524360768350083, + "ewc_loss": 0.08012109249830246, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004252343496773392, + "grad_norm": 9.313614845275879, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8542810678482056, + "num_tokens": 757301744.0, + "step": 19844 + }, + { + "epoch": 2.5244879786286734, + "ewc_loss": 0.07947863638401031, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004188097664155066, + "grad_norm": 9.219335556030273, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8681060075759888, + "num_tokens": 757338273.0, + "step": 19845 + }, + { + "epoch": 2.524615188907264, + "ewc_loss": 0.07989683002233505, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004229917249176651, + "grad_norm": 9.277798652648926, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8555155992507935, + "num_tokens": 757383169.0, + "step": 19846 + }, + { + "epoch": 2.524742399185854, + "ewc_loss": 0.07971454411745071, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004211688647046685, + "grad_norm": 9.257660865783691, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8520676493644714, + "num_tokens": 757420998.0, + "step": 19847 + }, + { + "epoch": 2.524869609464445, + "ewc_loss": 0.07953176647424698, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004193411150481552, + "grad_norm": 9.204671859741211, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8679366111755371, + "num_tokens": 757463281.0, + "step": 19848 + }, + { + "epoch": 2.524996819743035, + "ewc_loss": 0.07981778681278229, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004222012939862907, + "grad_norm": 9.303513526916504, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8574497103691101, + "num_tokens": 757495022.0, + "step": 19849 + }, + { + "epoch": 2.525124030021626, + "ewc_loss": 0.0794738307595253, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000418761745095253, + "grad_norm": 9.212356567382812, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8685541749000549, + "num_tokens": 757536801.0, + "step": 19850 + }, + { + "epoch": 2.525251240300216, + "ewc_loss": 0.0798632800579071, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004226562741678208, + "grad_norm": 9.27836799621582, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8783426880836487, + "num_tokens": 757573419.0, + "step": 19851 + }, + { + "epoch": 2.5253784505788066, + "ewc_loss": 0.07955460250377655, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004195694637019187, + "grad_norm": 9.23586654663086, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8568544387817383, + "num_tokens": 757610773.0, + "step": 19852 + }, + { + "epoch": 2.525505660857397, + "ewc_loss": 0.07971495389938354, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042117293924093246, + "grad_norm": 9.268921852111816, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8729432821273804, + "num_tokens": 757648067.0, + "step": 19853 + }, + { + "epoch": 2.5256328711359877, + "ewc_loss": 0.07953140139579773, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004193374770693481, + "grad_norm": 9.20766544342041, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.869112491607666, + "num_tokens": 757681598.0, + "step": 19854 + }, + { + "epoch": 2.525760081414578, + "ewc_loss": 0.07987439632415771, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042276736348867416, + "grad_norm": 9.282968521118164, + "learning_rate": 1e-06, + "loss": 0.556, + "mean_token_accuracy": 0.844690203666687, + "num_tokens": 757724752.0, + "step": 19855 + }, + { + "epoch": 2.5258872916931687, + "ewc_loss": 0.07950033992528915, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004190268518868834, + "grad_norm": 9.192484855651855, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8753046989440918, + "num_tokens": 757763801.0, + "step": 19856 + }, + { + "epoch": 2.5260145019717593, + "ewc_loss": 0.07992050051689148, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004232283972669393, + "grad_norm": 9.353554725646973, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8612475395202637, + "num_tokens": 757799235.0, + "step": 19857 + }, + { + "epoch": 2.52614171225035, + "ewc_loss": 0.07926519215106964, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004166754079051316, + "grad_norm": 9.199767112731934, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8677403330802917, + "num_tokens": 757839695.0, + "step": 19858 + }, + { + "epoch": 2.5262689225289403, + "ewc_loss": 0.08013589680194855, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042538242996670306, + "grad_norm": 9.420846939086914, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8516262173652649, + "num_tokens": 757877152.0, + "step": 19859 + }, + { + "epoch": 2.526396132807531, + "ewc_loss": 0.0791543498635292, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041556693031452596, + "grad_norm": 9.19329833984375, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8626843690872192, + "num_tokens": 757916173.0, + "step": 19860 + }, + { + "epoch": 2.5265233430861214, + "ewc_loss": 0.08032143115997314, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042479633702896535, + "grad_norm": 9.362366676330566, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8688490390777588, + "num_tokens": 757957271.0, + "step": 19861 + }, + { + "epoch": 2.526650553364712, + "ewc_loss": 0.07924691587686539, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041649260674603283, + "grad_norm": 9.199113845825195, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8576310276985168, + "num_tokens": 757995553.0, + "step": 19862 + }, + { + "epoch": 2.5267777636433024, + "ewc_loss": 0.0799664855003357, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004236882377881557, + "grad_norm": 9.335960388183594, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8521254062652588, + "num_tokens": 758035095.0, + "step": 19863 + }, + { + "epoch": 2.526904973921893, + "ewc_loss": 0.0793319046497345, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004173424676991999, + "grad_norm": 9.19224739074707, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8628392219543457, + "num_tokens": 758071185.0, + "step": 19864 + }, + { + "epoch": 2.5270321842004835, + "ewc_loss": 0.08000406622886658, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004240640555508435, + "grad_norm": 9.427532196044922, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.865530788898468, + "num_tokens": 758110251.0, + "step": 19865 + }, + { + "epoch": 2.527159394479074, + "ewc_loss": 0.0792492926120758, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000416516384575516, + "grad_norm": 9.204594612121582, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8493800759315491, + "num_tokens": 758144385.0, + "step": 19866 + }, + { + "epoch": 2.5272866047576645, + "ewc_loss": 0.08002574741840363, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042428087908774614, + "grad_norm": 9.338371276855469, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8749629855155945, + "num_tokens": 758185935.0, + "step": 19867 + }, + { + "epoch": 2.527413815036255, + "ewc_loss": 0.07931733131408691, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004171967157162726, + "grad_norm": 9.193763732910156, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.866919755935669, + "num_tokens": 758226688.0, + "step": 19868 + }, + { + "epoch": 2.5275410253148456, + "ewc_loss": 0.07999122142791748, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042393564945086837, + "grad_norm": 9.391289710998535, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8700900077819824, + "num_tokens": 758263866.0, + "step": 19869 + }, + { + "epoch": 2.5276682355934357, + "ewc_loss": 0.07931029796600342, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004171264299657196, + "grad_norm": 9.256446838378906, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8699938058853149, + "num_tokens": 758303607.0, + "step": 19870 + }, + { + "epoch": 2.5277954458720266, + "ewc_loss": 0.0797744169831276, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042176758870482445, + "grad_norm": 9.403987884521484, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8665551543235779, + "num_tokens": 758340120.0, + "step": 19871 + }, + { + "epoch": 2.5279226561506167, + "ewc_loss": 0.07964952290058136, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004180772230029106, + "grad_norm": 9.283469200134277, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.861796498298645, + "num_tokens": 758373792.0, + "step": 19872 + }, + { + "epoch": 2.5280498664292077, + "ewc_loss": 0.0798206776380539, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004222301649861038, + "grad_norm": 9.481719970703125, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8517541289329529, + "num_tokens": 758418157.0, + "step": 19873 + }, + { + "epoch": 2.528177076707798, + "ewc_loss": 0.07914560288190842, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004154794733040035, + "grad_norm": 9.294321060180664, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8551303148269653, + "num_tokens": 758454042.0, + "step": 19874 + }, + { + "epoch": 2.5283042869863888, + "ewc_loss": 0.0799420177936554, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042344359098933637, + "grad_norm": 9.369466781616211, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8659155368804932, + "num_tokens": 758495305.0, + "step": 19875 + }, + { + "epoch": 2.528431497264979, + "ewc_loss": 0.07908869534730911, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041491040610708296, + "grad_norm": 9.171832084655762, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8705273866653442, + "num_tokens": 758532646.0, + "step": 19876 + }, + { + "epoch": 2.5285587075435694, + "ewc_loss": 0.08004040271043777, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004244274750817567, + "grad_norm": 9.450841903686523, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8582466244697571, + "num_tokens": 758568329.0, + "step": 19877 + }, + { + "epoch": 2.52868591782216, + "ewc_loss": 0.07891517877578735, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041317520663142204, + "grad_norm": 9.088607788085938, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8676161766052246, + "num_tokens": 758608246.0, + "step": 19878 + }, + { + "epoch": 2.5288131281007504, + "ewc_loss": 0.08033683896064758, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004273918748367578, + "grad_norm": 9.566166877746582, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8731042146682739, + "num_tokens": 758645356.0, + "step": 19879 + }, + { + "epoch": 2.528940338379341, + "ewc_loss": 0.07869845628738403, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004110080481041223, + "grad_norm": 9.035033226013184, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8792490363121033, + "num_tokens": 758685657.0, + "step": 19880 + }, + { + "epoch": 2.5290675486579315, + "ewc_loss": 0.08095692098140717, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043359267874620855, + "grad_norm": 9.831268310546875, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8684843182563782, + "num_tokens": 758718843.0, + "step": 19881 + }, + { + "epoch": 2.529194758936522, + "ewc_loss": 0.07835585623979568, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004075820033904165, + "grad_norm": 8.97996711730957, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8679239153862, + "num_tokens": 758757946.0, + "step": 19882 + }, + { + "epoch": 2.5293219692151125, + "ewc_loss": 0.08198095858097076, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004438330652192235, + "grad_norm": 10.171882629394531, + "learning_rate": 1e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8437120914459229, + "num_tokens": 758792832.0, + "step": 19883 + }, + { + "epoch": 2.529449179493703, + "ewc_loss": 0.07830491662025452, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004070726572535932, + "grad_norm": 8.938394546508789, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8551223278045654, + "num_tokens": 758826062.0, + "step": 19884 + }, + { + "epoch": 2.5295763897722936, + "ewc_loss": 0.08308230340480804, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004548465076368302, + "grad_norm": 10.250093460083008, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8734577298164368, + "num_tokens": 758860940.0, + "step": 19885 + }, + { + "epoch": 2.529703600050884, + "ewc_loss": 0.07886059582233429, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041262939339503646, + "grad_norm": 8.988018989562988, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8627377152442932, + "num_tokens": 758900740.0, + "step": 19886 + }, + { + "epoch": 2.5298308103294747, + "ewc_loss": 0.08329325914382935, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004569559823721647, + "grad_norm": 9.933342933654785, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.859470784664154, + "num_tokens": 758942786.0, + "step": 19887 + }, + { + "epoch": 2.529958020608065, + "ewc_loss": 0.07970797270536423, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004211031482554972, + "grad_norm": 9.369723320007324, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8729671239852905, + "num_tokens": 758983376.0, + "step": 19888 + }, + { + "epoch": 2.5300852308866557, + "ewc_loss": 0.08167487382888794, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004407721571624279, + "grad_norm": 9.62701416015625, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8518882989883423, + "num_tokens": 759024007.0, + "step": 19889 + }, + { + "epoch": 2.5302124411652462, + "ewc_loss": 0.08007916808128357, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042481510899960995, + "grad_norm": 9.341109275817871, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8719663023948669, + "num_tokens": 759061611.0, + "step": 19890 + }, + { + "epoch": 2.5303396514438368, + "ewc_loss": 0.08054561167955399, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004294795508030802, + "grad_norm": 9.579253196716309, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.845756471157074, + "num_tokens": 759101942.0, + "step": 19891 + }, + { + "epoch": 2.5304668617224273, + "ewc_loss": 0.07969038188457489, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004233686195220798, + "grad_norm": 9.381814956665039, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.879942774772644, + "num_tokens": 759140281.0, + "step": 19892 + }, + { + "epoch": 2.530594072001018, + "ewc_loss": 0.08018404245376587, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004258638364262879, + "grad_norm": 9.546168327331543, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8639799952507019, + "num_tokens": 759174319.0, + "step": 19893 + }, + { + "epoch": 2.5307212822796084, + "ewc_loss": 0.07948678731918335, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041889125714078546, + "grad_norm": 9.570067405700684, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8698908090591431, + "num_tokens": 759208981.0, + "step": 19894 + }, + { + "epoch": 2.5308484925581984, + "ewc_loss": 0.07907912135124207, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004172560293227434, + "grad_norm": 9.30090618133545, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8653473854064941, + "num_tokens": 759240559.0, + "step": 19895 + }, + { + "epoch": 2.5309757028367894, + "ewc_loss": 0.07974402606487274, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004239051486365497, + "grad_norm": 9.457752227783203, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8754885196685791, + "num_tokens": 759280550.0, + "step": 19896 + }, + { + "epoch": 2.5311029131153795, + "ewc_loss": 0.07878506928682327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004143155238125473, + "grad_norm": 9.331260681152344, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8572033047676086, + "num_tokens": 759321298.0, + "step": 19897 + }, + { + "epoch": 2.5312301233939705, + "ewc_loss": 0.07940946519374847, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042055954691022635, + "grad_norm": 9.406754493713379, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8754801750183105, + "num_tokens": 759354627.0, + "step": 19898 + }, + { + "epoch": 2.5313573336725606, + "ewc_loss": 0.07891540229320526, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004156188515480608, + "grad_norm": 9.241721153259277, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8584888577461243, + "num_tokens": 759388129.0, + "step": 19899 + }, + { + "epoch": 2.5314845439511515, + "ewc_loss": 0.0794496238231659, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000420961034251377, + "grad_norm": 9.340909957885742, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8731739521026611, + "num_tokens": 759423667.0, + "step": 19900 + }, + { + "epoch": 2.5316117542297416, + "ewc_loss": 0.07916708290576935, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041813572170212865, + "grad_norm": 9.27455997467041, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8550058007240295, + "num_tokens": 759466586.0, + "step": 19901 + }, + { + "epoch": 2.531738964508332, + "ewc_loss": 0.0795169398188591, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004216342349536717, + "grad_norm": 9.374134063720703, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.858587384223938, + "num_tokens": 759504214.0, + "step": 19902 + }, + { + "epoch": 2.5318661747869227, + "ewc_loss": 0.07904194295406342, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004168842569924891, + "grad_norm": 9.276467323303223, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8487688899040222, + "num_tokens": 759543865.0, + "step": 19903 + }, + { + "epoch": 2.531993385065513, + "ewc_loss": 0.07937704026699066, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004202352138236165, + "grad_norm": 9.282772064208984, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8595025539398193, + "num_tokens": 759581917.0, + "step": 19904 + }, + { + "epoch": 2.5321205953441037, + "ewc_loss": 0.07913769781589508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041784177301451564, + "grad_norm": 9.288005828857422, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8658227324485779, + "num_tokens": 759617641.0, + "step": 19905 + }, + { + "epoch": 2.5322478056226942, + "ewc_loss": 0.07940732687711716, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042053809738717973, + "grad_norm": 9.319016456604004, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8752785921096802, + "num_tokens": 759649097.0, + "step": 19906 + }, + { + "epoch": 2.5323750159012848, + "ewc_loss": 0.07925059646368027, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004189707979094237, + "grad_norm": 9.266756057739258, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.859349250793457, + "num_tokens": 759688802.0, + "step": 19907 + }, + { + "epoch": 2.5325022261798753, + "ewc_loss": 0.0793682336807251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201471747364849, + "grad_norm": 9.24521541595459, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8724246025085449, + "num_tokens": 759724243.0, + "step": 19908 + }, + { + "epoch": 2.532629436458466, + "ewc_loss": 0.07943160086870193, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042078085243701935, + "grad_norm": 9.273463249206543, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8576480150222778, + "num_tokens": 759764646.0, + "step": 19909 + }, + { + "epoch": 2.5327566467370564, + "ewc_loss": 0.07924912869930267, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004189561295788735, + "grad_norm": 9.152107238769531, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8668614625930786, + "num_tokens": 759806939.0, + "step": 19910 + }, + { + "epoch": 2.532883857015647, + "ewc_loss": 0.07976797968149185, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042414464405737817, + "grad_norm": 9.338695526123047, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8681637644767761, + "num_tokens": 759848138.0, + "step": 19911 + }, + { + "epoch": 2.5330110672942374, + "ewc_loss": 0.07902461290359497, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041671100188978016, + "grad_norm": 9.166056632995605, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8608866930007935, + "num_tokens": 759881877.0, + "step": 19912 + }, + { + "epoch": 2.533138277572828, + "ewc_loss": 0.0798390656709671, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424855446908623, + "grad_norm": 9.358213424682617, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.87450110912323, + "num_tokens": 759922625.0, + "step": 19913 + }, + { + "epoch": 2.5332654878514185, + "ewc_loss": 0.07906046509742737, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004170694446656853, + "grad_norm": 9.236705780029297, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8758385181427002, + "num_tokens": 759952429.0, + "step": 19914 + }, + { + "epoch": 2.533392698130009, + "ewc_loss": 0.07958699017763138, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004223347641527653, + "grad_norm": 9.344751358032227, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8660399913787842, + "num_tokens": 759987678.0, + "step": 19915 + }, + { + "epoch": 2.5335199084085995, + "ewc_loss": 0.07907575368881226, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004172223270870745, + "grad_norm": 9.199946403503418, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8504875302314758, + "num_tokens": 760023943.0, + "step": 19916 + }, + { + "epoch": 2.53364711868719, + "ewc_loss": 0.0797533318400383, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042399816447868943, + "grad_norm": 9.381372451782227, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8677955865859985, + "num_tokens": 760068229.0, + "step": 19917 + }, + { + "epoch": 2.5337743289657806, + "ewc_loss": 0.078855961561203, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000415024405810982, + "grad_norm": 9.214272499084473, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8508290648460388, + "num_tokens": 760109440.0, + "step": 19918 + }, + { + "epoch": 2.533901539244371, + "ewc_loss": 0.0797400251030922, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042386510176584125, + "grad_norm": 9.443081855773926, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8640439510345459, + "num_tokens": 760146616.0, + "step": 19919 + }, + { + "epoch": 2.534028749522961, + "ewc_loss": 0.07875658571720123, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004140307428315282, + "grad_norm": 9.152796745300293, + "learning_rate": 1e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8447117805480957, + "num_tokens": 760188708.0, + "step": 19920 + }, + { + "epoch": 2.534155959801552, + "ewc_loss": 0.07975254207849503, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004239902482368052, + "grad_norm": 9.319164276123047, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8803825378417969, + "num_tokens": 760231355.0, + "step": 19921 + }, + { + "epoch": 2.5342831700801423, + "ewc_loss": 0.0788370817899704, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041483566747047007, + "grad_norm": 9.187002182006836, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8506127595901489, + "num_tokens": 760268316.0, + "step": 19922 + }, + { + "epoch": 2.5344103803587332, + "ewc_loss": 0.07965922355651855, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004230570630170405, + "grad_norm": 9.352313041687012, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8551054000854492, + "num_tokens": 760305420.0, + "step": 19923 + }, + { + "epoch": 2.5345375906373233, + "ewc_loss": 0.0790681540966034, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041714642429724336, + "grad_norm": 9.231912612915039, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8823890089988708, + "num_tokens": 760342195.0, + "step": 19924 + }, + { + "epoch": 2.534664800915914, + "ewc_loss": 0.0795656368136406, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042212120024487376, + "grad_norm": 9.376236915588379, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8778005838394165, + "num_tokens": 760375394.0, + "step": 19925 + }, + { + "epoch": 2.5347920111945044, + "ewc_loss": 0.07913750410079956, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041783993947319686, + "grad_norm": 9.284399032592773, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8530527353286743, + "num_tokens": 760411629.0, + "step": 19926 + }, + { + "epoch": 2.534919221473095, + "ewc_loss": 0.07948878407478333, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004213527136016637, + "grad_norm": 9.311790466308594, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8653879165649414, + "num_tokens": 760456212.0, + "step": 19927 + }, + { + "epoch": 2.5350464317516854, + "ewc_loss": 0.07915765047073364, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000418041308876127, + "grad_norm": 9.220686912536621, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8556911945343018, + "num_tokens": 760496385.0, + "step": 19928 + }, + { + "epoch": 2.535173642030276, + "ewc_loss": 0.07969649136066437, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004234297084622085, + "grad_norm": 9.399690628051758, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8624730706214905, + "num_tokens": 760530426.0, + "step": 19929 + }, + { + "epoch": 2.5353008523088665, + "ewc_loss": 0.07901221513748169, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004165870195720345, + "grad_norm": 9.265254020690918, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.875998854637146, + "num_tokens": 760568174.0, + "step": 19930 + }, + { + "epoch": 2.535428062587457, + "ewc_loss": 0.07967811822891235, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004232460632920265, + "grad_norm": 9.328978538513184, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8819496631622314, + "num_tokens": 760608097.0, + "step": 19931 + }, + { + "epoch": 2.5355552728660475, + "ewc_loss": 0.07913820445537567, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004178468370810151, + "grad_norm": 9.27326488494873, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8648011088371277, + "num_tokens": 760643018.0, + "step": 19932 + }, + { + "epoch": 2.535682483144638, + "ewc_loss": 0.07973278313875198, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042135125841014087, + "grad_norm": 9.327088356018066, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8708083033561707, + "num_tokens": 760676553.0, + "step": 19933 + }, + { + "epoch": 2.5358096934232286, + "ewc_loss": 0.07955821603536606, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000419605610659346, + "grad_norm": 9.292938232421875, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8789961338043213, + "num_tokens": 760708850.0, + "step": 19934 + }, + { + "epoch": 2.535936903701819, + "ewc_loss": 0.07966864109039307, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042070980998687446, + "grad_norm": 9.305070877075195, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8596134781837463, + "num_tokens": 760746088.0, + "step": 19935 + }, + { + "epoch": 2.5360641139804097, + "ewc_loss": 0.0796314999461174, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042033844511024654, + "grad_norm": 9.216980934143066, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8700292110443115, + "num_tokens": 760785716.0, + "step": 19936 + }, + { + "epoch": 2.536191324259, + "ewc_loss": 0.07969744503498077, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042099785059690475, + "grad_norm": 9.306812286376953, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8728930354118347, + "num_tokens": 760821301.0, + "step": 19937 + }, + { + "epoch": 2.5363185345375907, + "ewc_loss": 0.07933063805103302, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004173298366367817, + "grad_norm": 9.170676231384277, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8543483018875122, + "num_tokens": 760861200.0, + "step": 19938 + }, + { + "epoch": 2.5364457448161812, + "ewc_loss": 0.08015349507331848, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042555833351798356, + "grad_norm": 9.446075439453125, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.862098217010498, + "num_tokens": 760896398.0, + "step": 19939 + }, + { + "epoch": 2.5365729550947718, + "ewc_loss": 0.07899540662765503, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004139775119256228, + "grad_norm": 9.141996383666992, + "learning_rate": 1e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8436135053634644, + "num_tokens": 760937300.0, + "step": 19940 + }, + { + "epoch": 2.5367001653733623, + "ewc_loss": 0.08017126470804214, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004257360997144133, + "grad_norm": 9.403814315795898, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8582326173782349, + "num_tokens": 760973726.0, + "step": 19941 + }, + { + "epoch": 2.536827375651953, + "ewc_loss": 0.07921765744686127, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004161999677307904, + "grad_norm": 9.192716598510742, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8610808849334717, + "num_tokens": 761009048.0, + "step": 19942 + }, + { + "epoch": 2.5369545859305433, + "ewc_loss": 0.08024850487709045, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042650848627090454, + "grad_norm": 9.37999153137207, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8616082668304443, + "num_tokens": 761050836.0, + "step": 19943 + }, + { + "epoch": 2.537081796209134, + "ewc_loss": 0.07950501143932343, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00041663218871690333, + "grad_norm": 9.234747886657715, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8550031185150146, + "num_tokens": 761087517.0, + "step": 19944 + }, + { + "epoch": 2.537209006487724, + "ewc_loss": 0.07991483807563782, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042317179031670094, + "grad_norm": 9.28553295135498, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8771159648895264, + "num_tokens": 761121401.0, + "step": 19945 + }, + { + "epoch": 2.537336216766315, + "ewc_loss": 0.07954483479261398, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004194717912469059, + "grad_norm": 9.280986785888672, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8527428507804871, + "num_tokens": 761154973.0, + "step": 19946 + }, + { + "epoch": 2.537463427044905, + "ewc_loss": 0.07959994673728943, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042002290138043463, + "grad_norm": 9.300232887268066, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.87240070104599, + "num_tokens": 761194211.0, + "step": 19947 + }, + { + "epoch": 2.537590637323496, + "ewc_loss": 0.07947385311126709, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041876197792589664, + "grad_norm": 9.211161613464355, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8564726114273071, + "num_tokens": 761233486.0, + "step": 19948 + }, + { + "epoch": 2.537717847602086, + "ewc_loss": 0.07982562482357025, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004222796414978802, + "grad_norm": 9.326098442077637, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8643166422843933, + "num_tokens": 761275846.0, + "step": 19949 + }, + { + "epoch": 2.5378450578806766, + "ewc_loss": 0.07933807373046875, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041740413871593773, + "grad_norm": 9.225571632385254, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8544197082519531, + "num_tokens": 761311210.0, + "step": 19950 + }, + { + "epoch": 2.537972268159267, + "ewc_loss": 0.07987193763256073, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004227427707519382, + "grad_norm": 9.262331008911133, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8638076782226562, + "num_tokens": 761349961.0, + "step": 19951 + }, + { + "epoch": 2.5380994784378577, + "ewc_loss": 0.07959918677806854, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004200153052806854, + "grad_norm": 9.264629364013672, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8710241317749023, + "num_tokens": 761386993.0, + "step": 19952 + }, + { + "epoch": 2.538226688716448, + "ewc_loss": 0.07973599433898926, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042138341814279556, + "grad_norm": 9.235218048095703, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8551470041275024, + "num_tokens": 761426910.0, + "step": 19953 + }, + { + "epoch": 2.5383538989950387, + "ewc_loss": 0.07980280369520187, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042205146746709943, + "grad_norm": 9.266880989074707, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8674376606941223, + "num_tokens": 761466633.0, + "step": 19954 + }, + { + "epoch": 2.5384811092736292, + "ewc_loss": 0.07959727942943573, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004199962713755667, + "grad_norm": 9.289939880371094, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8498586416244507, + "num_tokens": 761497670.0, + "step": 19955 + }, + { + "epoch": 2.5386083195522198, + "ewc_loss": 0.07953809946775436, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041940444498322904, + "grad_norm": 9.241835594177246, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8608955144882202, + "num_tokens": 761534626.0, + "step": 19956 + }, + { + "epoch": 2.5387355298308103, + "ewc_loss": 0.0796971470117569, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042099488200619817, + "grad_norm": 9.229681015014648, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8705199956893921, + "num_tokens": 761581125.0, + "step": 19957 + }, + { + "epoch": 2.538862740109401, + "ewc_loss": 0.07995802164077759, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004211622872389853, + "grad_norm": 14.694782257080078, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8645914793014526, + "num_tokens": 761616533.0, + "step": 19958 + }, + { + "epoch": 2.5389899503879914, + "ewc_loss": 0.08641766011714935, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00048820002120919526, + "grad_norm": 9.867898941040039, + "learning_rate": 1e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8552088737487793, + "num_tokens": 761650615.0, + "step": 19959 + }, + { + "epoch": 2.539117160666582, + "ewc_loss": 0.08303509652614594, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000454374443506822, + "grad_norm": 9.89106273651123, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8745978474617004, + "num_tokens": 761687020.0, + "step": 19960 + }, + { + "epoch": 2.5392443709451724, + "ewc_loss": 0.08012444525957108, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004252679063938558, + "grad_norm": 9.377187728881836, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8788602948188782, + "num_tokens": 761721653.0, + "step": 19961 + }, + { + "epoch": 2.539371581223763, + "ewc_loss": 0.08350229263305664, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004590463940985501, + "grad_norm": 9.901385307312012, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8710949420928955, + "num_tokens": 761758377.0, + "step": 19962 + }, + { + "epoch": 2.5394987915023535, + "ewc_loss": 0.08010472357273102, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004250706697348505, + "grad_norm": 9.433073043823242, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8621326684951782, + "num_tokens": 761795263.0, + "step": 19963 + }, + { + "epoch": 2.539626001780944, + "ewc_loss": 0.0818297266960144, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004423207137733698, + "grad_norm": 9.655973434448242, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8752796053886414, + "num_tokens": 761834758.0, + "step": 19964 + }, + { + "epoch": 2.5397532120595345, + "ewc_loss": 0.07995887100696564, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004236121312715113, + "grad_norm": 9.373431205749512, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8562769293785095, + "num_tokens": 761880482.0, + "step": 19965 + }, + { + "epoch": 2.539880422338125, + "ewc_loss": 0.08124365657567978, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043645998812280595, + "grad_norm": 9.642797470092773, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8634818196296692, + "num_tokens": 761922494.0, + "step": 19966 + }, + { + "epoch": 2.5400076326167156, + "ewc_loss": 0.07921817898750305, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041864666854962707, + "grad_norm": 9.253035545349121, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8818985223770142, + "num_tokens": 761959462.0, + "step": 19967 + }, + { + "epoch": 2.5401348428953057, + "ewc_loss": 0.08107953518629074, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004348187940195203, + "grad_norm": 9.694086074829102, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8535255789756775, + "num_tokens": 761990316.0, + "step": 19968 + }, + { + "epoch": 2.5402620531738966, + "ewc_loss": 0.07936209440231323, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041764441994018853, + "grad_norm": 9.248470306396484, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.858591616153717, + "num_tokens": 762023671.0, + "step": 19969 + }, + { + "epoch": 2.5403892634524867, + "ewc_loss": 0.08084232360124588, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004324466863181442, + "grad_norm": 9.59178638458252, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8695723414421082, + "num_tokens": 762056719.0, + "step": 19970 + }, + { + "epoch": 2.5405164737310777, + "ewc_loss": 0.07938256114721298, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004178490489721298, + "grad_norm": 9.243728637695312, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8655025362968445, + "num_tokens": 762098802.0, + "step": 19971 + }, + { + "epoch": 2.540643684009668, + "ewc_loss": 0.08075208961963654, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004315443220548332, + "grad_norm": 9.555848121643066, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.866814374923706, + "num_tokens": 762134470.0, + "step": 19972 + }, + { + "epoch": 2.5407708942882588, + "ewc_loss": 0.07993514835834503, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004160506941843778, + "grad_norm": 9.379396438598633, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.866066038608551, + "num_tokens": 762177163.0, + "step": 19973 + }, + { + "epoch": 2.540898104566849, + "ewc_loss": 0.07999731600284576, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042399659287184477, + "grad_norm": 9.41562271118164, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8720656633377075, + "num_tokens": 762217323.0, + "step": 19974 + }, + { + "epoch": 2.5410253148454394, + "ewc_loss": 0.07954199612140656, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041944344411604106, + "grad_norm": 9.3638277053833, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8588477373123169, + "num_tokens": 762252144.0, + "step": 19975 + }, + { + "epoch": 2.54115252512403, + "ewc_loss": 0.07938829809427261, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041790641262196004, + "grad_norm": 9.373117446899414, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.856451153755188, + "num_tokens": 762282108.0, + "step": 19976 + }, + { + "epoch": 2.5412797354026204, + "ewc_loss": 0.07953609526157379, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041938439244404435, + "grad_norm": 9.332115173339844, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8680901527404785, + "num_tokens": 762318154.0, + "step": 19977 + }, + { + "epoch": 2.541406945681211, + "ewc_loss": 0.07928697764873505, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004168932209722698, + "grad_norm": 9.295633316040039, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8614705801010132, + "num_tokens": 762354704.0, + "step": 19978 + }, + { + "epoch": 2.5415341559598015, + "ewc_loss": 0.07930866628885269, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004195515066385269, + "grad_norm": 9.425004959106445, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8650925159454346, + "num_tokens": 762389943.0, + "step": 19979 + }, + { + "epoch": 2.541661366238392, + "ewc_loss": 0.07882524281740189, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004147172730881721, + "grad_norm": 9.22357177734375, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8665874600410461, + "num_tokens": 762421560.0, + "step": 19980 + }, + { + "epoch": 2.5417885765169825, + "ewc_loss": 0.07957436144351959, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042220851173624396, + "grad_norm": 9.432296752929688, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8693000078201294, + "num_tokens": 762458509.0, + "step": 19981 + }, + { + "epoch": 2.541915786795573, + "ewc_loss": 0.0785810649394989, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004122754617128521, + "grad_norm": 9.184663772583008, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8624049425125122, + "num_tokens": 762502382.0, + "step": 19982 + }, + { + "epoch": 2.5420429970741636, + "ewc_loss": 0.07968419790267944, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004233068029861897, + "grad_norm": 9.476773262023926, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8603388071060181, + "num_tokens": 762536611.0, + "step": 19983 + }, + { + "epoch": 2.542170207352754, + "ewc_loss": 0.07850778102874756, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041154262726195157, + "grad_norm": 9.117757797241211, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8648418188095093, + "num_tokens": 762577801.0, + "step": 19984 + }, + { + "epoch": 2.5422974176313446, + "ewc_loss": 0.08004771918058395, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042694201692938805, + "grad_norm": 9.541156768798828, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8657000064849854, + "num_tokens": 762617413.0, + "step": 19985 + }, + { + "epoch": 2.542424627909935, + "ewc_loss": 0.0782056450843811, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00040852127131074667, + "grad_norm": 9.017141342163086, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8734092712402344, + "num_tokens": 762654811.0, + "step": 19986 + }, + { + "epoch": 2.5425518381885257, + "ewc_loss": 0.08049113303422928, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004313761892262846, + "grad_norm": 9.48971939086914, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8692071437835693, + "num_tokens": 762692902.0, + "step": 19987 + }, + { + "epoch": 2.5426790484671162, + "ewc_loss": 0.07847023010253906, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004111671878490597, + "grad_norm": 9.09489631652832, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.865568995475769, + "num_tokens": 762731984.0, + "step": 19988 + }, + { + "epoch": 2.5428062587457068, + "ewc_loss": 0.08037254959344864, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043019032455049455, + "grad_norm": 9.44973373413086, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8728841543197632, + "num_tokens": 762767104.0, + "step": 19989 + }, + { + "epoch": 2.5429334690242973, + "ewc_loss": 0.07888400554656982, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041530493763275445, + "grad_norm": 9.187355041503906, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8526606559753418, + "num_tokens": 762804982.0, + "step": 19990 + }, + { + "epoch": 2.543060679302888, + "ewc_loss": 0.0800352543592453, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042681742343120277, + "grad_norm": 9.457154273986816, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8558417558670044, + "num_tokens": 762842956.0, + "step": 19991 + }, + { + "epoch": 2.5431878895814783, + "ewc_loss": 0.07916160672903061, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004156394861638546, + "grad_norm": 9.152461051940918, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8711076974868774, + "num_tokens": 762875445.0, + "step": 19992 + }, + { + "epoch": 2.5433150998600684, + "ewc_loss": 0.08023318648338318, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042879674583673477, + "grad_norm": 9.493888854980469, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8772439956665039, + "num_tokens": 762906061.0, + "step": 19993 + }, + { + "epoch": 2.5434423101386594, + "ewc_loss": 0.07897431403398514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000416207971284166, + "grad_norm": 9.240217208862305, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.873054027557373, + "num_tokens": 762945550.0, + "step": 19994 + }, + { + "epoch": 2.5435695204172495, + "ewc_loss": 0.07995045930147171, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004259694251231849, + "grad_norm": 9.393964767456055, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8640060424804688, + "num_tokens": 762982586.0, + "step": 19995 + }, + { + "epoch": 2.5436967306958405, + "ewc_loss": 0.07901866734027863, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004166515718679875, + "grad_norm": 9.274717330932617, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8640297651290894, + "num_tokens": 763013527.0, + "step": 19996 + }, + { + "epoch": 2.5438239409744305, + "ewc_loss": 0.0796191394329071, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004226562159601599, + "grad_norm": 9.357331275939941, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8574309349060059, + "num_tokens": 763054363.0, + "step": 19997 + }, + { + "epoch": 2.5439511512530215, + "ewc_loss": 0.07926972955465317, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004191621264908463, + "grad_norm": 9.280069351196289, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8606742024421692, + "num_tokens": 763096834.0, + "step": 19998 + }, + { + "epoch": 2.5440783615316116, + "ewc_loss": 0.07952164113521576, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004216812667436898, + "grad_norm": 9.31155014038086, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8637078404426575, + "num_tokens": 763139611.0, + "step": 19999 + }, + { + "epoch": 2.544205571810202, + "ewc_loss": 0.07927922904491425, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004192570922896266, + "grad_norm": 9.299893379211426, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8649416565895081, + "num_tokens": 763181085.0, + "step": 20000 + }, + { + "epoch": 2.5443327820887927, + "ewc_loss": 0.07948945462703705, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004189179453533143, + "grad_norm": 9.241621971130371, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8622434139251709, + "num_tokens": 763219811.0, + "step": 20001 + }, + { + "epoch": 2.544459992367383, + "ewc_loss": 0.0793386846780777, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004198517417535186, + "grad_norm": 9.317124366760254, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8537462949752808, + "num_tokens": 763257745.0, + "step": 20002 + }, + { + "epoch": 2.5445872026459737, + "ewc_loss": 0.07920046150684357, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041846948442980647, + "grad_norm": 9.310104370117188, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8498232364654541, + "num_tokens": 763299475.0, + "step": 20003 + }, + { + "epoch": 2.5447144129245642, + "ewc_loss": 0.07925538718700409, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041901873191818595, + "grad_norm": 9.324617385864258, + "learning_rate": 1e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8424005508422852, + "num_tokens": 763333960.0, + "step": 20004 + }, + { + "epoch": 2.5448416232031548, + "ewc_loss": 0.07931925356388092, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041965735726989806, + "grad_norm": 9.292343139648438, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8562732338905334, + "num_tokens": 763378646.0, + "step": 20005 + }, + { + "epoch": 2.5449688334817453, + "ewc_loss": 0.07927665114402771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004192313936073333, + "grad_norm": 9.300254821777344, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.868384599685669, + "num_tokens": 763412441.0, + "step": 20006 + }, + { + "epoch": 2.545096043760336, + "ewc_loss": 0.07934379577636719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004199028480798006, + "grad_norm": 9.28537654876709, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8674912452697754, + "num_tokens": 763446713.0, + "step": 20007 + }, + { + "epoch": 2.5452232540389264, + "ewc_loss": 0.07928028702735901, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041926768608391285, + "grad_norm": 9.322471618652344, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8740077614784241, + "num_tokens": 763478686.0, + "step": 20008 + }, + { + "epoch": 2.545350464317517, + "ewc_loss": 0.07913056015968323, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004177704977337271, + "grad_norm": 9.220274925231934, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8800072073936462, + "num_tokens": 763517258.0, + "step": 20009 + }, + { + "epoch": 2.5454776745961074, + "ewc_loss": 0.07958801835775375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004223450378049165, + "grad_norm": 9.347460746765137, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8595119118690491, + "num_tokens": 763553229.0, + "step": 20010 + }, + { + "epoch": 2.545604884874698, + "ewc_loss": 0.0790221095085144, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041668591438792646, + "grad_norm": 9.255736351013184, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8725606799125671, + "num_tokens": 763594274.0, + "step": 20011 + }, + { + "epoch": 2.5457320951532885, + "ewc_loss": 0.07948407530784607, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004213055653963238, + "grad_norm": 9.304411888122559, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8567625284194946, + "num_tokens": 763632162.0, + "step": 20012 + }, + { + "epoch": 2.545859305431879, + "ewc_loss": 0.07907797396183014, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004172445915173739, + "grad_norm": 9.283647537231445, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8663605451583862, + "num_tokens": 763666797.0, + "step": 20013 + }, + { + "epoch": 2.5459865157104695, + "ewc_loss": 0.07944238930940628, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042088874033652246, + "grad_norm": 9.308902740478516, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8607068061828613, + "num_tokens": 763705475.0, + "step": 20014 + }, + { + "epoch": 2.54611372598906, + "ewc_loss": 0.07915826141834259, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041804747888818383, + "grad_norm": 9.260519027709961, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8799307346343994, + "num_tokens": 763739434.0, + "step": 20015 + }, + { + "epoch": 2.5462409362676506, + "ewc_loss": 0.07946018129587173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210666520521045, + "grad_norm": 9.356315612792969, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8512752652168274, + "num_tokens": 763773262.0, + "step": 20016 + }, + { + "epoch": 2.546368146546241, + "ewc_loss": 0.07909692823886871, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174340865574777, + "grad_norm": 9.233299255371094, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8534891605377197, + "num_tokens": 763812573.0, + "step": 20017 + }, + { + "epoch": 2.546495356824831, + "ewc_loss": 0.0794752836227417, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004212177300360054, + "grad_norm": 9.357872009277344, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8661165237426758, + "num_tokens": 763848555.0, + "step": 20018 + }, + { + "epoch": 2.546622567103422, + "ewc_loss": 0.07896347343921661, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041609961772337556, + "grad_norm": 9.254026412963867, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8615477085113525, + "num_tokens": 763884608.0, + "step": 20019 + }, + { + "epoch": 2.5467497773820122, + "ewc_loss": 0.07954826951026917, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042194753768853843, + "grad_norm": 9.335134506225586, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.859023928642273, + "num_tokens": 763922038.0, + "step": 20020 + }, + { + "epoch": 2.546876987660603, + "ewc_loss": 0.07909369468688965, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174018104095012, + "grad_norm": 9.258306503295898, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.860595703125, + "num_tokens": 763958089.0, + "step": 20021 + }, + { + "epoch": 2.5470041979391933, + "ewc_loss": 0.07955144345760345, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004219793190713972, + "grad_norm": 9.369344711303711, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8657646179199219, + "num_tokens": 764000802.0, + "step": 20022 + }, + { + "epoch": 2.547131408217784, + "ewc_loss": 0.07915061712265015, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041797099402174354, + "grad_norm": 9.222179412841797, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8687483668327332, + "num_tokens": 764036628.0, + "step": 20023 + }, + { + "epoch": 2.5472586184963744, + "ewc_loss": 0.07985034584999084, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042252690764144063, + "grad_norm": 9.372570991516113, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8664372563362122, + "num_tokens": 764073643.0, + "step": 20024 + }, + { + "epoch": 2.547385828774965, + "ewc_loss": 0.0790976881980896, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041744174086488783, + "grad_norm": 9.2952241897583, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8703863620758057, + "num_tokens": 764110473.0, + "step": 20025 + }, + { + "epoch": 2.5475130390535554, + "ewc_loss": 0.07982257008552551, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042224914068356156, + "grad_norm": 9.361971855163574, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.861548662185669, + "num_tokens": 764142941.0, + "step": 20026 + }, + { + "epoch": 2.547640249332146, + "ewc_loss": 0.07937157899141312, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004177392111159861, + "grad_norm": 9.250924110412598, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8665976524353027, + "num_tokens": 764178752.0, + "step": 20027 + }, + { + "epoch": 2.5477674596107365, + "ewc_loss": 0.07943875342607498, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000420852389652282, + "grad_norm": 9.360121726989746, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8617573380470276, + "num_tokens": 764217440.0, + "step": 20028 + }, + { + "epoch": 2.547894669889327, + "ewc_loss": 0.07910075783729553, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041747241630218923, + "grad_norm": 9.280187606811523, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8531936407089233, + "num_tokens": 764257524.0, + "step": 20029 + }, + { + "epoch": 2.5480218801679175, + "ewc_loss": 0.07938406616449356, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004203054995741695, + "grad_norm": 9.349485397338867, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8580437898635864, + "num_tokens": 764293645.0, + "step": 20030 + }, + { + "epoch": 2.548149090446508, + "ewc_loss": 0.07908813655376434, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004173461638856679, + "grad_norm": 9.27720832824707, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.871671199798584, + "num_tokens": 764331660.0, + "step": 20031 + }, + { + "epoch": 2.5482763007250986, + "ewc_loss": 0.07938243448734283, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042028914322145283, + "grad_norm": 9.24580192565918, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.883185625076294, + "num_tokens": 764376577.0, + "step": 20032 + }, + { + "epoch": 2.548403511003689, + "ewc_loss": 0.07944761961698532, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004209410399198532, + "grad_norm": 9.359915733337402, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8623208999633789, + "num_tokens": 764415492.0, + "step": 20033 + }, + { + "epoch": 2.5485307212822796, + "ewc_loss": 0.07917600870132446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041822492494247854, + "grad_norm": 9.362676620483398, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8541576266288757, + "num_tokens": 764453948.0, + "step": 20034 + }, + { + "epoch": 2.54865793156087, + "ewc_loss": 0.0792609304189682, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041907414561137557, + "grad_norm": 9.261106491088867, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8688855767250061, + "num_tokens": 764493906.0, + "step": 20035 + }, + { + "epoch": 2.5487851418394607, + "ewc_loss": 0.07967107743024826, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004207341989967972, + "grad_norm": 9.365485191345215, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8579180836677551, + "num_tokens": 764532379.0, + "step": 20036 + }, + { + "epoch": 2.5489123521180512, + "ewc_loss": 0.07906143367290497, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004170791944488883, + "grad_norm": 9.249712944030762, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8451231718063354, + "num_tokens": 764567726.0, + "step": 20037 + }, + { + "epoch": 2.5490395623966418, + "ewc_loss": 0.07960085570812225, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004224733856972307, + "grad_norm": 9.33069896697998, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8647404909133911, + "num_tokens": 764606229.0, + "step": 20038 + }, + { + "epoch": 2.5491667726752323, + "ewc_loss": 0.07915417850017548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004180065880063921, + "grad_norm": 9.27128791809082, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.875609278678894, + "num_tokens": 764641346.0, + "step": 20039 + }, + { + "epoch": 2.549293982953823, + "ewc_loss": 0.07959973067045212, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004224621516186744, + "grad_norm": 9.38554859161377, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8641358613967896, + "num_tokens": 764687286.0, + "step": 20040 + }, + { + "epoch": 2.5494211932324133, + "ewc_loss": 0.07930772751569748, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004195421061012894, + "grad_norm": 9.27695369720459, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8669303059577942, + "num_tokens": 764729235.0, + "step": 20041 + }, + { + "epoch": 2.549548403511004, + "ewc_loss": 0.07970327138900757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004234975203871727, + "grad_norm": 9.453973770141602, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8570151329040527, + "num_tokens": 764771834.0, + "step": 20042 + }, + { + "epoch": 2.549675613789594, + "ewc_loss": 0.07911530137062073, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000417617877246812, + "grad_norm": 9.346701622009277, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8640837669372559, + "num_tokens": 764808307.0, + "step": 20043 + }, + { + "epoch": 2.549802824068185, + "ewc_loss": 0.07944047451019287, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042086959001608193, + "grad_norm": 9.312227249145508, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.857824981212616, + "num_tokens": 764848373.0, + "step": 20044 + }, + { + "epoch": 2.549930034346775, + "ewc_loss": 0.07944342494010925, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042089904309250414, + "grad_norm": 9.412773132324219, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8608206510543823, + "num_tokens": 764886468.0, + "step": 20045 + }, + { + "epoch": 2.550057244625366, + "ewc_loss": 0.0794186145067215, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004182095290161669, + "grad_norm": 9.376322746276855, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8670563697814941, + "num_tokens": 764928690.0, + "step": 20046 + }, + { + "epoch": 2.550184454903956, + "ewc_loss": 0.07908711582422256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004173360066488385, + "grad_norm": 9.353371620178223, + "learning_rate": 1e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.829169511795044, + "num_tokens": 764969148.0, + "step": 20047 + }, + { + "epoch": 2.5503116651825466, + "ewc_loss": 0.07928935438394547, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004193584027234465, + "grad_norm": 9.492219924926758, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8580343127250671, + "num_tokens": 765010754.0, + "step": 20048 + }, + { + "epoch": 2.550438875461137, + "ewc_loss": 0.07882878184318542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000414752634242177, + "grad_norm": 9.387792587280273, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8677623867988586, + "num_tokens": 765043562.0, + "step": 20049 + }, + { + "epoch": 2.5505660857397277, + "ewc_loss": 0.07909560203552246, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174208443146199, + "grad_norm": 9.358589172363281, + "learning_rate": 1e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.844841718673706, + "num_tokens": 765085638.0, + "step": 20050 + }, + { + "epoch": 2.550693296018318, + "ewc_loss": 0.07898285984992981, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041629342013038695, + "grad_norm": 9.416543006896973, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8703093528747559, + "num_tokens": 765115847.0, + "step": 20051 + }, + { + "epoch": 2.5508205062969087, + "ewc_loss": 0.07876855134963989, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004141503886785358, + "grad_norm": 9.49472427368164, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8628089427947998, + "num_tokens": 765146839.0, + "step": 20052 + }, + { + "epoch": 2.5509477165754992, + "ewc_loss": 0.07879045605659485, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041436945321038365, + "grad_norm": 9.344538688659668, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8638248443603516, + "num_tokens": 765185921.0, + "step": 20053 + }, + { + "epoch": 2.5510749268540898, + "ewc_loss": 0.0789656788110733, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004161215911153704, + "grad_norm": 9.398158073425293, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8625248670578003, + "num_tokens": 765224534.0, + "step": 20054 + }, + { + "epoch": 2.5512021371326803, + "ewc_loss": 0.07887022197246552, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004127256397623569, + "grad_norm": 9.178415298461914, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8738399744033813, + "num_tokens": 765267160.0, + "step": 20055 + }, + { + "epoch": 2.551329347411271, + "ewc_loss": 0.07964572310447693, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042048064642585814, + "grad_norm": 9.551237106323242, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8649775385856628, + "num_tokens": 765304770.0, + "step": 20056 + }, + { + "epoch": 2.5514565576898613, + "ewc_loss": 0.07846691459417343, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000408692576456815, + "grad_norm": 9.149897575378418, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.859994649887085, + "num_tokens": 765341966.0, + "step": 20057 + }, + { + "epoch": 2.551583767968452, + "ewc_loss": 0.07987667620182037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004252316430211067, + "grad_norm": 9.42350959777832, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8798185586929321, + "num_tokens": 765382653.0, + "step": 20058 + }, + { + "epoch": 2.5517109782470424, + "ewc_loss": 0.07839326560497284, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041039747884497046, + "grad_norm": 9.195839881896973, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8631544709205627, + "num_tokens": 765414940.0, + "step": 20059 + }, + { + "epoch": 2.551838188525633, + "ewc_loss": 0.0796566903591156, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042303180089220405, + "grad_norm": 9.483786582946777, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8652125597000122, + "num_tokens": 765445549.0, + "step": 20060 + }, + { + "epoch": 2.5519653988042235, + "ewc_loss": 0.07855622470378876, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041202708962373435, + "grad_norm": 9.179244995117188, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8713171482086182, + "num_tokens": 765484474.0, + "step": 20061 + }, + { + "epoch": 2.552092609082814, + "ewc_loss": 0.07986843585968018, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042514916276559234, + "grad_norm": 9.50402545928955, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8640463948249817, + "num_tokens": 765522651.0, + "step": 20062 + }, + { + "epoch": 2.5522198193614045, + "ewc_loss": 0.0783991813659668, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004104567051399499, + "grad_norm": 9.161393165588379, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8604397773742676, + "num_tokens": 765565983.0, + "step": 20063 + }, + { + "epoch": 2.552347029639995, + "ewc_loss": 0.08018961548805237, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000425919599365443, + "grad_norm": 9.52438735961914, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8645550012588501, + "num_tokens": 765604056.0, + "step": 20064 + }, + { + "epoch": 2.5524742399185856, + "ewc_loss": 0.07848536223173141, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004113184695597738, + "grad_norm": 9.187664985656738, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8723125457763672, + "num_tokens": 765646062.0, + "step": 20065 + }, + { + "epoch": 2.5526014501971757, + "ewc_loss": 0.07993099093437195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042577474960125983, + "grad_norm": 9.49375057220459, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8618204593658447, + "num_tokens": 765687284.0, + "step": 20066 + }, + { + "epoch": 2.5527286604757666, + "ewc_loss": 0.07863610982894897, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041282596066594124, + "grad_norm": 9.254064559936523, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8517148494720459, + "num_tokens": 765723743.0, + "step": 20067 + }, + { + "epoch": 2.5528558707543567, + "ewc_loss": 0.07976619899272919, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241268616169691, + "grad_norm": 9.470508575439453, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8619515299797058, + "num_tokens": 765768472.0, + "step": 20068 + }, + { + "epoch": 2.5529830810329477, + "ewc_loss": 0.07877972722053528, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041426208917982876, + "grad_norm": 9.181417465209961, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8814340233802795, + "num_tokens": 765809040.0, + "step": 20069 + }, + { + "epoch": 2.5531102913115378, + "ewc_loss": 0.08003611862659454, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004268260090611875, + "grad_norm": 9.510662078857422, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8653943538665771, + "num_tokens": 765845968.0, + "step": 20070 + }, + { + "epoch": 2.5532375015901287, + "ewc_loss": 0.07861596345901489, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004126245330553502, + "grad_norm": 9.175167083740234, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.869472086429596, + "num_tokens": 765881037.0, + "step": 20071 + }, + { + "epoch": 2.553364711868719, + "ewc_loss": 0.08028872311115265, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004293520760256797, + "grad_norm": 9.558836936950684, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8687012791633606, + "num_tokens": 765918945.0, + "step": 20072 + }, + { + "epoch": 2.5534919221473094, + "ewc_loss": 0.0785793662071228, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004122584650758654, + "grad_norm": 9.140023231506348, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.876649022102356, + "num_tokens": 765956128.0, + "step": 20073 + }, + { + "epoch": 2.5536191324259, + "ewc_loss": 0.08044981956481934, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043096308945678174, + "grad_norm": 9.574034690856934, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8774484992027283, + "num_tokens": 765997109.0, + "step": 20074 + }, + { + "epoch": 2.5537463427044904, + "ewc_loss": 0.07866711914539337, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004131360328756273, + "grad_norm": 9.195779800415039, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8604053854942322, + "num_tokens": 766032557.0, + "step": 20075 + }, + { + "epoch": 2.553873552983081, + "ewc_loss": 0.08039720356464386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304368339944631, + "grad_norm": 9.572393417358398, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8621939420700073, + "num_tokens": 766075285.0, + "step": 20076 + }, + { + "epoch": 2.5540007632616715, + "ewc_loss": 0.07869695127010345, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004134344053454697, + "grad_norm": 9.20043659210205, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8627489805221558, + "num_tokens": 766110600.0, + "step": 20077 + }, + { + "epoch": 2.554127973540262, + "ewc_loss": 0.08025074005126953, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004289722128305584, + "grad_norm": 9.540746688842773, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8526595830917358, + "num_tokens": 766146877.0, + "step": 20078 + }, + { + "epoch": 2.5542551838188525, + "ewc_loss": 0.07877659797668457, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004142308607697487, + "grad_norm": 9.191934585571289, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8612676858901978, + "num_tokens": 766185164.0, + "step": 20079 + }, + { + "epoch": 2.554382394097443, + "ewc_loss": 0.08011287450790405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004275935934856534, + "grad_norm": 9.450167655944824, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8849548101425171, + "num_tokens": 766221442.0, + "step": 20080 + }, + { + "epoch": 2.5545096043760336, + "ewc_loss": 0.07889668643474579, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000415431713918224, + "grad_norm": 9.23610782623291, + "learning_rate": 1e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.8398362398147583, + "num_tokens": 766255258.0, + "step": 20081 + }, + { + "epoch": 2.554636814654624, + "ewc_loss": 0.07992316037416458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004256964602973312, + "grad_norm": 9.429073333740234, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8611411452293396, + "num_tokens": 766292392.0, + "step": 20082 + }, + { + "epoch": 2.5547640249332146, + "ewc_loss": 0.07894260436296463, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041589088505133986, + "grad_norm": 9.143160820007324, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8541810512542725, + "num_tokens": 766335312.0, + "step": 20083 + }, + { + "epoch": 2.554891235211805, + "ewc_loss": 0.08023136109113693, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004287784395273775, + "grad_norm": 9.481316566467285, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8665488958358765, + "num_tokens": 766372189.0, + "step": 20084 + }, + { + "epoch": 2.5550184454903957, + "ewc_loss": 0.07926477491855621, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004166711587458849, + "grad_norm": 9.175661087036133, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8531850576400757, + "num_tokens": 766414021.0, + "step": 20085 + }, + { + "epoch": 2.5551456557689862, + "ewc_loss": 0.08025753498077393, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042904014117084444, + "grad_norm": 9.44045639038086, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8776257634162903, + "num_tokens": 766452005.0, + "step": 20086 + }, + { + "epoch": 2.5552728660475768, + "ewc_loss": 0.07922561466693878, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041872099973261356, + "grad_norm": 9.201034545898438, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8579593300819397, + "num_tokens": 766492150.0, + "step": 20087 + }, + { + "epoch": 2.5554000763261673, + "ewc_loss": 0.0800887793302536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004273526428733021, + "grad_norm": 9.38811206817627, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8745651245117188, + "num_tokens": 766533889.0, + "step": 20088 + }, + { + "epoch": 2.555527286604758, + "ewc_loss": 0.07962897419929504, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004203131829854101, + "grad_norm": 9.284945487976074, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8643007874488831, + "num_tokens": 766565008.0, + "step": 20089 + }, + { + "epoch": 2.5556544968833483, + "ewc_loss": 0.07982444763183594, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042470928747206926, + "grad_norm": 9.335304260253906, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8646057844161987, + "num_tokens": 766605744.0, + "step": 20090 + }, + { + "epoch": 2.5557817071619384, + "ewc_loss": 0.07953573018312454, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004218221583869308, + "grad_norm": 9.286020278930664, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8729759454727173, + "num_tokens": 766645782.0, + "step": 20091 + }, + { + "epoch": 2.5559089174405294, + "ewc_loss": 0.07986946403980255, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000425159465521574, + "grad_norm": 9.376638412475586, + "learning_rate": 1e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8541110754013062, + "num_tokens": 766683135.0, + "step": 20092 + }, + { + "epoch": 2.5560361277191195, + "ewc_loss": 0.07931205630302429, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000419585412601009, + "grad_norm": 9.249011039733887, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8648410439491272, + "num_tokens": 766720691.0, + "step": 20093 + }, + { + "epoch": 2.5561633379977104, + "ewc_loss": 0.07994218170642853, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004258866247255355, + "grad_norm": 9.416163444519043, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8593777418136597, + "num_tokens": 766755426.0, + "step": 20094 + }, + { + "epoch": 2.5562905482763005, + "ewc_loss": 0.07919348776340485, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004183997225482017, + "grad_norm": 9.238706588745117, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8702114820480347, + "num_tokens": 766788315.0, + "step": 20095 + }, + { + "epoch": 2.5564177585548915, + "ewc_loss": 0.07983412593603134, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042480608681216836, + "grad_norm": 9.368247032165527, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8796812295913696, + "num_tokens": 766825664.0, + "step": 20096 + }, + { + "epoch": 2.5565449688334816, + "ewc_loss": 0.07922361046075821, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004187009471934289, + "grad_norm": 9.196097373962402, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8746010065078735, + "num_tokens": 766868145.0, + "step": 20097 + }, + { + "epoch": 2.556672179112072, + "ewc_loss": 0.07993048429489136, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042576968553476036, + "grad_norm": 9.386176109313965, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8526897430419922, + "num_tokens": 766907278.0, + "step": 20098 + }, + { + "epoch": 2.5567993893906626, + "ewc_loss": 0.07909750193357468, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174398782197386, + "grad_norm": 9.20051383972168, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8755781650543213, + "num_tokens": 766941662.0, + "step": 20099 + }, + { + "epoch": 2.556926599669253, + "ewc_loss": 0.08009640872478485, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004274289822205901, + "grad_norm": 9.405335426330566, + "learning_rate": 1e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.84676194190979, + "num_tokens": 766977725.0, + "step": 20100 + }, + { + "epoch": 2.5570538099478437, + "ewc_loss": 0.07910996675491333, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041756450082175434, + "grad_norm": 9.166726112365723, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8584003448486328, + "num_tokens": 767011011.0, + "step": 20101 + }, + { + "epoch": 2.5571810202264342, + "ewc_loss": 0.08014116436243057, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042787648271769285, + "grad_norm": 9.597702980041504, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8621237277984619, + "num_tokens": 767047078.0, + "step": 20102 + }, + { + "epoch": 2.5573082305050248, + "ewc_loss": 0.07900229096412659, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041404637158848345, + "grad_norm": 9.101175308227539, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8601024150848389, + "num_tokens": 767089208.0, + "step": 20103 + }, + { + "epoch": 2.5574354407836153, + "ewc_loss": 0.08067911118268967, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043325594742782414, + "grad_norm": 9.494593620300293, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8496441841125488, + "num_tokens": 767131225.0, + "step": 20104 + }, + { + "epoch": 2.557562651062206, + "ewc_loss": 0.07882185280323029, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041468339622952044, + "grad_norm": 9.13619613647461, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8510222434997559, + "num_tokens": 767165222.0, + "step": 20105 + }, + { + "epoch": 2.5576898613407963, + "ewc_loss": 0.08053134381771088, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043177822954021394, + "grad_norm": 9.437620162963867, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8644048571586609, + "num_tokens": 767200870.0, + "step": 20106 + }, + { + "epoch": 2.557817071619387, + "ewc_loss": 0.07921364903450012, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041860135388560593, + "grad_norm": 9.178536415100098, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8622989654541016, + "num_tokens": 767236881.0, + "step": 20107 + }, + { + "epoch": 2.5579442818979774, + "ewc_loss": 0.08018676936626434, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004283325106371194, + "grad_norm": 9.447020530700684, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8687455654144287, + "num_tokens": 767269716.0, + "step": 20108 + }, + { + "epoch": 2.558071492176568, + "ewc_loss": 0.07916297763586044, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041809462709352374, + "grad_norm": 9.17994213104248, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8541820049285889, + "num_tokens": 767306840.0, + "step": 20109 + }, + { + "epoch": 2.5581987024551585, + "ewc_loss": 0.07999841123819351, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004264489689376205, + "grad_norm": 9.396647453308105, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8764885067939758, + "num_tokens": 767346966.0, + "step": 20110 + }, + { + "epoch": 2.558325912733749, + "ewc_loss": 0.0792139321565628, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004186041187494993, + "grad_norm": 9.20525074005127, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8615652322769165, + "num_tokens": 767387084.0, + "step": 20111 + }, + { + "epoch": 2.5584531230123395, + "ewc_loss": 0.07997018098831177, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042616663267835975, + "grad_norm": 9.494723320007324, + "learning_rate": 1e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8446792364120483, + "num_tokens": 767417857.0, + "step": 20112 + }, + { + "epoch": 2.55858033329093, + "ewc_loss": 0.07901103049516678, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041657514520920813, + "grad_norm": 9.204573631286621, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8654571771621704, + "num_tokens": 767455771.0, + "step": 20113 + }, + { + "epoch": 2.5587075435695206, + "ewc_loss": 0.0798967033624649, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042543187737464905, + "grad_norm": 9.326895713806152, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8518145680427551, + "num_tokens": 767493187.0, + "step": 20114 + }, + { + "epoch": 2.558834753848111, + "ewc_loss": 0.07925198972225189, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041898476774804294, + "grad_norm": 9.200054168701172, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.85680091381073, + "num_tokens": 767534526.0, + "step": 20115 + }, + { + "epoch": 2.558961964126701, + "ewc_loss": 0.07976166903972626, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004240815178491175, + "grad_norm": 9.318540573120117, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8615318536758423, + "num_tokens": 767569297.0, + "step": 20116 + }, + { + "epoch": 2.559089174405292, + "ewc_loss": 0.07931908220052719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041965566924773157, + "grad_norm": 9.243136405944824, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8598676323890686, + "num_tokens": 767605860.0, + "step": 20117 + }, + { + "epoch": 2.5592163846838822, + "ewc_loss": 0.07968160510063171, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042328090057708323, + "grad_norm": 9.296910285949707, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8660107851028442, + "num_tokens": 767645252.0, + "step": 20118 + }, + { + "epoch": 2.559343594962473, + "ewc_loss": 0.07929959893226624, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004194608482066542, + "grad_norm": 9.266263961791992, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8633683323860168, + "num_tokens": 767681942.0, + "step": 20119 + }, + { + "epoch": 2.5594708052410633, + "ewc_loss": 0.07954266667366028, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042189148371107876, + "grad_norm": 9.263172149658203, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8643889427185059, + "num_tokens": 767720631.0, + "step": 20120 + }, + { + "epoch": 2.559598015519654, + "ewc_loss": 0.0795878916978836, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042234372813254595, + "grad_norm": 9.302818298339844, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8687071204185486, + "num_tokens": 767757018.0, + "step": 20121 + }, + { + "epoch": 2.5597252257982444, + "ewc_loss": 0.07930131256580353, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041947801946662366, + "grad_norm": 9.265596389770508, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.867293119430542, + "num_tokens": 767796046.0, + "step": 20122 + }, + { + "epoch": 2.559852436076835, + "ewc_loss": 0.07959528267383575, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004224176227580756, + "grad_norm": 9.265728950500488, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8758516907691956, + "num_tokens": 767835678.0, + "step": 20123 + }, + { + "epoch": 2.5599796463554254, + "ewc_loss": 0.07955031096935272, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004219679976813495, + "grad_norm": 9.301839828491211, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8782727718353271, + "num_tokens": 767870515.0, + "step": 20124 + }, + { + "epoch": 2.560106856634016, + "ewc_loss": 0.0795031264424324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042149610817432404, + "grad_norm": 9.315949440002441, + "learning_rate": 1e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8476566076278687, + "num_tokens": 767911573.0, + "step": 20125 + }, + { + "epoch": 2.5602340669126065, + "ewc_loss": 0.07952587306499481, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042172352550551295, + "grad_norm": 9.284233093261719, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8624911308288574, + "num_tokens": 767950161.0, + "step": 20126 + }, + { + "epoch": 2.560361277191197, + "ewc_loss": 0.08020822703838348, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004236642853356898, + "grad_norm": 9.4160737991333, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.860668957233429, + "num_tokens": 767989431.0, + "step": 20127 + }, + { + "epoch": 2.5604884874697875, + "ewc_loss": 0.07924218475818634, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041888668783940375, + "grad_norm": 9.284146308898926, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8742105960845947, + "num_tokens": 768023551.0, + "step": 20128 + }, + { + "epoch": 2.560615697748378, + "ewc_loss": 0.07976347208023071, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004240995622240007, + "grad_norm": 9.372203826904297, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8727135062217712, + "num_tokens": 768055317.0, + "step": 20129 + }, + { + "epoch": 2.5607429080269686, + "ewc_loss": 0.07935689389705658, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004200338153168559, + "grad_norm": 9.351512908935547, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8558609485626221, + "num_tokens": 768089575.0, + "step": 20130 + }, + { + "epoch": 2.560870118305559, + "ewc_loss": 0.07970096170902252, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042347447015345097, + "grad_norm": 9.365574836730957, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8543114066123962, + "num_tokens": 768131795.0, + "step": 20131 + }, + { + "epoch": 2.5609973285841496, + "ewc_loss": 0.07938464730978012, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042031132034026086, + "grad_norm": 9.318512916564941, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8687393665313721, + "num_tokens": 768165714.0, + "step": 20132 + }, + { + "epoch": 2.56112453886274, + "ewc_loss": 0.0795816108584404, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004222809511702508, + "grad_norm": 9.351285934448242, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8732390403747559, + "num_tokens": 768203475.0, + "step": 20133 + }, + { + "epoch": 2.5612517491413307, + "ewc_loss": 0.0791836753487587, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041830161353573203, + "grad_norm": 9.252375602722168, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.880488932132721, + "num_tokens": 768244841.0, + "step": 20134 + }, + { + "epoch": 2.561378959419921, + "ewc_loss": 0.07950139045715332, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004214787622913718, + "grad_norm": 9.35618782043457, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8602956533432007, + "num_tokens": 768286502.0, + "step": 20135 + }, + { + "epoch": 2.5615061696985117, + "ewc_loss": 0.0790938138961792, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004174030327703804, + "grad_norm": 9.257576942443848, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8713992238044739, + "num_tokens": 768320628.0, + "step": 20136 + }, + { + "epoch": 2.5616333799771023, + "ewc_loss": 0.0794103592634201, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042056848178617656, + "grad_norm": 9.373714447021484, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.856476902961731, + "num_tokens": 768353630.0, + "step": 20137 + }, + { + "epoch": 2.561760590255693, + "ewc_loss": 0.07931938767433167, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041721732122823596, + "grad_norm": 9.254388809204102, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8626343607902527, + "num_tokens": 768388170.0, + "step": 20138 + }, + { + "epoch": 2.5618878005342833, + "ewc_loss": 0.07942916452884674, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207564634270966, + "grad_norm": 9.332864761352539, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8802206516265869, + "num_tokens": 768428238.0, + "step": 20139 + }, + { + "epoch": 2.562015010812874, + "ewc_loss": 0.07902457565069199, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041671059443615377, + "grad_norm": 9.271890640258789, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8787644505500793, + "num_tokens": 768460075.0, + "step": 20140 + }, + { + "epoch": 2.562142221091464, + "ewc_loss": 0.07944355905056, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004209004109725356, + "grad_norm": 9.334382057189941, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8821338415145874, + "num_tokens": 768493982.0, + "step": 20141 + }, + { + "epoch": 2.562269431370055, + "ewc_loss": 0.07910973578691483, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041756220161914825, + "grad_norm": 9.2560396194458, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8676347136497498, + "num_tokens": 768529121.0, + "step": 20142 + }, + { + "epoch": 2.562396641648645, + "ewc_loss": 0.07935136556625366, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004199784598313272, + "grad_norm": 9.372222900390625, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8658115863800049, + "num_tokens": 768560730.0, + "step": 20143 + }, + { + "epoch": 2.562523851927236, + "ewc_loss": 0.07901972532272339, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004166621365584433, + "grad_norm": 9.208723068237305, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8762357234954834, + "num_tokens": 768603015.0, + "step": 20144 + }, + { + "epoch": 2.562651062205826, + "ewc_loss": 0.07943589240312576, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004208237514831126, + "grad_norm": 9.3174467086792, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8624388575553894, + "num_tokens": 768639847.0, + "step": 20145 + }, + { + "epoch": 2.5627782724844166, + "ewc_loss": 0.079120934009552, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004176742222625762, + "grad_norm": 9.260441780090332, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8618574738502502, + "num_tokens": 768681614.0, + "step": 20146 + }, + { + "epoch": 2.562905482763007, + "ewc_loss": 0.07938237488269806, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042028859024867415, + "grad_norm": 9.327919960021973, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8675436973571777, + "num_tokens": 768722052.0, + "step": 20147 + }, + { + "epoch": 2.5630326930415976, + "ewc_loss": 0.07910953462123871, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004175601643510163, + "grad_norm": 9.230298042297363, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8572286367416382, + "num_tokens": 768754191.0, + "step": 20148 + }, + { + "epoch": 2.563159903320188, + "ewc_loss": 0.07944557070732117, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042092055082321167, + "grad_norm": 9.301027297973633, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8542293310165405, + "num_tokens": 768792981.0, + "step": 20149 + }, + { + "epoch": 2.5632871135987787, + "ewc_loss": 0.07907813787460327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041724619222804904, + "grad_norm": 9.299915313720703, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8545105457305908, + "num_tokens": 768827040.0, + "step": 20150 + }, + { + "epoch": 2.5634143238773692, + "ewc_loss": 0.07924383878707886, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004189032770227641, + "grad_norm": 9.273350715637207, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8631622791290283, + "num_tokens": 768865669.0, + "step": 20151 + }, + { + "epoch": 2.5635415341559598, + "ewc_loss": 0.0792594850063324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004190596519038081, + "grad_norm": 9.274441719055176, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8626363277435303, + "num_tokens": 768903306.0, + "step": 20152 + }, + { + "epoch": 2.5636687444345503, + "ewc_loss": 0.07918202877044678, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004182851407676935, + "grad_norm": 9.300575256347656, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8677029013633728, + "num_tokens": 768946322.0, + "step": 20153 + }, + { + "epoch": 2.563795954713141, + "ewc_loss": 0.0794726237654686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042119110003113747, + "grad_norm": 9.280740737915039, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8821394443511963, + "num_tokens": 768987350.0, + "step": 20154 + }, + { + "epoch": 2.5639231649917313, + "ewc_loss": 0.07928887754678726, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004193536296952516, + "grad_norm": 9.315735816955566, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.873397171497345, + "num_tokens": 769030762.0, + "step": 20155 + }, + { + "epoch": 2.564050375270322, + "ewc_loss": 0.07930073142051697, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004194721987005323, + "grad_norm": 9.287818908691406, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8773025274276733, + "num_tokens": 769069827.0, + "step": 20156 + }, + { + "epoch": 2.5641775855489124, + "ewc_loss": 0.07933392375707626, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004198040987830609, + "grad_norm": 9.322813987731934, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8637741804122925, + "num_tokens": 769107742.0, + "step": 20157 + }, + { + "epoch": 2.564304795827503, + "ewc_loss": 0.07932229340076447, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041968777077272534, + "grad_norm": 9.317501068115234, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8619063496589661, + "num_tokens": 769153286.0, + "step": 20158 + }, + { + "epoch": 2.5644320061060935, + "ewc_loss": 0.07948435097932816, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042130835936404765, + "grad_norm": 9.342357635498047, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8664283156394958, + "num_tokens": 769192646.0, + "step": 20159 + }, + { + "epoch": 2.564559216384684, + "ewc_loss": 0.07933131605386734, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004197779926471412, + "grad_norm": 9.295848846435547, + "learning_rate": 1e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8420161604881287, + "num_tokens": 769230360.0, + "step": 20160 + }, + { + "epoch": 2.5646864266632745, + "ewc_loss": 0.0794297456741333, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207622550893575, + "grad_norm": 9.310115814208984, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8779951333999634, + "num_tokens": 769270754.0, + "step": 20161 + }, + { + "epoch": 2.564813636941865, + "ewc_loss": 0.07942929118871689, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207577439956367, + "grad_norm": 9.31149959564209, + "learning_rate": 1e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8410012722015381, + "num_tokens": 769310109.0, + "step": 20162 + }, + { + "epoch": 2.5649408472204556, + "ewc_loss": 0.07993718981742859, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042095393291674554, + "grad_norm": 9.288907051086426, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8631424903869629, + "num_tokens": 769352101.0, + "step": 20163 + }, + { + "epoch": 2.5650680574990457, + "ewc_loss": 0.07945630699396133, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042102791485376656, + "grad_norm": 9.33065128326416, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8792203068733215, + "num_tokens": 769386570.0, + "step": 20164 + }, + { + "epoch": 2.5651952677776366, + "ewc_loss": 0.07941257953643799, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004205906006973237, + "grad_norm": 9.24862003326416, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8766075372695923, + "num_tokens": 769427167.0, + "step": 20165 + }, + { + "epoch": 2.5653224780562267, + "ewc_loss": 0.07964570820331573, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042292187572456896, + "grad_norm": 9.337617874145508, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8533138632774353, + "num_tokens": 769465456.0, + "step": 20166 + }, + { + "epoch": 2.5654496883348177, + "ewc_loss": 0.07923662662506104, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041883112862706184, + "grad_norm": 9.262025833129883, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8770912289619446, + "num_tokens": 769497135.0, + "step": 20167 + }, + { + "epoch": 2.5655768986134078, + "ewc_loss": 0.07973785698413849, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004238433903083205, + "grad_norm": 9.385695457458496, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8592955470085144, + "num_tokens": 769533706.0, + "step": 20168 + }, + { + "epoch": 2.5657041088919987, + "ewc_loss": 0.07927466928958893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041921157389879227, + "grad_norm": 9.232324600219727, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8597695231437683, + "num_tokens": 769578698.0, + "step": 20169 + }, + { + "epoch": 2.565831319170589, + "ewc_loss": 0.0798480361700058, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042494520312175155, + "grad_norm": 9.352168083190918, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8779911994934082, + "num_tokens": 769619799.0, + "step": 20170 + }, + { + "epoch": 2.5659585294491793, + "ewc_loss": 0.07927416265010834, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004192064225208014, + "grad_norm": 9.21741771697998, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8641623258590698, + "num_tokens": 769655973.0, + "step": 20171 + }, + { + "epoch": 2.56608573972777, + "ewc_loss": 0.07994583249092102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042592312092892826, + "grad_norm": 9.397499084472656, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8602679967880249, + "num_tokens": 769692333.0, + "step": 20172 + }, + { + "epoch": 2.5662129500063604, + "ewc_loss": 0.07932467758655548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041971163591369987, + "grad_norm": 9.239356994628906, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.868324875831604, + "num_tokens": 769731804.0, + "step": 20173 + }, + { + "epoch": 2.566340160284951, + "ewc_loss": 0.08000635355710983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004265283641871065, + "grad_norm": 9.419663429260254, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8700283765792847, + "num_tokens": 769767409.0, + "step": 20174 + }, + { + "epoch": 2.5664673705635415, + "ewc_loss": 0.07919497042894363, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004184145655017346, + "grad_norm": 9.222419738769531, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8601995706558228, + "num_tokens": 769806468.0, + "step": 20175 + }, + { + "epoch": 2.566594580842132, + "ewc_loss": 0.08019201457500458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042838501394726336, + "grad_norm": 9.447997093200684, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8712366819381714, + "num_tokens": 769842025.0, + "step": 20176 + }, + { + "epoch": 2.5667217911207225, + "ewc_loss": 0.0791253075003624, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041771793621592224, + "grad_norm": 9.205361366271973, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8596072196960449, + "num_tokens": 769884125.0, + "step": 20177 + }, + { + "epoch": 2.566849001399313, + "ewc_loss": 0.0801147073507309, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004276119580026716, + "grad_norm": 9.43252944946289, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8523677587509155, + "num_tokens": 769920523.0, + "step": 20178 + }, + { + "epoch": 2.5669762116779036, + "ewc_loss": 0.07904692739248276, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004169341118540615, + "grad_norm": 9.248420715332031, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8623920679092407, + "num_tokens": 769960141.0, + "step": 20179 + }, + { + "epoch": 2.567103421956494, + "ewc_loss": 0.07999202609062195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004263851442374289, + "grad_norm": 9.421710968017578, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8661078214645386, + "num_tokens": 769995281.0, + "step": 20180 + }, + { + "epoch": 2.5672306322350846, + "ewc_loss": 0.0792979747056961, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004194446373730898, + "grad_norm": 9.270123481750488, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8686897158622742, + "num_tokens": 770028230.0, + "step": 20181 + }, + { + "epoch": 2.567357842513675, + "ewc_loss": 0.07984497398138046, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042491458589211106, + "grad_norm": 9.36248779296875, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8712068796157837, + "num_tokens": 770068996.0, + "step": 20182 + }, + { + "epoch": 2.5674850527922657, + "ewc_loss": 0.07926934957504272, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004191583429928869, + "grad_norm": 9.266363143920898, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8530129790306091, + "num_tokens": 770107018.0, + "step": 20183 + }, + { + "epoch": 2.567612263070856, + "ewc_loss": 0.0796954482793808, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004234193474985659, + "grad_norm": 9.388090133666992, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8627352714538574, + "num_tokens": 770140251.0, + "step": 20184 + }, + { + "epoch": 2.5677394733494467, + "ewc_loss": 0.07920905947685242, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004185554280411452, + "grad_norm": 9.261335372924805, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8561267852783203, + "num_tokens": 770182241.0, + "step": 20185 + }, + { + "epoch": 2.5678666836280373, + "ewc_loss": 0.07994583249092102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004259231500327587, + "grad_norm": 9.37877082824707, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8788545727729797, + "num_tokens": 770223267.0, + "step": 20186 + }, + { + "epoch": 2.567993893906628, + "ewc_loss": 0.07915898412466049, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004180546966381371, + "grad_norm": 9.202484130859375, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8537477254867554, + "num_tokens": 770261856.0, + "step": 20187 + }, + { + "epoch": 2.5681211041852183, + "ewc_loss": 0.0800480842590332, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042694565490819514, + "grad_norm": 9.424702644348145, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8536375164985657, + "num_tokens": 770299762.0, + "step": 20188 + }, + { + "epoch": 2.5682483144638084, + "ewc_loss": 0.0791303887963295, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041776872240006924, + "grad_norm": 9.25186824798584, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8586927652359009, + "num_tokens": 770333690.0, + "step": 20189 + }, + { + "epoch": 2.5683755247423994, + "ewc_loss": 0.08000253140926361, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042649012175388634, + "grad_norm": 9.321343421936035, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.870582103729248, + "num_tokens": 770375755.0, + "step": 20190 + }, + { + "epoch": 2.5685027350209895, + "ewc_loss": 0.07939134538173676, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004203782882541418, + "grad_norm": 9.303528785705566, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8737077713012695, + "num_tokens": 770417805.0, + "step": 20191 + }, + { + "epoch": 2.5686299452995804, + "ewc_loss": 0.07968506217002869, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042331547592766583, + "grad_norm": 9.394862174987793, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8636739253997803, + "num_tokens": 770458183.0, + "step": 20192 + }, + { + "epoch": 2.5687571555781705, + "ewc_loss": 0.07924269884824753, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004188918392173946, + "grad_norm": 9.296120643615723, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8665287494659424, + "num_tokens": 770488973.0, + "step": 20193 + }, + { + "epoch": 2.5688843658567615, + "ewc_loss": 0.07961343973875046, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042259925976395607, + "grad_norm": 9.320228576660156, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.858435332775116, + "num_tokens": 770529030.0, + "step": 20194 + }, + { + "epoch": 2.5690115761353516, + "ewc_loss": 0.07953671365976334, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004218319954816252, + "grad_norm": 9.286334037780762, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8532336354255676, + "num_tokens": 770574697.0, + "step": 20195 + }, + { + "epoch": 2.569138786413942, + "ewc_loss": 0.07966172695159912, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004230820923112333, + "grad_norm": 9.299652099609375, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.871967613697052, + "num_tokens": 770615417.0, + "step": 20196 + }, + { + "epoch": 2.5692659966925326, + "ewc_loss": 0.0797269344329834, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004237341636326164, + "grad_norm": 9.38244915008545, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8511711359024048, + "num_tokens": 770653228.0, + "step": 20197 + }, + { + "epoch": 2.569393206971123, + "ewc_loss": 0.07945359498262405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210007900837809, + "grad_norm": 9.234721183776855, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8804887533187866, + "num_tokens": 770693876.0, + "step": 20198 + }, + { + "epoch": 2.5695204172497137, + "ewc_loss": 0.08002425730228424, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042670738184824586, + "grad_norm": 9.406740188598633, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8628242015838623, + "num_tokens": 770728754.0, + "step": 20199 + }, + { + "epoch": 2.5696476275283042, + "ewc_loss": 0.07957178354263306, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042218269663862884, + "grad_norm": 9.306931495666504, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.862960159778595, + "num_tokens": 770767961.0, + "step": 20200 + }, + { + "epoch": 2.5697748378068948, + "ewc_loss": 0.0799444168806076, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042590906377881765, + "grad_norm": 9.366562843322754, + "learning_rate": 1e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8376275897026062, + "num_tokens": 770808880.0, + "step": 20201 + }, + { + "epoch": 2.5699020480854853, + "ewc_loss": 0.07970789074897766, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000423543737269938, + "grad_norm": 9.30754280090332, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8591036796569824, + "num_tokens": 770846483.0, + "step": 20202 + }, + { + "epoch": 2.570029258364076, + "ewc_loss": 0.08012557029724121, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277205152902752, + "grad_norm": 9.385897636413574, + "learning_rate": 1e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8440459966659546, + "num_tokens": 770884234.0, + "step": 20203 + }, + { + "epoch": 2.5701564686426663, + "ewc_loss": 0.07966543734073639, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004231191996950656, + "grad_norm": 9.31335163116455, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8498920202255249, + "num_tokens": 770918785.0, + "step": 20204 + }, + { + "epoch": 2.570283678921257, + "ewc_loss": 0.08009748160839081, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042743966332636774, + "grad_norm": 9.349714279174805, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8860905170440674, + "num_tokens": 770956889.0, + "step": 20205 + }, + { + "epoch": 2.5704108891998474, + "ewc_loss": 0.07963826507329941, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042284748633392155, + "grad_norm": 9.283145904541016, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8529422283172607, + "num_tokens": 770994937.0, + "step": 20206 + }, + { + "epoch": 2.570538099478438, + "ewc_loss": 0.07990089058876038, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004254736995790154, + "grad_norm": 9.317719459533691, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8716834187507629, + "num_tokens": 771031451.0, + "step": 20207 + }, + { + "epoch": 2.5706653097570284, + "ewc_loss": 0.0799049586057663, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042551447404548526, + "grad_norm": 9.352169036865234, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8725637197494507, + "num_tokens": 771061575.0, + "step": 20208 + }, + { + "epoch": 2.570792520035619, + "ewc_loss": 0.07969047129154205, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042336960905231535, + "grad_norm": 9.298727035522461, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8644170761108398, + "num_tokens": 771103366.0, + "step": 20209 + }, + { + "epoch": 2.5709197303142095, + "ewc_loss": 0.07994165271520615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004258813860360533, + "grad_norm": 9.324665069580078, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8694099187850952, + "num_tokens": 771142178.0, + "step": 20210 + }, + { + "epoch": 2.5710469405928, + "ewc_loss": 0.07982644438743591, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004247292526997626, + "grad_norm": 9.329834938049316, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8640057444572449, + "num_tokens": 771179296.0, + "step": 20211 + }, + { + "epoch": 2.5711741508713906, + "ewc_loss": 0.07981681078672409, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042463294812478125, + "grad_norm": 9.356672286987305, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.857793927192688, + "num_tokens": 771219084.0, + "step": 20212 + }, + { + "epoch": 2.571301361149981, + "ewc_loss": 0.07979577779769897, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004244226438459009, + "grad_norm": 9.340631484985352, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8708198070526123, + "num_tokens": 771250609.0, + "step": 20213 + }, + { + "epoch": 2.571428571428571, + "ewc_loss": 0.0799308717250824, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042577358544804156, + "grad_norm": 9.35840129852295, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8631227016448975, + "num_tokens": 771286191.0, + "step": 20214 + }, + { + "epoch": 2.571555781707162, + "ewc_loss": 0.07966223359107971, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004230871854815632, + "grad_norm": 9.299190521240234, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8620638847351074, + "num_tokens": 771320898.0, + "step": 20215 + }, + { + "epoch": 2.5716829919857522, + "ewc_loss": 0.07986511290073395, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004251160135027021, + "grad_norm": 9.406899452209473, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8560529351234436, + "num_tokens": 771353977.0, + "step": 20216 + }, + { + "epoch": 2.571810202264343, + "ewc_loss": 0.07948639988899231, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042132887756451964, + "grad_norm": 9.246513366699219, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8715378642082214, + "num_tokens": 771391960.0, + "step": 20217 + }, + { + "epoch": 2.5719374125429333, + "ewc_loss": 0.08008500188589096, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042731486610136926, + "grad_norm": 9.39862060546875, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8557860851287842, + "num_tokens": 771427764.0, + "step": 20218 + }, + { + "epoch": 2.572064622821524, + "ewc_loss": 0.07949580252170563, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042142291204072535, + "grad_norm": 9.2245454788208, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8649033308029175, + "num_tokens": 771469706.0, + "step": 20219 + }, + { + "epoch": 2.5721918331001143, + "ewc_loss": 0.08049636334180832, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004289870848879218, + "grad_norm": 9.389425277709961, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8852608799934387, + "num_tokens": 771504296.0, + "step": 20220 + }, + { + "epoch": 2.572319043378705, + "ewc_loss": 0.07957946509122849, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042225950164720416, + "grad_norm": 9.33199405670166, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8640594482421875, + "num_tokens": 771544496.0, + "step": 20221 + }, + { + "epoch": 2.5724462536572954, + "ewc_loss": 0.07997839897871017, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004262488509994, + "grad_norm": 9.341537475585938, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.872904896736145, + "num_tokens": 771584322.0, + "step": 20222 + }, + { + "epoch": 2.572573463935886, + "ewc_loss": 0.0796307772397995, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004227726021781564, + "grad_norm": 9.330557823181152, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8738346099853516, + "num_tokens": 771621424.0, + "step": 20223 + }, + { + "epoch": 2.5727006742144765, + "ewc_loss": 0.07973122596740723, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000423777149990201, + "grad_norm": 9.325054168701172, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8651665449142456, + "num_tokens": 771658173.0, + "step": 20224 + }, + { + "epoch": 2.572827884493067, + "ewc_loss": 0.07962616533041, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004227265017107129, + "grad_norm": 9.303954124450684, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8564154505729675, + "num_tokens": 771698182.0, + "step": 20225 + }, + { + "epoch": 2.5729550947716575, + "ewc_loss": 0.07984523475170135, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004249172343406826, + "grad_norm": 9.360906600952148, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8492468595504761, + "num_tokens": 771731559.0, + "step": 20226 + }, + { + "epoch": 2.573082305050248, + "ewc_loss": 0.07941129058599472, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042057776590809226, + "grad_norm": 9.256080627441406, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8609727621078491, + "num_tokens": 771774362.0, + "step": 20227 + }, + { + "epoch": 2.5732095153288386, + "ewc_loss": 0.07974015921354294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042386644054204226, + "grad_norm": 9.322449684143066, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8535030484199524, + "num_tokens": 771810540.0, + "step": 20228 + }, + { + "epoch": 2.573336725607429, + "ewc_loss": 0.07946887612342834, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004211535560898483, + "grad_norm": 9.328661918640137, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8683847188949585, + "num_tokens": 771846221.0, + "step": 20229 + }, + { + "epoch": 2.5734639358860196, + "ewc_loss": 0.0794571042060852, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210358310956508, + "grad_norm": 9.252370834350586, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8733464479446411, + "num_tokens": 771882268.0, + "step": 20230 + }, + { + "epoch": 2.57359114616461, + "ewc_loss": 0.0797441303730011, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042390613816678524, + "grad_norm": 9.326866149902344, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.862105131149292, + "num_tokens": 771913176.0, + "step": 20231 + }, + { + "epoch": 2.5737183564432007, + "ewc_loss": 0.07984168827533722, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042244026553817093, + "grad_norm": 9.372422218322754, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8587343692779541, + "num_tokens": 771948163.0, + "step": 20232 + }, + { + "epoch": 2.573845566721791, + "ewc_loss": 0.07969146966934204, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042337950435467064, + "grad_norm": 9.299680709838867, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8770102262496948, + "num_tokens": 771988296.0, + "step": 20233 + }, + { + "epoch": 2.5739727770003817, + "ewc_loss": 0.07973027229309082, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004237675166223198, + "grad_norm": 9.344555854797363, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8711816072463989, + "num_tokens": 772024330.0, + "step": 20234 + }, + { + "epoch": 2.5740999872789723, + "ewc_loss": 0.07936625927686691, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201274423394352, + "grad_norm": 9.333016395568848, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8518251180648804, + "num_tokens": 772062291.0, + "step": 20235 + }, + { + "epoch": 2.574227197557563, + "ewc_loss": 0.079848513007164, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424949947046116, + "grad_norm": 9.379157066345215, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8660250902175903, + "num_tokens": 772110239.0, + "step": 20236 + }, + { + "epoch": 2.5743544078361533, + "ewc_loss": 0.07976177334785461, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042164113256148994, + "grad_norm": 9.325441360473633, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8603465557098389, + "num_tokens": 772153201.0, + "step": 20237 + }, + { + "epoch": 2.574481618114744, + "ewc_loss": 0.0797773227095604, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004242380673531443, + "grad_norm": 9.39584732055664, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8653289675712585, + "num_tokens": 772186816.0, + "step": 20238 + }, + { + "epoch": 2.574608828393334, + "ewc_loss": 0.07974673062562943, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042149075306952, + "grad_norm": 9.277706146240234, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8747665882110596, + "num_tokens": 772224238.0, + "step": 20239 + }, + { + "epoch": 2.574736038671925, + "ewc_loss": 0.07985414564609528, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042500629206188023, + "grad_norm": 9.364749908447266, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8746368885040283, + "num_tokens": 772263505.0, + "step": 20240 + }, + { + "epoch": 2.574863248950515, + "ewc_loss": 0.07949715852737427, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004214364744257182, + "grad_norm": 9.297308921813965, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8738604784011841, + "num_tokens": 772303205.0, + "step": 20241 + }, + { + "epoch": 2.574990459229106, + "ewc_loss": 0.0796566903591156, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042303180089220405, + "grad_norm": 9.291265487670898, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8695967197418213, + "num_tokens": 772344495.0, + "step": 20242 + }, + { + "epoch": 2.575117669507696, + "ewc_loss": 0.07965768128633499, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004230416670907289, + "grad_norm": 9.398935317993164, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.859878420829773, + "num_tokens": 772380296.0, + "step": 20243 + }, + { + "epoch": 2.5752448797862866, + "ewc_loss": 0.07942377030849457, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207025049254298, + "grad_norm": 9.230077743530273, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8777666091918945, + "num_tokens": 772423933.0, + "step": 20244 + }, + { + "epoch": 2.575372090064877, + "ewc_loss": 0.07985945791006088, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042505940655246377, + "grad_norm": 9.370452880859375, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8500277996063232, + "num_tokens": 772464514.0, + "step": 20245 + }, + { + "epoch": 2.5754993003434676, + "ewc_loss": 0.07946597039699554, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042112459777854383, + "grad_norm": 9.245551109313965, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8827225565910339, + "num_tokens": 772498501.0, + "step": 20246 + }, + { + "epoch": 2.575626510622058, + "ewc_loss": 0.07997047901153564, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004261696885805577, + "grad_norm": 9.388420104980469, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8611414432525635, + "num_tokens": 772538072.0, + "step": 20247 + }, + { + "epoch": 2.5757537209006487, + "ewc_loss": 0.07930593192577362, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004195242072455585, + "grad_norm": 9.27981185913086, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8646461963653564, + "num_tokens": 772577434.0, + "step": 20248 + }, + { + "epoch": 2.575880931179239, + "ewc_loss": 0.08004552125930786, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042692007264122367, + "grad_norm": 9.352502822875977, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8803768157958984, + "num_tokens": 772612917.0, + "step": 20249 + }, + { + "epoch": 2.5760081414578297, + "ewc_loss": 0.07941609621047974, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004206257581245154, + "grad_norm": 9.299175262451172, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8758763074874878, + "num_tokens": 772652939.0, + "step": 20250 + }, + { + "epoch": 2.5761353517364203, + "ewc_loss": 0.07989853620529175, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042545018368400633, + "grad_norm": 9.372501373291016, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8715495467185974, + "num_tokens": 772688018.0, + "step": 20251 + }, + { + "epoch": 2.576262562015011, + "ewc_loss": 0.0796591192483902, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042305601527914405, + "grad_norm": 9.289473533630371, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8553547263145447, + "num_tokens": 772728071.0, + "step": 20252 + }, + { + "epoch": 2.5763897722936013, + "ewc_loss": 0.07995443791151047, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004260092100594193, + "grad_norm": 9.439496994018555, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8642289638519287, + "num_tokens": 772762730.0, + "step": 20253 + }, + { + "epoch": 2.576516982572192, + "ewc_loss": 0.07948268204927444, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042129168286919594, + "grad_norm": 9.277746200561523, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.868900716304779, + "num_tokens": 772799125.0, + "step": 20254 + }, + { + "epoch": 2.5766441928507824, + "ewc_loss": 0.08006392419338226, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042710406705737114, + "grad_norm": 9.393590927124023, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8761005401611328, + "num_tokens": 772838657.0, + "step": 20255 + }, + { + "epoch": 2.576771403129373, + "ewc_loss": 0.07940755784511566, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042054042569361627, + "grad_norm": 9.255735397338867, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8702167272567749, + "num_tokens": 772876470.0, + "step": 20256 + }, + { + "epoch": 2.5768986134079634, + "ewc_loss": 0.08013159036636353, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042778076021932065, + "grad_norm": 9.42778205871582, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8585083484649658, + "num_tokens": 772910369.0, + "step": 20257 + }, + { + "epoch": 2.577025823686554, + "ewc_loss": 0.07926735281944275, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004191383777651936, + "grad_norm": 9.228734970092773, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8699018955230713, + "num_tokens": 772948129.0, + "step": 20258 + }, + { + "epoch": 2.5771530339651445, + "ewc_loss": 0.08012789487838745, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042774376925081015, + "grad_norm": 9.41763973236084, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8910545706748962, + "num_tokens": 772985359.0, + "step": 20259 + }, + { + "epoch": 2.577280244243735, + "ewc_loss": 0.07994753122329712, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042105739703401923, + "grad_norm": 9.310298919677734, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8662853240966797, + "num_tokens": 773019402.0, + "step": 20260 + }, + { + "epoch": 2.5774074545223256, + "ewc_loss": 0.07998581975698471, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042632303666323423, + "grad_norm": 9.383893013000488, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8640897274017334, + "num_tokens": 773059176.0, + "step": 20261 + }, + { + "epoch": 2.5775346648009156, + "ewc_loss": 0.07951131463050842, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042157794814556837, + "grad_norm": 9.29500961303711, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8665928244590759, + "num_tokens": 773094274.0, + "step": 20262 + }, + { + "epoch": 2.5776618750795066, + "ewc_loss": 0.08008705079555511, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004273352969903499, + "grad_norm": 9.432123184204102, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.863483726978302, + "num_tokens": 773135679.0, + "step": 20263 + }, + { + "epoch": 2.5777890853580967, + "ewc_loss": 0.07939223945140839, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042038719402626157, + "grad_norm": 9.249040603637695, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.859961986541748, + "num_tokens": 773174270.0, + "step": 20264 + }, + { + "epoch": 2.5779162956366877, + "ewc_loss": 0.0800701379776001, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042716620373539627, + "grad_norm": 9.422542572021484, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8682659864425659, + "num_tokens": 773213582.0, + "step": 20265 + }, + { + "epoch": 2.5780435059152778, + "ewc_loss": 0.07938793301582336, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042034414946101606, + "grad_norm": 9.351788520812988, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8599849939346313, + "num_tokens": 773244980.0, + "step": 20266 + }, + { + "epoch": 2.5781707161938687, + "ewc_loss": 0.07999670505523682, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042643185588531196, + "grad_norm": 9.388386726379395, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8778781890869141, + "num_tokens": 773287395.0, + "step": 20267 + }, + { + "epoch": 2.578297926472459, + "ewc_loss": 0.07953812181949615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042184608173556626, + "grad_norm": 9.355951309204102, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8615662455558777, + "num_tokens": 773324593.0, + "step": 20268 + }, + { + "epoch": 2.5784251367510493, + "ewc_loss": 0.07992985099554062, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004233219369780272, + "grad_norm": 9.316488265991211, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8593985438346863, + "num_tokens": 773361802.0, + "step": 20269 + }, + { + "epoch": 2.57855234702964, + "ewc_loss": 0.07977807521820068, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004242455761414021, + "grad_norm": 9.298300743103027, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8699007630348206, + "num_tokens": 773401348.0, + "step": 20270 + }, + { + "epoch": 2.5786795573082304, + "ewc_loss": 0.0797816663980484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042428154847584665, + "grad_norm": 9.431591033935547, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8814132809638977, + "num_tokens": 773439040.0, + "step": 20271 + }, + { + "epoch": 2.578806767586821, + "ewc_loss": 0.07953880727291107, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042185289203189313, + "grad_norm": 9.369714736938477, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.877975344657898, + "num_tokens": 773471382.0, + "step": 20272 + }, + { + "epoch": 2.5789339778654115, + "ewc_loss": 0.08005672693252563, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004245906602591276, + "grad_norm": 9.37294864654541, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8536367416381836, + "num_tokens": 773508924.0, + "step": 20273 + }, + { + "epoch": 2.579061188144002, + "ewc_loss": 0.07956120371818542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042207693331874907, + "grad_norm": 9.239744186401367, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.879639744758606, + "num_tokens": 773550219.0, + "step": 20274 + }, + { + "epoch": 2.5791883984225925, + "ewc_loss": 0.08013097941875458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277746193110943, + "grad_norm": 9.402894020080566, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8613792657852173, + "num_tokens": 773589363.0, + "step": 20275 + }, + { + "epoch": 2.579315608701183, + "ewc_loss": 0.07944917678833008, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042095661046914756, + "grad_norm": 9.293545722961426, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8640446066856384, + "num_tokens": 773630289.0, + "step": 20276 + }, + { + "epoch": 2.5794428189797736, + "ewc_loss": 0.08022059500217438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042867078445851803, + "grad_norm": 9.422688484191895, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8656136989593506, + "num_tokens": 773668660.0, + "step": 20277 + }, + { + "epoch": 2.579570029258364, + "ewc_loss": 0.07951020449399948, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004215668886899948, + "grad_norm": 9.292156219482422, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8612476587295532, + "num_tokens": 773704318.0, + "step": 20278 + }, + { + "epoch": 2.5796972395369546, + "ewc_loss": 0.08027222752571106, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004291871446184814, + "grad_norm": 9.488765716552734, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8497863411903381, + "num_tokens": 773745821.0, + "step": 20279 + }, + { + "epoch": 2.579824449815545, + "ewc_loss": 0.07938389480113983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004203038406558335, + "grad_norm": 9.251114845275879, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.874706506729126, + "num_tokens": 773782880.0, + "step": 20280 + }, + { + "epoch": 2.5799516600941357, + "ewc_loss": 0.08024292439222336, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004288940690457821, + "grad_norm": 9.455174446105957, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8502919673919678, + "num_tokens": 773824880.0, + "step": 20281 + }, + { + "epoch": 2.580078870372726, + "ewc_loss": 0.07942909002304077, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004207557940389961, + "grad_norm": 9.3011474609375, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8681464195251465, + "num_tokens": 773863210.0, + "step": 20282 + }, + { + "epoch": 2.5802060806513167, + "ewc_loss": 0.0801178514957428, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042764333193190396, + "grad_norm": 9.449589729309082, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8727709650993347, + "num_tokens": 773903432.0, + "step": 20283 + }, + { + "epoch": 2.5803332909299073, + "ewc_loss": 0.07943928241729736, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004208576865494251, + "grad_norm": 9.299971580505371, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8730396032333374, + "num_tokens": 773942424.0, + "step": 20284 + }, + { + "epoch": 2.580460501208498, + "ewc_loss": 0.0800807774066925, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042727263644337654, + "grad_norm": 9.417058944702148, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8631950616836548, + "num_tokens": 773976598.0, + "step": 20285 + }, + { + "epoch": 2.5805877114870883, + "ewc_loss": 0.07951068878173828, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042157177813351154, + "grad_norm": 9.31641674041748, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8582199811935425, + "num_tokens": 774016696.0, + "step": 20286 + }, + { + "epoch": 2.5807149217656784, + "ewc_loss": 0.08009538054466248, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042741867946460843, + "grad_norm": 9.386248588562012, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8570449948310852, + "num_tokens": 774059046.0, + "step": 20287 + }, + { + "epoch": 2.5808421320442694, + "ewc_loss": 0.07954065501689911, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004218714020680636, + "grad_norm": 9.276659965515137, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8614778518676758, + "num_tokens": 774095322.0, + "step": 20288 + }, + { + "epoch": 2.5809693423228595, + "ewc_loss": 0.08026635646820068, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042912844219245017, + "grad_norm": 9.509598731994629, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8536151051521301, + "num_tokens": 774134506.0, + "step": 20289 + }, + { + "epoch": 2.5810965526014504, + "ewc_loss": 0.07933205366134644, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004197853268124163, + "grad_norm": 9.227032661437988, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8811753988265991, + "num_tokens": 774176997.0, + "step": 20290 + }, + { + "epoch": 2.5812237628800405, + "ewc_loss": 0.08030888438224792, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042955364915542305, + "grad_norm": 9.482236862182617, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8580928444862366, + "num_tokens": 774213900.0, + "step": 20291 + }, + { + "epoch": 2.5813509731586315, + "ewc_loss": 0.07931864261627197, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041965124546550214, + "grad_norm": 9.281669616699219, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8578194379806519, + "num_tokens": 774259687.0, + "step": 20292 + }, + { + "epoch": 2.5814781834372216, + "ewc_loss": 0.08017610013484955, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042822587420232594, + "grad_norm": 9.386869430541992, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8707133531570435, + "num_tokens": 774297200.0, + "step": 20293 + }, + { + "epoch": 2.581605393715812, + "ewc_loss": 0.07953576743602753, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042182253673672676, + "grad_norm": 9.312411308288574, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8784786462783813, + "num_tokens": 774336642.0, + "step": 20294 + }, + { + "epoch": 2.5817326039944026, + "ewc_loss": 0.08007945120334625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042725930688902736, + "grad_norm": 9.555428504943848, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8495163917541504, + "num_tokens": 774376762.0, + "step": 20295 + }, + { + "epoch": 2.581859814272993, + "ewc_loss": 0.07933114469051361, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004197763337288052, + "grad_norm": 9.242703437805176, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8740883469581604, + "num_tokens": 774421360.0, + "step": 20296 + }, + { + "epoch": 2.5819870245515837, + "ewc_loss": 0.08024232089519501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000428888073656708, + "grad_norm": 9.427903175354004, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8718885183334351, + "num_tokens": 774460850.0, + "step": 20297 + }, + { + "epoch": 2.582114234830174, + "ewc_loss": 0.07949474453926086, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004214123182464391, + "grad_norm": 9.298885345458984, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8636575937271118, + "num_tokens": 774497542.0, + "step": 20298 + }, + { + "epoch": 2.5822414451087647, + "ewc_loss": 0.08011844754219055, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004276493564248085, + "grad_norm": 9.45455551147461, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.854669988155365, + "num_tokens": 774538164.0, + "step": 20299 + }, + { + "epoch": 2.5823686553873553, + "ewc_loss": 0.0795523002743721, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000421987846493721, + "grad_norm": 9.329296112060547, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8608193397521973, + "num_tokens": 774576118.0, + "step": 20300 + }, + { + "epoch": 2.582495865665946, + "ewc_loss": 0.07998497784137726, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004263145674485713, + "grad_norm": 9.325575828552246, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8873178958892822, + "num_tokens": 774617506.0, + "step": 20301 + }, + { + "epoch": 2.5826230759445363, + "ewc_loss": 0.07983686029911041, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042483341530896723, + "grad_norm": 9.409163475036621, + "learning_rate": 1e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8403975367546082, + "num_tokens": 774653720.0, + "step": 20302 + }, + { + "epoch": 2.582750286223127, + "ewc_loss": 0.07971224188804626, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004235873057041317, + "grad_norm": 9.347789764404297, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8856929540634155, + "num_tokens": 774691056.0, + "step": 20303 + }, + { + "epoch": 2.5828774965017174, + "ewc_loss": 0.08004201948642731, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004268850898370147, + "grad_norm": 9.412223815917969, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8592146635055542, + "num_tokens": 774726514.0, + "step": 20304 + }, + { + "epoch": 2.583004706780308, + "ewc_loss": 0.07960927486419678, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042255755397491157, + "grad_norm": 9.313470840454102, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.856208086013794, + "num_tokens": 774760831.0, + "step": 20305 + }, + { + "epoch": 2.5831319170588984, + "ewc_loss": 0.08014027774333954, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042786760604940355, + "grad_norm": 9.370771408081055, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.875841498374939, + "num_tokens": 774799399.0, + "step": 20306 + }, + { + "epoch": 2.583259127337489, + "ewc_loss": 0.07977332919836044, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042419813689775765, + "grad_norm": 9.358777046203613, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8576409816741943, + "num_tokens": 774839910.0, + "step": 20307 + }, + { + "epoch": 2.5833863376160795, + "ewc_loss": 0.07984814792871475, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042494633817113936, + "grad_norm": 9.33393383026123, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8670649528503418, + "num_tokens": 774875694.0, + "step": 20308 + }, + { + "epoch": 2.58351354789467, + "ewc_loss": 0.07980283349752426, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424493191530928, + "grad_norm": 9.360981941223145, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8674634695053101, + "num_tokens": 774908020.0, + "step": 20309 + }, + { + "epoch": 2.5836407581732606, + "ewc_loss": 0.08003497123718262, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042437310912646353, + "grad_norm": 9.339249610900879, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8548584580421448, + "num_tokens": 774946716.0, + "step": 20310 + }, + { + "epoch": 2.583767968451851, + "ewc_loss": 0.07992666959762573, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004257315304130316, + "grad_norm": 9.360695838928223, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8597696423530579, + "num_tokens": 774981732.0, + "step": 20311 + }, + { + "epoch": 2.583895178730441, + "ewc_loss": 0.07996954023838043, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004237188259139657, + "grad_norm": 9.288338661193848, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8700253963470459, + "num_tokens": 775015280.0, + "step": 20312 + }, + { + "epoch": 2.584022389009032, + "ewc_loss": 0.07991118729114532, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042557669803500175, + "grad_norm": 9.381084442138672, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8730316162109375, + "num_tokens": 775054518.0, + "step": 20313 + }, + { + "epoch": 2.5841495992876222, + "ewc_loss": 0.07963148504495621, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004227797035127878, + "grad_norm": 9.338701248168945, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8714897632598877, + "num_tokens": 775091548.0, + "step": 20314 + }, + { + "epoch": 2.584276809566213, + "ewc_loss": 0.07979963719844818, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424461264628917, + "grad_norm": 9.28143310546875, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8530145287513733, + "num_tokens": 775130916.0, + "step": 20315 + }, + { + "epoch": 2.5844040198448033, + "ewc_loss": 0.07978914678096771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004243562580086291, + "grad_norm": 9.418926239013672, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8777192831039429, + "num_tokens": 775165875.0, + "step": 20316 + }, + { + "epoch": 2.584531230123394, + "ewc_loss": 0.07947224378585815, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042118728742934763, + "grad_norm": 9.353187561035156, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.851047158241272, + "num_tokens": 775211463.0, + "step": 20317 + }, + { + "epoch": 2.5846584404019843, + "ewc_loss": 0.07971517741680145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004236166423652321, + "grad_norm": 9.354937553405762, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8747611045837402, + "num_tokens": 775248338.0, + "step": 20318 + }, + { + "epoch": 2.584785650680575, + "ewc_loss": 0.07953797280788422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042184453923255205, + "grad_norm": 9.435678482055664, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8534331321716309, + "num_tokens": 775285707.0, + "step": 20319 + }, + { + "epoch": 2.5849128609591654, + "ewc_loss": 0.07946275174617767, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004210923798382282, + "grad_norm": 9.337640762329102, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.873171329498291, + "num_tokens": 775322682.0, + "step": 20320 + }, + { + "epoch": 2.585040071237756, + "ewc_loss": 0.07979428023099899, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004244076553732157, + "grad_norm": 9.403074264526367, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8753806948661804, + "num_tokens": 775358274.0, + "step": 20321 + }, + { + "epoch": 2.5851672815163464, + "ewc_loss": 0.07943032681941986, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042076807585544884, + "grad_norm": 9.400150299072266, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8599121570587158, + "num_tokens": 775394472.0, + "step": 20322 + }, + { + "epoch": 2.585294491794937, + "ewc_loss": 0.07950591295957565, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042152396054007113, + "grad_norm": 9.254539489746094, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8758252859115601, + "num_tokens": 775433647.0, + "step": 20323 + }, + { + "epoch": 2.5854217020735275, + "ewc_loss": 0.0798858255147934, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004253231454640627, + "grad_norm": 9.436212539672852, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8549328446388245, + "num_tokens": 775469159.0, + "step": 20324 + }, + { + "epoch": 2.585548912352118, + "ewc_loss": 0.07926984131336212, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041916323243640363, + "grad_norm": 9.24724292755127, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8736335635185242, + "num_tokens": 775507329.0, + "step": 20325 + }, + { + "epoch": 2.5856761226307086, + "ewc_loss": 0.08008143305778503, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004272791848052293, + "grad_norm": 9.425117492675781, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.863923966884613, + "num_tokens": 775544638.0, + "step": 20326 + }, + { + "epoch": 2.585803332909299, + "ewc_loss": 0.07938064634799957, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004202712734695524, + "grad_norm": 9.189229011535645, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8660396933555603, + "num_tokens": 775584918.0, + "step": 20327 + }, + { + "epoch": 2.5859305431878896, + "ewc_loss": 0.08027951419353485, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004292599332984537, + "grad_norm": 9.400375366210938, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8603295683860779, + "num_tokens": 775624214.0, + "step": 20328 + }, + { + "epoch": 2.58605775346648, + "ewc_loss": 0.07947045564651489, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042116944678127766, + "grad_norm": 9.221858024597168, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8537272214889526, + "num_tokens": 775668372.0, + "step": 20329 + }, + { + "epoch": 2.5861849637450707, + "ewc_loss": 0.08035526424646378, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430017476901412, + "grad_norm": 9.471529006958008, + "learning_rate": 1e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8373085856437683, + "num_tokens": 775700578.0, + "step": 20330 + }, + { + "epoch": 2.586312174023661, + "ewc_loss": 0.07928676158189774, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004193324421066791, + "grad_norm": 9.162712097167969, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8727867603302002, + "num_tokens": 775746980.0, + "step": 20331 + }, + { + "epoch": 2.5864393843022517, + "ewc_loss": 0.080766960978508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043413450475782156, + "grad_norm": 9.476521492004395, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8589651584625244, + "num_tokens": 775780762.0, + "step": 20332 + }, + { + "epoch": 2.5865665945808423, + "ewc_loss": 0.07942076027393341, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041823109495453537, + "grad_norm": 9.196065902709961, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8709411025047302, + "num_tokens": 775815591.0, + "step": 20333 + }, + { + "epoch": 2.586693804859433, + "ewc_loss": 0.08072014898061752, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043366634054109454, + "grad_norm": 9.412792205810547, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8569990396499634, + "num_tokens": 775856414.0, + "step": 20334 + }, + { + "epoch": 2.5868210151380233, + "ewc_loss": 0.0796448141336441, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004229129699524492, + "grad_norm": 9.292338371276855, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8651665449142456, + "num_tokens": 775896939.0, + "step": 20335 + }, + { + "epoch": 2.586948225416614, + "ewc_loss": 0.08033342659473419, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004297990817576647, + "grad_norm": 9.368882179260254, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.859376847743988, + "num_tokens": 775941778.0, + "step": 20336 + }, + { + "epoch": 2.587075435695204, + "ewc_loss": 0.07996798306703568, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042614468839019537, + "grad_norm": 9.385568618774414, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8650096654891968, + "num_tokens": 775979346.0, + "step": 20337 + }, + { + "epoch": 2.587202645973795, + "ewc_loss": 0.08014412224292755, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004254646191839129, + "grad_norm": 9.400632858276367, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8635143041610718, + "num_tokens": 776013793.0, + "step": 20338 + }, + { + "epoch": 2.587329856252385, + "ewc_loss": 0.08011066913604736, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004251301579643041, + "grad_norm": 9.312801361083984, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8777916431427002, + "num_tokens": 776058513.0, + "step": 20339 + }, + { + "epoch": 2.587457066530976, + "ewc_loss": 0.0799979716539383, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004264445451553911, + "grad_norm": 9.333219528198242, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8661115169525146, + "num_tokens": 776102790.0, + "step": 20340 + }, + { + "epoch": 2.587584276809566, + "ewc_loss": 0.0801582932472229, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004256063257344067, + "grad_norm": 9.378446578979492, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8801149129867554, + "num_tokens": 776133898.0, + "step": 20341 + }, + { + "epoch": 2.5877114870881566, + "ewc_loss": 0.07974843680858612, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004239492118358612, + "grad_norm": 9.329363822937012, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8635703325271606, + "num_tokens": 776167590.0, + "step": 20342 + }, + { + "epoch": 2.587838697366747, + "ewc_loss": 0.0800558552145958, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004270233912393451, + "grad_norm": 9.311600685119629, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8591476082801819, + "num_tokens": 776207966.0, + "step": 20343 + }, + { + "epoch": 2.5879659076453376, + "ewc_loss": 0.07998060435056686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042627088259905577, + "grad_norm": 9.378812789916992, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8648114204406738, + "num_tokens": 776245601.0, + "step": 20344 + }, + { + "epoch": 2.588093117923928, + "ewc_loss": 0.07978077232837677, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424272584496066, + "grad_norm": 9.252657890319824, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.873722493648529, + "num_tokens": 776285377.0, + "step": 20345 + }, + { + "epoch": 2.5882203282025187, + "ewc_loss": 0.08031049370765686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042956980178132653, + "grad_norm": 9.368121147155762, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8572849035263062, + "num_tokens": 776328116.0, + "step": 20346 + }, + { + "epoch": 2.588347538481109, + "ewc_loss": 0.07972905039787292, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000423755293013528, + "grad_norm": 9.31709098815918, + "learning_rate": 1e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8489713668823242, + "num_tokens": 776369008.0, + "step": 20347 + }, + { + "epoch": 2.5884747487596997, + "ewc_loss": 0.08014903217554092, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042795517947524786, + "grad_norm": 9.33700942993164, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8555271625518799, + "num_tokens": 776410151.0, + "step": 20348 + }, + { + "epoch": 2.5886019590382903, + "ewc_loss": 0.07998640835285187, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004263289156369865, + "grad_norm": 9.301290512084961, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8891884088516235, + "num_tokens": 776445944.0, + "step": 20349 + }, + { + "epoch": 2.588729169316881, + "ewc_loss": 0.08008161187171936, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004272809310350567, + "grad_norm": 9.383993148803711, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8683537840843201, + "num_tokens": 776485388.0, + "step": 20350 + }, + { + "epoch": 2.5888563795954713, + "ewc_loss": 0.07981085032224655, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042457334348000586, + "grad_norm": 9.297898292541504, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8467139005661011, + "num_tokens": 776529135.0, + "step": 20351 + }, + { + "epoch": 2.588983589874062, + "ewc_loss": 0.08014550060033798, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004279198474250734, + "grad_norm": 9.37448787689209, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8575304746627808, + "num_tokens": 776564310.0, + "step": 20352 + }, + { + "epoch": 2.5891108001526524, + "ewc_loss": 0.07973489910364151, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004238138208165765, + "grad_norm": 9.25330638885498, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8789860606193542, + "num_tokens": 776609004.0, + "step": 20353 + }, + { + "epoch": 2.589238010431243, + "ewc_loss": 0.08020266890525818, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000428491533966735, + "grad_norm": 9.381434440612793, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8710591793060303, + "num_tokens": 776645796.0, + "step": 20354 + }, + { + "epoch": 2.5893652207098334, + "ewc_loss": 0.07981407642364502, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042460556142032146, + "grad_norm": 9.318734169006348, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8554054498672485, + "num_tokens": 776684248.0, + "step": 20355 + }, + { + "epoch": 2.589492430988424, + "ewc_loss": 0.08025990426540375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042906388989649713, + "grad_norm": 10.348013877868652, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.868476927280426, + "num_tokens": 776722684.0, + "step": 20356 + }, + { + "epoch": 2.5896196412670145, + "ewc_loss": 0.07858933508396149, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041235817479901016, + "grad_norm": 8.978255271911621, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8697094321250916, + "num_tokens": 776764635.0, + "step": 20357 + }, + { + "epoch": 2.589746851545605, + "ewc_loss": 0.08284758031368256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045494065852835774, + "grad_norm": 9.824676513671875, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8536971807479858, + "num_tokens": 776805683.0, + "step": 20358 + }, + { + "epoch": 2.5898740618241956, + "ewc_loss": 0.07859795540571213, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000412444380344823, + "grad_norm": 9.042952537536621, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8672482967376709, + "num_tokens": 776841229.0, + "step": 20359 + }, + { + "epoch": 2.5900012721027856, + "ewc_loss": 0.08295008540153503, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045596572454087436, + "grad_norm": 9.814091682434082, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8711241483688354, + "num_tokens": 776881319.0, + "step": 20360 + }, + { + "epoch": 2.5901284823813766, + "ewc_loss": 0.079256072640419, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004190255713183433, + "grad_norm": 9.122701644897461, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8704200983047485, + "num_tokens": 776919436.0, + "step": 20361 + }, + { + "epoch": 2.5902556926599667, + "ewc_loss": 0.08258172869682312, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004522821691352874, + "grad_norm": 9.80317211151123, + "learning_rate": 1e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.84767746925354, + "num_tokens": 776958644.0, + "step": 20362 + }, + { + "epoch": 2.5903829029385577, + "ewc_loss": 0.07961589097976685, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042262376518920064, + "grad_norm": 10.155986785888672, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.881960928440094, + "num_tokens": 776994509.0, + "step": 20363 + }, + { + "epoch": 2.5905101132171477, + "ewc_loss": 0.07977090775966644, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241738934069872, + "grad_norm": 9.245426177978516, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8712977170944214, + "num_tokens": 777037557.0, + "step": 20364 + }, + { + "epoch": 2.5906373234957387, + "ewc_loss": 0.08144740760326385, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044093895121477544, + "grad_norm": 9.59760570526123, + "learning_rate": 1e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8450320959091187, + "num_tokens": 777077237.0, + "step": 20365 + }, + { + "epoch": 2.590764533774329, + "ewc_loss": 0.07922323048114777, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004162557306699455, + "grad_norm": 9.216257095336914, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8717300891876221, + "num_tokens": 777120813.0, + "step": 20366 + }, + { + "epoch": 2.5908917440529193, + "ewc_loss": 0.08140347898006439, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004404995997902006, + "grad_norm": 9.592878341674805, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8528047800064087, + "num_tokens": 777155809.0, + "step": 20367 + }, + { + "epoch": 2.59101895433151, + "ewc_loss": 0.07927993685007095, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041926419362425804, + "grad_norm": 9.16896915435791, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8659088015556335, + "num_tokens": 777195868.0, + "step": 20368 + }, + { + "epoch": 2.5911461646101004, + "ewc_loss": 0.08120886981487274, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004385535721667111, + "grad_norm": 9.562634468078613, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8693660497665405, + "num_tokens": 777233406.0, + "step": 20369 + }, + { + "epoch": 2.591273374888691, + "ewc_loss": 0.08005985617637634, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042218054295517504, + "grad_norm": 9.188936233520508, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8662914037704468, + "num_tokens": 777272817.0, + "step": 20370 + }, + { + "epoch": 2.5914005851672814, + "ewc_loss": 0.0812433660030365, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043889848166145384, + "grad_norm": 9.584924697875977, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8597409129142761, + "num_tokens": 777312398.0, + "step": 20371 + }, + { + "epoch": 2.591527795445872, + "ewc_loss": 0.07951359450817108, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004216007946524769, + "grad_norm": 9.29000186920166, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8754408955574036, + "num_tokens": 777349375.0, + "step": 20372 + }, + { + "epoch": 2.5916550057244625, + "ewc_loss": 0.08101983368396759, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043666313285939395, + "grad_norm": 9.483859062194824, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8736147880554199, + "num_tokens": 777394806.0, + "step": 20373 + }, + { + "epoch": 2.591782216003053, + "ewc_loss": 0.07984648644924164, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424929748987779, + "grad_norm": 9.244488716125488, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8699297904968262, + "num_tokens": 777434350.0, + "step": 20374 + }, + { + "epoch": 2.5919094262816436, + "ewc_loss": 0.0810161679983139, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043418508721515536, + "grad_norm": 9.500836372375488, + "learning_rate": 1e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8399338722229004, + "num_tokens": 777469690.0, + "step": 20375 + }, + { + "epoch": 2.592036636560234, + "ewc_loss": 0.07992307841777802, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004256956744939089, + "grad_norm": 9.333641052246094, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8575707077980042, + "num_tokens": 777502095.0, + "step": 20376 + }, + { + "epoch": 2.5921638468388246, + "ewc_loss": 0.08067573606967926, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043322218698449433, + "grad_norm": 9.434248924255371, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8753711581230164, + "num_tokens": 777540387.0, + "step": 20377 + }, + { + "epoch": 2.592291057117415, + "ewc_loss": 0.07999174296855927, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042638223385438323, + "grad_norm": 9.254472732543945, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8724567890167236, + "num_tokens": 777581855.0, + "step": 20378 + }, + { + "epoch": 2.5924182673960057, + "ewc_loss": 0.08059203624725342, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043238516082055867, + "grad_norm": 9.367919921875, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8650676608085632, + "num_tokens": 777619852.0, + "step": 20379 + }, + { + "epoch": 2.592545477674596, + "ewc_loss": 0.08010625839233398, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042752738227136433, + "grad_norm": 9.307540893554688, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.85875004529953, + "num_tokens": 777662772.0, + "step": 20380 + }, + { + "epoch": 2.5926726879531867, + "ewc_loss": 0.0802411437034607, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004288762283977121, + "grad_norm": 9.32716178894043, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.848700225353241, + "num_tokens": 777701171.0, + "step": 20381 + }, + { + "epoch": 2.5927998982317773, + "ewc_loss": 0.08019236475229263, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004283884773030877, + "grad_norm": 9.329761505126953, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8504334092140198, + "num_tokens": 777738245.0, + "step": 20382 + }, + { + "epoch": 2.592927108510368, + "ewc_loss": 0.08024841547012329, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004289490170776844, + "grad_norm": 9.36784553527832, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8666461110115051, + "num_tokens": 777775732.0, + "step": 20383 + }, + { + "epoch": 2.5930543187889583, + "ewc_loss": 0.08010227978229523, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042748762643896043, + "grad_norm": 9.322601318359375, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8649682402610779, + "num_tokens": 777811077.0, + "step": 20384 + }, + { + "epoch": 2.5931815290675484, + "ewc_loss": 0.08019594848155975, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004284243332222104, + "grad_norm": 9.246253967285156, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8632996678352356, + "num_tokens": 777854107.0, + "step": 20385 + }, + { + "epoch": 2.5933087393461394, + "ewc_loss": 0.0804884284734726, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043134912266395986, + "grad_norm": 9.454371452331543, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8643161058425903, + "num_tokens": 777891950.0, + "step": 20386 + }, + { + "epoch": 2.5934359496247295, + "ewc_loss": 0.07976527512073517, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042411754839122295, + "grad_norm": 9.235389709472656, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8824071884155273, + "num_tokens": 777932638.0, + "step": 20387 + }, + { + "epoch": 2.5935631599033204, + "ewc_loss": 0.08088217675685883, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004352866380941123, + "grad_norm": 9.388330459594727, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8639915585517883, + "num_tokens": 777977733.0, + "step": 20388 + }, + { + "epoch": 2.5936903701819105, + "ewc_loss": 0.0801079198718071, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000425102625740692, + "grad_norm": 9.311996459960938, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8702244758605957, + "num_tokens": 778020451.0, + "step": 20389 + }, + { + "epoch": 2.5938175804605015, + "ewc_loss": 0.08063149452209473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004327797796577215, + "grad_norm": 9.329997062683105, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8703200817108154, + "num_tokens": 778059594.0, + "step": 20390 + }, + { + "epoch": 2.5939447907390916, + "ewc_loss": 0.08022722601890564, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004287371120881289, + "grad_norm": 9.336697578430176, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8698341846466064, + "num_tokens": 778099235.0, + "step": 20391 + }, + { + "epoch": 2.594072001017682, + "ewc_loss": 0.08051836490631104, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004316484846640378, + "grad_norm": 9.4137601852417, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8752254247665405, + "num_tokens": 778135430.0, + "step": 20392 + }, + { + "epoch": 2.5941992112962726, + "ewc_loss": 0.0802227258682251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004286921175662428, + "grad_norm": 9.280879020690918, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8568249344825745, + "num_tokens": 778175111.0, + "step": 20393 + }, + { + "epoch": 2.594326421574863, + "ewc_loss": 0.08077532052993774, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004342180327512324, + "grad_norm": 9.532136917114258, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8567969799041748, + "num_tokens": 778211954.0, + "step": 20394 + }, + { + "epoch": 2.5944536318534537, + "ewc_loss": 0.07976878434419632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241527058184147, + "grad_norm": 9.286537170410156, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8494982719421387, + "num_tokens": 778246580.0, + "step": 20395 + }, + { + "epoch": 2.594580842132044, + "ewc_loss": 0.08094412088394165, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004359060840215534, + "grad_norm": 9.464498519897461, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8705333471298218, + "num_tokens": 778288659.0, + "step": 20396 + }, + { + "epoch": 2.5947080524106347, + "ewc_loss": 0.07978503406047821, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004243152216076851, + "grad_norm": 9.318730354309082, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.864232063293457, + "num_tokens": 778325365.0, + "step": 20397 + }, + { + "epoch": 2.5948352626892253, + "ewc_loss": 0.08065421879291534, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043300699326209724, + "grad_norm": 9.423471450805664, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.872234582901001, + "num_tokens": 778362982.0, + "step": 20398 + }, + { + "epoch": 2.594962472967816, + "ewc_loss": 0.07988505810499191, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042531543294899166, + "grad_norm": 9.365373611450195, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8497201204299927, + "num_tokens": 778394230.0, + "step": 20399 + }, + { + "epoch": 2.5950896832464063, + "ewc_loss": 0.08033466339111328, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042981147998943925, + "grad_norm": 9.433284759521484, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8539116382598877, + "num_tokens": 778434345.0, + "step": 20400 + }, + { + "epoch": 2.595216893524997, + "ewc_loss": 0.07991290092468262, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004255938110873103, + "grad_norm": 9.323498725891113, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.859123706817627, + "num_tokens": 778478303.0, + "step": 20401 + }, + { + "epoch": 2.5953441038035874, + "ewc_loss": 0.08022311329841614, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004286959592718631, + "grad_norm": 9.390934944152832, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8710421919822693, + "num_tokens": 778513915.0, + "step": 20402 + }, + { + "epoch": 2.595471314082178, + "ewc_loss": 0.07993873953819275, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004258521948941052, + "grad_norm": 9.332432746887207, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8785609602928162, + "num_tokens": 778554294.0, + "step": 20403 + }, + { + "epoch": 2.5955985243607684, + "ewc_loss": 0.08028052002191544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004292700323276222, + "grad_norm": 9.427522659301758, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8627043962478638, + "num_tokens": 778592836.0, + "step": 20404 + }, + { + "epoch": 2.595725734639359, + "ewc_loss": 0.08038066327571869, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042538862908259034, + "grad_norm": 9.380570411682129, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.856698751449585, + "num_tokens": 778629316.0, + "step": 20405 + }, + { + "epoch": 2.5958529449179495, + "ewc_loss": 0.08057041466236115, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004297276318538934, + "grad_norm": 9.547987937927246, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8603390455245972, + "num_tokens": 778659065.0, + "step": 20406 + }, + { + "epoch": 2.59598015519654, + "ewc_loss": 0.07956494390964508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000422114331740886, + "grad_norm": 9.247061729431152, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8811996579170227, + "num_tokens": 778693949.0, + "step": 20407 + }, + { + "epoch": 2.5961073654751305, + "ewc_loss": 0.08067281544208527, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043319296673871577, + "grad_norm": 9.47003173828125, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8698026537895203, + "num_tokens": 778738045.0, + "step": 20408 + }, + { + "epoch": 2.596234575753721, + "ewc_loss": 0.07970724999904633, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004235373926348984, + "grad_norm": 9.387657165527344, + "learning_rate": 1e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.8204752206802368, + "num_tokens": 778769578.0, + "step": 20409 + }, + { + "epoch": 2.596361786032311, + "ewc_loss": 0.08023640513420105, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042882884736172855, + "grad_norm": 9.375370979309082, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8612755537033081, + "num_tokens": 778810052.0, + "step": 20410 + }, + { + "epoch": 2.596488996310902, + "ewc_loss": 0.08004289865493774, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004268938791938126, + "grad_norm": 9.402441024780273, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.858932614326477, + "num_tokens": 778850868.0, + "step": 20411 + }, + { + "epoch": 2.596616206589492, + "ewc_loss": 0.08006635308265686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004271284269634634, + "grad_norm": 9.3632230758667, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8718858957290649, + "num_tokens": 778890129.0, + "step": 20412 + }, + { + "epoch": 2.596743416868083, + "ewc_loss": 0.08030258119106293, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004294906393624842, + "grad_norm": 9.417462348937988, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8700203895568848, + "num_tokens": 778923181.0, + "step": 20413 + }, + { + "epoch": 2.5968706271466733, + "ewc_loss": 0.07992269843816757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004256918327882886, + "grad_norm": 9.368487358093262, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8571906089782715, + "num_tokens": 778963328.0, + "step": 20414 + }, + { + "epoch": 2.596997837425264, + "ewc_loss": 0.08016470819711685, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042811193270608783, + "grad_norm": 9.374216079711914, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8757063150405884, + "num_tokens": 779003932.0, + "step": 20415 + }, + { + "epoch": 2.5971250477038543, + "ewc_loss": 0.0800754725933075, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042721955105662346, + "grad_norm": 9.425612449645996, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8548474907875061, + "num_tokens": 779043913.0, + "step": 20416 + }, + { + "epoch": 2.597252257982445, + "ewc_loss": 0.08005985617637634, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004270633799023926, + "grad_norm": 9.381180763244629, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8610785007476807, + "num_tokens": 779077126.0, + "step": 20417 + }, + { + "epoch": 2.5973794682610354, + "ewc_loss": 0.08003905415534973, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004268553457222879, + "grad_norm": 9.339311599731445, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8689008951187134, + "num_tokens": 779111544.0, + "step": 20418 + }, + { + "epoch": 2.597506678539626, + "ewc_loss": 0.08004210144281387, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042688584653660655, + "grad_norm": 9.401141166687012, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8644781112670898, + "num_tokens": 779145493.0, + "step": 20419 + }, + { + "epoch": 2.5976338888182164, + "ewc_loss": 0.08023114502429962, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004263349110260606, + "grad_norm": 9.305686950683594, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8568201065063477, + "num_tokens": 779189216.0, + "step": 20420 + }, + { + "epoch": 2.597761099096807, + "ewc_loss": 0.08024346828460693, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042889954056590796, + "grad_norm": 9.455299377441406, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8714140057563782, + "num_tokens": 779225914.0, + "step": 20421 + }, + { + "epoch": 2.5978883093753975, + "ewc_loss": 0.07967569679021835, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042322181980125606, + "grad_norm": 9.267003059387207, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8699639439582825, + "num_tokens": 779264118.0, + "step": 20422 + }, + { + "epoch": 2.598015519653988, + "ewc_loss": 0.08047725260257721, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043123739305883646, + "grad_norm": 9.527587890625, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8683107495307922, + "num_tokens": 779301694.0, + "step": 20423 + }, + { + "epoch": 2.5981427299325786, + "ewc_loss": 0.07936206459999084, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042008544551208615, + "grad_norm": 9.260870933532715, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.863149881362915, + "num_tokens": 779339012.0, + "step": 20424 + }, + { + "epoch": 2.598269940211169, + "ewc_loss": 0.08054962754249573, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004319611471146345, + "grad_norm": 9.513564109802246, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8689666390419006, + "num_tokens": 779376358.0, + "step": 20425 + }, + { + "epoch": 2.5983971504897596, + "ewc_loss": 0.07939121127128601, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004203770076856017, + "grad_norm": 9.228599548339844, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8634684085845947, + "num_tokens": 779416344.0, + "step": 20426 + }, + { + "epoch": 2.59852436076835, + "ewc_loss": 0.08072387427091599, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043370359344407916, + "grad_norm": 9.61154842376709, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8523240685462952, + "num_tokens": 779452189.0, + "step": 20427 + }, + { + "epoch": 2.5986515710469407, + "ewc_loss": 0.07916450500488281, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004181099066045135, + "grad_norm": 9.182011604309082, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8641374111175537, + "num_tokens": 779493513.0, + "step": 20428 + }, + { + "epoch": 2.598778781325531, + "ewc_loss": 0.08098405599594116, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004363053885754198, + "grad_norm": 9.561014175415039, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8700318336486816, + "num_tokens": 779533061.0, + "step": 20429 + }, + { + "epoch": 2.5989059916041217, + "ewc_loss": 0.07914436608552933, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041790850809775293, + "grad_norm": 9.15386962890625, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8585059642791748, + "num_tokens": 779570159.0, + "step": 20430 + }, + { + "epoch": 2.5990332018827123, + "ewc_loss": 0.08105890452861786, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043705388088710606, + "grad_norm": 9.544075965881348, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8622033596038818, + "num_tokens": 779609066.0, + "step": 20431 + }, + { + "epoch": 2.599160412161303, + "ewc_loss": 0.07922635972499847, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004187283921055496, + "grad_norm": 9.198994636535645, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8649701476097107, + "num_tokens": 779651087.0, + "step": 20432 + }, + { + "epoch": 2.5992876224398933, + "ewc_loss": 0.08101354539394379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043660029768943787, + "grad_norm": 9.59180736541748, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8776302337646484, + "num_tokens": 779689531.0, + "step": 20433 + }, + { + "epoch": 2.599414832718484, + "ewc_loss": 0.07937301695346832, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004201949923299253, + "grad_norm": 9.234108924865723, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8596479892730713, + "num_tokens": 779724062.0, + "step": 20434 + }, + { + "epoch": 2.599542042997074, + "ewc_loss": 0.08095937967300415, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043605861719697714, + "grad_norm": 9.517433166503906, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.851597249507904, + "num_tokens": 779766029.0, + "step": 20435 + }, + { + "epoch": 2.599669253275665, + "ewc_loss": 0.07948939502239227, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004213588254060596, + "grad_norm": 9.22667407989502, + "learning_rate": 1e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8482500910758972, + "num_tokens": 779811065.0, + "step": 20436 + }, + { + "epoch": 2.599796463554255, + "ewc_loss": 0.08095481246709824, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000436012982390821, + "grad_norm": 9.521328926086426, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.851252555847168, + "num_tokens": 779852442.0, + "step": 20437 + }, + { + "epoch": 2.599923673832846, + "ewc_loss": 0.07953210175037384, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004217858368065208, + "grad_norm": 9.247696876525879, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8687621355056763, + "num_tokens": 779885993.0, + "step": 20438 + }, + { + "epoch": 2.600050884111436, + "ewc_loss": 0.08089721202850342, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043543693027459085, + "grad_norm": 9.536066055297852, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8684086203575134, + "num_tokens": 779927791.0, + "step": 20439 + }, + { + "epoch": 2.6001780943900266, + "ewc_loss": 0.07953938841819763, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000421858683694154, + "grad_norm": 9.261514663696289, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8671435117721558, + "num_tokens": 779961517.0, + "step": 20440 + }, + { + "epoch": 2.600305304668617, + "ewc_loss": 0.08093449473381042, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043580977944657207, + "grad_norm": 9.561296463012695, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8525225520133972, + "num_tokens": 779992902.0, + "step": 20441 + }, + { + "epoch": 2.6004325149472076, + "ewc_loss": 0.07956091314554214, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042207399383187294, + "grad_norm": 9.273000717163086, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8652533888816833, + "num_tokens": 780032849.0, + "step": 20442 + }, + { + "epoch": 2.600559725225798, + "ewc_loss": 0.08075980842113495, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000434062909334898, + "grad_norm": 9.506779670715332, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8760470151901245, + "num_tokens": 780072563.0, + "step": 20443 + }, + { + "epoch": 2.6006869355043887, + "ewc_loss": 0.07960143685340881, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004224792355671525, + "grad_norm": 9.240989685058594, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8651326894760132, + "num_tokens": 780115299.0, + "step": 20444 + }, + { + "epoch": 2.600814145782979, + "ewc_loss": 0.08076588809490204, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000434123765444383, + "grad_norm": 9.528761863708496, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8632221817970276, + "num_tokens": 780155792.0, + "step": 20445 + }, + { + "epoch": 2.6009413560615697, + "ewc_loss": 0.0795152485370636, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004216172965243459, + "grad_norm": 9.27611255645752, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8596915602684021, + "num_tokens": 780189040.0, + "step": 20446 + }, + { + "epoch": 2.6010685663401603, + "ewc_loss": 0.08043816685676575, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430846557719633, + "grad_norm": 9.400943756103516, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.863537073135376, + "num_tokens": 780229463.0, + "step": 20447 + }, + { + "epoch": 2.601195776618751, + "ewc_loss": 0.07971328496932983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004235976666677743, + "grad_norm": 9.280200958251953, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8762705326080322, + "num_tokens": 780265961.0, + "step": 20448 + }, + { + "epoch": 2.6013229868973413, + "ewc_loss": 0.08054281771183014, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043189298594370484, + "grad_norm": 9.432711601257324, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8591049909591675, + "num_tokens": 780303220.0, + "step": 20449 + }, + { + "epoch": 2.601450197175932, + "ewc_loss": 0.07976433634757996, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241082351654768, + "grad_norm": 9.245956420898438, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8624112010002136, + "num_tokens": 780340456.0, + "step": 20450 + }, + { + "epoch": 2.6015774074545224, + "ewc_loss": 0.08051596581935883, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000431624474003911, + "grad_norm": 9.449945449829102, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8620780110359192, + "num_tokens": 780381402.0, + "step": 20451 + }, + { + "epoch": 2.601704617733113, + "ewc_loss": 0.07974282652139664, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004238930996507406, + "grad_norm": 9.252681732177734, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8689136505126953, + "num_tokens": 780422571.0, + "step": 20452 + }, + { + "epoch": 2.6018318280117034, + "ewc_loss": 0.08057142794132233, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043217913480475545, + "grad_norm": 9.427488327026367, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8602404594421387, + "num_tokens": 780462447.0, + "step": 20453 + }, + { + "epoch": 2.601959038290294, + "ewc_loss": 0.07977302372455597, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042419505189172924, + "grad_norm": 9.297700881958008, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8644861578941345, + "num_tokens": 780499183.0, + "step": 20454 + }, + { + "epoch": 2.6020862485688845, + "ewc_loss": 0.08080857992172241, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004321091983001679, + "grad_norm": 9.45315933227539, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.851478099822998, + "num_tokens": 780539216.0, + "step": 20455 + }, + { + "epoch": 2.602213458847475, + "ewc_loss": 0.07976803183555603, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241451097186655, + "grad_norm": 9.335651397705078, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8718936443328857, + "num_tokens": 780578725.0, + "step": 20456 + }, + { + "epoch": 2.6023406691260655, + "ewc_loss": 0.08041618764400482, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043062667828053236, + "grad_norm": 9.44085693359375, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8658010959625244, + "num_tokens": 780615175.0, + "step": 20457 + }, + { + "epoch": 2.6024678794046556, + "ewc_loss": 0.07977497577667236, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042421455145813525, + "grad_norm": 9.326258659362793, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8668378591537476, + "num_tokens": 780651186.0, + "step": 20458 + }, + { + "epoch": 2.6025950896832466, + "ewc_loss": 0.08025703579187393, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290351935196668, + "grad_norm": 9.466800689697266, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8577215671539307, + "num_tokens": 780685550.0, + "step": 20459 + }, + { + "epoch": 2.6027222999618367, + "ewc_loss": 0.07975110411643982, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004239759291522205, + "grad_norm": 9.296308517456055, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8864465355873108, + "num_tokens": 780725477.0, + "step": 20460 + }, + { + "epoch": 2.6028495102404277, + "ewc_loss": 0.08023340255022049, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004287988704163581, + "grad_norm": 9.406811714172363, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8678039312362671, + "num_tokens": 780762481.0, + "step": 20461 + }, + { + "epoch": 2.6029767205190177, + "ewc_loss": 0.07965721189975739, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042303692316636443, + "grad_norm": 9.342037200927734, + "learning_rate": 1e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8449022173881531, + "num_tokens": 780800305.0, + "step": 20462 + }, + { + "epoch": 2.6031039307976087, + "ewc_loss": 0.08014487475156784, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004279135901015252, + "grad_norm": 9.492415428161621, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8663439750671387, + "num_tokens": 780836271.0, + "step": 20463 + }, + { + "epoch": 2.603231141076199, + "ewc_loss": 0.07955216616392136, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042198650771752, + "grad_norm": 9.214484214782715, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8764910697937012, + "num_tokens": 780877363.0, + "step": 20464 + }, + { + "epoch": 2.6033583513547893, + "ewc_loss": 0.08046445995569229, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043110945262014866, + "grad_norm": 9.439688682556152, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8516475558280945, + "num_tokens": 780922982.0, + "step": 20465 + }, + { + "epoch": 2.60348556163338, + "ewc_loss": 0.07954779267311096, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042194273555651307, + "grad_norm": 9.305313110351562, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8617458343505859, + "num_tokens": 780957925.0, + "step": 20466 + }, + { + "epoch": 2.6036127719119704, + "ewc_loss": 0.08037306368350983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043019544682465494, + "grad_norm": 9.4130220413208, + "learning_rate": 1e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8453140258789062, + "num_tokens": 780996922.0, + "step": 20467 + }, + { + "epoch": 2.603739982190561, + "ewc_loss": 0.07970768958330154, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004235417291056365, + "grad_norm": 9.34659481048584, + "learning_rate": 1e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8474500179290771, + "num_tokens": 781037209.0, + "step": 20468 + }, + { + "epoch": 2.6038671924691514, + "ewc_loss": 0.08015257120132446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004279905988369137, + "grad_norm": 9.493439674377441, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8574002385139465, + "num_tokens": 781078077.0, + "step": 20469 + }, + { + "epoch": 2.603994402747742, + "ewc_loss": 0.07968585938215256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000423323450377211, + "grad_norm": 9.341225624084473, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8635165095329285, + "num_tokens": 781122516.0, + "step": 20470 + }, + { + "epoch": 2.6041216130263325, + "ewc_loss": 0.08008486032485962, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042731346911750734, + "grad_norm": 9.469745635986328, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8579988479614258, + "num_tokens": 781156290.0, + "step": 20471 + }, + { + "epoch": 2.604248823304923, + "ewc_loss": 0.07954612374305725, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004219261172693223, + "grad_norm": 9.317017555236816, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8711137175559998, + "num_tokens": 781192769.0, + "step": 20472 + }, + { + "epoch": 2.6043760335835135, + "ewc_loss": 0.08019102364778519, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004283750895410776, + "grad_norm": 9.433382034301758, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8694201707839966, + "num_tokens": 781227594.0, + "step": 20473 + }, + { + "epoch": 2.604503243862104, + "ewc_loss": 0.07957523316144943, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042221718467772007, + "grad_norm": 9.333709716796875, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8749815225601196, + "num_tokens": 781259369.0, + "step": 20474 + }, + { + "epoch": 2.6046304541406946, + "ewc_loss": 0.08003073930740356, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004267722542863339, + "grad_norm": 9.392495155334473, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.862604022026062, + "num_tokens": 781296788.0, + "step": 20475 + }, + { + "epoch": 2.604757664419285, + "ewc_loss": 0.07954350113868713, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042189989471808076, + "grad_norm": 9.287827491760254, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8670929670333862, + "num_tokens": 781333103.0, + "step": 20476 + }, + { + "epoch": 2.6048848746978757, + "ewc_loss": 0.08012427389621735, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277075349818915, + "grad_norm": 9.306281089782715, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8737427592277527, + "num_tokens": 781372691.0, + "step": 20477 + }, + { + "epoch": 2.605012084976466, + "ewc_loss": 0.07986191660165787, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042508402839303017, + "grad_norm": 9.361465454101562, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8589609265327454, + "num_tokens": 781410480.0, + "step": 20478 + }, + { + "epoch": 2.6051392952550567, + "ewc_loss": 0.07976636290550232, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241284914314747, + "grad_norm": 9.277493476867676, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8565547466278076, + "num_tokens": 781454110.0, + "step": 20479 + }, + { + "epoch": 2.6052665055336472, + "ewc_loss": 0.08004505932331085, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004269154160283506, + "grad_norm": 9.415925025939941, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8618154525756836, + "num_tokens": 781494034.0, + "step": 20480 + }, + { + "epoch": 2.6053937158122378, + "ewc_loss": 0.07967203110456467, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004231851489748806, + "grad_norm": 9.266032218933105, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8622258305549622, + "num_tokens": 781536225.0, + "step": 20481 + }, + { + "epoch": 2.6055209260908283, + "ewc_loss": 0.08025793731212616, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290442157071084, + "grad_norm": 9.414044380187988, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8700524568557739, + "num_tokens": 781573987.0, + "step": 20482 + }, + { + "epoch": 2.6056481363694184, + "ewc_loss": 0.07946280390024185, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000421092874603346, + "grad_norm": 9.265408515930176, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8683075308799744, + "num_tokens": 781608987.0, + "step": 20483 + }, + { + "epoch": 2.6057753466480094, + "ewc_loss": 0.08025085926055908, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042897346429526806, + "grad_norm": 9.411261558532715, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8702753186225891, + "num_tokens": 781646185.0, + "step": 20484 + }, + { + "epoch": 2.6059025569265994, + "ewc_loss": 0.079546257853508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042192742694169283, + "grad_norm": 9.240877151489258, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8531124591827393, + "num_tokens": 781682999.0, + "step": 20485 + }, + { + "epoch": 2.6060297672051904, + "ewc_loss": 0.08038395643234253, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004303043824620545, + "grad_norm": 9.40211296081543, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8621892333030701, + "num_tokens": 781726052.0, + "step": 20486 + }, + { + "epoch": 2.6061569774837805, + "ewc_loss": 0.07960088551044464, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004224736476317048, + "grad_norm": 9.309337615966797, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8778825402259827, + "num_tokens": 781760562.0, + "step": 20487 + }, + { + "epoch": 2.6062841877623715, + "ewc_loss": 0.08020412921905518, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042850608588196337, + "grad_norm": 9.408628463745117, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8520766496658325, + "num_tokens": 781803765.0, + "step": 20488 + }, + { + "epoch": 2.6064113980409616, + "ewc_loss": 0.07967743277549744, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004232391365803778, + "grad_norm": 9.266519546508789, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8644325733184814, + "num_tokens": 781845419.0, + "step": 20489 + }, + { + "epoch": 2.606538608319552, + "ewc_loss": 0.08027689158916473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042923379805870354, + "grad_norm": 9.399293899536133, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8731141686439514, + "num_tokens": 781883371.0, + "step": 20490 + }, + { + "epoch": 2.6066658185981426, + "ewc_loss": 0.07949948310852051, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004214597283862531, + "grad_norm": 9.263495445251465, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8621928095817566, + "num_tokens": 781919669.0, + "step": 20491 + }, + { + "epoch": 2.606793028876733, + "ewc_loss": 0.08029884099960327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042945327004417777, + "grad_norm": 9.405501365661621, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8699923753738403, + "num_tokens": 781953212.0, + "step": 20492 + }, + { + "epoch": 2.6069202391553237, + "ewc_loss": 0.07964316010475159, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004228964098729193, + "grad_norm": 9.287101745605469, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8680651187896729, + "num_tokens": 781996824.0, + "step": 20493 + }, + { + "epoch": 2.607047449433914, + "ewc_loss": 0.08018293976783752, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042829426820389926, + "grad_norm": 9.334274291992188, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8837019205093384, + "num_tokens": 782035350.0, + "step": 20494 + }, + { + "epoch": 2.6071746597125047, + "ewc_loss": 0.07989189028739929, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004253837396390736, + "grad_norm": 9.37739372253418, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.857479453086853, + "num_tokens": 782073217.0, + "step": 20495 + }, + { + "epoch": 2.6073018699910953, + "ewc_loss": 0.07989679276943207, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004254327795933932, + "grad_norm": 9.344172477722168, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.857719361782074, + "num_tokens": 782110777.0, + "step": 20496 + }, + { + "epoch": 2.607429080269686, + "ewc_loss": 0.07988637685775757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004253285878803581, + "grad_norm": 9.312569618225098, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8579188585281372, + "num_tokens": 782153300.0, + "step": 20497 + }, + { + "epoch": 2.6075562905482763, + "ewc_loss": 0.08014370501041412, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004279019485693425, + "grad_norm": 9.396559715270996, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8693479895591736, + "num_tokens": 782191945.0, + "step": 20498 + }, + { + "epoch": 2.607683500826867, + "ewc_loss": 0.07983379811048508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004248028271831572, + "grad_norm": 9.304998397827148, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8748254776000977, + "num_tokens": 782230472.0, + "step": 20499 + }, + { + "epoch": 2.6078107111054574, + "ewc_loss": 0.08032631874084473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042972801020368934, + "grad_norm": 9.471782684326172, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8550065755844116, + "num_tokens": 782263868.0, + "step": 20500 + }, + { + "epoch": 2.607937921384048, + "ewc_loss": 0.07987351715564728, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004227586614433676, + "grad_norm": 9.303182601928711, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8673315644264221, + "num_tokens": 782299781.0, + "step": 20501 + }, + { + "epoch": 2.6080651316626384, + "ewc_loss": 0.08030514419078827, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004295163380447775, + "grad_norm": 9.41091251373291, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8571709394454956, + "num_tokens": 782343376.0, + "step": 20502 + }, + { + "epoch": 2.608192341941229, + "ewc_loss": 0.07995614409446716, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042602623580023646, + "grad_norm": 9.278807640075684, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8766268491744995, + "num_tokens": 782389894.0, + "step": 20503 + }, + { + "epoch": 2.6083195522198195, + "ewc_loss": 0.0802663266658783, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004291280929464847, + "grad_norm": 9.418310165405273, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.866621732711792, + "num_tokens": 782430055.0, + "step": 20504 + }, + { + "epoch": 2.60844676249841, + "ewc_loss": 0.07987245917320251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004251894715707749, + "grad_norm": 9.340791702270508, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8646632432937622, + "num_tokens": 782467211.0, + "step": 20505 + }, + { + "epoch": 2.6085739727770005, + "ewc_loss": 0.0803002119064331, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000429466919740662, + "grad_norm": 9.44586181640625, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8559077978134155, + "num_tokens": 782505913.0, + "step": 20506 + }, + { + "epoch": 2.608701183055591, + "ewc_loss": 0.07996752113103867, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042614006088115275, + "grad_norm": 9.376336097717285, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8646606206893921, + "num_tokens": 782541858.0, + "step": 20507 + }, + { + "epoch": 2.608828393334181, + "ewc_loss": 0.08036395907402039, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004276630061212927, + "grad_norm": 9.414478302001953, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8634886741638184, + "num_tokens": 782575854.0, + "step": 20508 + }, + { + "epoch": 2.608955603612772, + "ewc_loss": 0.08007729053497314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042723779915831983, + "grad_norm": 9.375457763671875, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8669230937957764, + "num_tokens": 782614005.0, + "step": 20509 + }, + { + "epoch": 2.609082813891362, + "ewc_loss": 0.08044995367527008, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004285229369997978, + "grad_norm": 9.426462173461914, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.873618483543396, + "num_tokens": 782650147.0, + "step": 20510 + }, + { + "epoch": 2.609210024169953, + "ewc_loss": 0.07992735505104065, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004257384280208498, + "grad_norm": 9.3673734664917, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8650244474411011, + "num_tokens": 782683215.0, + "step": 20511 + }, + { + "epoch": 2.6093372344485433, + "ewc_loss": 0.08053694665431976, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004293929086998105, + "grad_norm": 9.47209644317627, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8700540065765381, + "num_tokens": 782721184.0, + "step": 20512 + }, + { + "epoch": 2.609464444727134, + "ewc_loss": 0.08001808822154999, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004242043651174754, + "grad_norm": 9.413158416748047, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8588762283325195, + "num_tokens": 782759891.0, + "step": 20513 + }, + { + "epoch": 2.6095916550057243, + "ewc_loss": 0.08006609231233597, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004271257494110614, + "grad_norm": 9.43114185333252, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8540414571762085, + "num_tokens": 782796398.0, + "step": 20514 + }, + { + "epoch": 2.609718865284315, + "ewc_loss": 0.07973042130470276, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042376903002150357, + "grad_norm": 9.302404403686523, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8742247819900513, + "num_tokens": 782835968.0, + "step": 20515 + }, + { + "epoch": 2.6098460755629054, + "ewc_loss": 0.080299973487854, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042946459143422544, + "grad_norm": 9.43706226348877, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8742180466651917, + "num_tokens": 782871367.0, + "step": 20516 + }, + { + "epoch": 2.609973285841496, + "ewc_loss": 0.07960346341133118, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004224994336254895, + "grad_norm": 9.337984085083008, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8864008188247681, + "num_tokens": 782905766.0, + "step": 20517 + }, + { + "epoch": 2.6101004961200864, + "ewc_loss": 0.08013186603784561, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277834959793836, + "grad_norm": 9.427631378173828, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8699694275856018, + "num_tokens": 782942256.0, + "step": 20518 + }, + { + "epoch": 2.610227706398677, + "ewc_loss": 0.07958317548036575, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042229657992720604, + "grad_norm": 9.336153984069824, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8757851123809814, + "num_tokens": 782976156.0, + "step": 20519 + }, + { + "epoch": 2.6103549166772675, + "ewc_loss": 0.08018842339515686, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004283491289243102, + "grad_norm": 9.428257942199707, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.873924732208252, + "num_tokens": 783011450.0, + "step": 20520 + }, + { + "epoch": 2.610482126955858, + "ewc_loss": 0.07969363033771515, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004234011285007, + "grad_norm": 9.255444526672363, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8701386451721191, + "num_tokens": 783050355.0, + "step": 20521 + }, + { + "epoch": 2.6106093372344485, + "ewc_loss": 0.08021348714828491, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042859968380071223, + "grad_norm": 9.44820785522461, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.874625563621521, + "num_tokens": 783091621.0, + "step": 20522 + }, + { + "epoch": 2.610736547513039, + "ewc_loss": 0.07958746701478958, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004223395080771297, + "grad_norm": 9.295363426208496, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8550114035606384, + "num_tokens": 783130471.0, + "step": 20523 + }, + { + "epoch": 2.6108637577916296, + "ewc_loss": 0.080414779484272, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043061262113042176, + "grad_norm": 9.445503234863281, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8682029247283936, + "num_tokens": 783168340.0, + "step": 20524 + }, + { + "epoch": 2.61099096807022, + "ewc_loss": 0.07954692840576172, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042193414992652833, + "grad_norm": 9.327593803405762, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8452562689781189, + "num_tokens": 783207279.0, + "step": 20525 + }, + { + "epoch": 2.6111181783488107, + "ewc_loss": 0.08039584755897522, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043042327160947025, + "grad_norm": 9.428864479064941, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8574023246765137, + "num_tokens": 783243010.0, + "step": 20526 + }, + { + "epoch": 2.611245388627401, + "ewc_loss": 0.07962433248758316, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042270816629752517, + "grad_norm": 9.305331230163574, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8557325601577759, + "num_tokens": 783287548.0, + "step": 20527 + }, + { + "epoch": 2.6113725989059917, + "ewc_loss": 0.08055143803358078, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004295378166716546, + "grad_norm": 9.466866493225098, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8614280819892883, + "num_tokens": 783331549.0, + "step": 20528 + }, + { + "epoch": 2.6114998091845822, + "ewc_loss": 0.07959005236625671, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004223654104862362, + "grad_norm": 9.23177719116211, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8750359416007996, + "num_tokens": 783369445.0, + "step": 20529 + }, + { + "epoch": 2.6116270194631728, + "ewc_loss": 0.08063535392284393, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004328183422330767, + "grad_norm": 9.544879913330078, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.86033695936203, + "num_tokens": 783413218.0, + "step": 20530 + }, + { + "epoch": 2.6117542297417633, + "ewc_loss": 0.07943576574325562, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042082250001840293, + "grad_norm": 9.214380264282227, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8641970157623291, + "num_tokens": 783449944.0, + "step": 20531 + }, + { + "epoch": 2.611881440020354, + "ewc_loss": 0.08093656599521637, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004358304722700268, + "grad_norm": 9.576706886291504, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8650375604629517, + "num_tokens": 783486142.0, + "step": 20532 + }, + { + "epoch": 2.612008650298944, + "ewc_loss": 0.07933860272169113, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004198508686386049, + "grad_norm": 9.252947807312012, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8718100786209106, + "num_tokens": 783521170.0, + "step": 20533 + }, + { + "epoch": 2.612135860577535, + "ewc_loss": 0.08099409937858582, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004364058841019869, + "grad_norm": 9.543585777282715, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8627252578735352, + "num_tokens": 783563902.0, + "step": 20534 + }, + { + "epoch": 2.612263070856125, + "ewc_loss": 0.0794752985239029, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004212178464513272, + "grad_norm": 9.282532691955566, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8702210187911987, + "num_tokens": 783597655.0, + "step": 20535 + }, + { + "epoch": 2.612390281134716, + "ewc_loss": 0.08133768290281296, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00043495887075550854, + "grad_norm": 9.57534408569336, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8543745279312134, + "num_tokens": 783635233.0, + "step": 20536 + }, + { + "epoch": 2.612517491413306, + "ewc_loss": 0.0795208215713501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042167308856733143, + "grad_norm": 9.361452102661133, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8654932975769043, + "num_tokens": 783679677.0, + "step": 20537 + }, + { + "epoch": 2.6126447016918966, + "ewc_loss": 0.08041854202747345, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004306502523832023, + "grad_norm": 9.46071720123291, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8703292012214661, + "num_tokens": 783720608.0, + "step": 20538 + }, + { + "epoch": 2.612771911970487, + "ewc_loss": 0.07970037311315536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004234685911796987, + "grad_norm": 9.298669815063477, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8741489052772522, + "num_tokens": 783753198.0, + "step": 20539 + }, + { + "epoch": 2.6128991222490776, + "ewc_loss": 0.08038485795259476, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004303134046494961, + "grad_norm": 9.429253578186035, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8639755249023438, + "num_tokens": 783791068.0, + "step": 20540 + }, + { + "epoch": 2.613026332527668, + "ewc_loss": 0.07975175976753235, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042398241930641234, + "grad_norm": 9.36590576171875, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8659718036651611, + "num_tokens": 783831319.0, + "step": 20541 + }, + { + "epoch": 2.6131535428062587, + "ewc_loss": 0.08015529811382294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004280177818145603, + "grad_norm": 9.385004997253418, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8692929744720459, + "num_tokens": 783867537.0, + "step": 20542 + }, + { + "epoch": 2.613280753084849, + "ewc_loss": 0.07987361401319504, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004252009966876358, + "grad_norm": 9.329294204711914, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8581719994544983, + "num_tokens": 783907787.0, + "step": 20543 + }, + { + "epoch": 2.6134079633634397, + "ewc_loss": 0.0800270065665245, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042673491407185793, + "grad_norm": 9.411988258361816, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.858669102191925, + "num_tokens": 783948558.0, + "step": 20544 + }, + { + "epoch": 2.6135351736420303, + "ewc_loss": 0.07976660132408142, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004241308197379112, + "grad_norm": 9.268707275390625, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.883125901222229, + "num_tokens": 783986726.0, + "step": 20545 + }, + { + "epoch": 2.613662383920621, + "ewc_loss": 0.08026141673326492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290789947845042, + "grad_norm": 9.540189743041992, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8458642363548279, + "num_tokens": 784023449.0, + "step": 20546 + }, + { + "epoch": 2.6137895941992113, + "ewc_loss": 0.07932540029287338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041971885366365314, + "grad_norm": 9.26524543762207, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8819186687469482, + "num_tokens": 784059529.0, + "step": 20547 + }, + { + "epoch": 2.613916804477802, + "ewc_loss": 0.08048608899116516, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043132572318427265, + "grad_norm": 9.423924446105957, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.867317795753479, + "num_tokens": 784103530.0, + "step": 20548 + }, + { + "epoch": 2.6140440147563924, + "ewc_loss": 0.07954148948192596, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042187972576357424, + "grad_norm": 9.282573699951172, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8579311966896057, + "num_tokens": 784141721.0, + "step": 20549 + }, + { + "epoch": 2.614171225034983, + "ewc_loss": 0.08052121102809906, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043167691910639405, + "grad_norm": 9.445907592773438, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8655821084976196, + "num_tokens": 784175868.0, + "step": 20550 + }, + { + "epoch": 2.6142984353135734, + "ewc_loss": 0.07968121767044067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004232770297676325, + "grad_norm": 9.298262596130371, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8769389986991882, + "num_tokens": 784210646.0, + "step": 20551 + }, + { + "epoch": 2.614425645592164, + "ewc_loss": 0.08044113218784332, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043087618541903794, + "grad_norm": 9.42198371887207, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8705241680145264, + "num_tokens": 784255588.0, + "step": 20552 + }, + { + "epoch": 2.6145528558707545, + "ewc_loss": 0.0800197571516037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042666238732635975, + "grad_norm": 9.392864227294922, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8663384914398193, + "num_tokens": 784292877.0, + "step": 20553 + }, + { + "epoch": 2.614680066149345, + "ewc_loss": 0.08001334965229034, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042659835889935493, + "grad_norm": 9.362435340881348, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8695370554924011, + "num_tokens": 784336150.0, + "step": 20554 + }, + { + "epoch": 2.6148072764279355, + "ewc_loss": 0.08015336096286774, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004279983986634761, + "grad_norm": 9.383370399475098, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8795138597488403, + "num_tokens": 784371547.0, + "step": 20555 + }, + { + "epoch": 2.6149344867065256, + "ewc_loss": 0.08020111918449402, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004284760798327625, + "grad_norm": 9.41204833984375, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8683550357818604, + "num_tokens": 784411473.0, + "step": 20556 + }, + { + "epoch": 2.6150616969851166, + "ewc_loss": 0.08003182709217072, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042678313911892474, + "grad_norm": 9.36478328704834, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.855904221534729, + "num_tokens": 784456713.0, + "step": 20557 + }, + { + "epoch": 2.6151889072637067, + "ewc_loss": 0.08024505525827408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004289154021535069, + "grad_norm": 9.408884048461914, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.879447340965271, + "num_tokens": 784493685.0, + "step": 20558 + }, + { + "epoch": 2.6153161175422976, + "ewc_loss": 0.08002103865146637, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004266752803232521, + "grad_norm": 9.286468505859375, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8771165609359741, + "num_tokens": 784534641.0, + "step": 20559 + }, + { + "epoch": 2.6154433278208877, + "ewc_loss": 0.08045181632041931, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004309830255806446, + "grad_norm": 9.43472671508789, + "learning_rate": 1e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8423882126808167, + "num_tokens": 784576184.0, + "step": 20560 + }, + { + "epoch": 2.6155705380994787, + "ewc_loss": 0.08025698363780975, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004265933239366859, + "grad_norm": 9.391104698181152, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8677485585212708, + "num_tokens": 784613851.0, + "step": 20561 + }, + { + "epoch": 2.615697748378069, + "ewc_loss": 0.08037767559289932, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004302416054997593, + "grad_norm": 9.418643951416016, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8678619861602783, + "num_tokens": 784654133.0, + "step": 20562 + }, + { + "epoch": 2.6158249586566593, + "ewc_loss": 0.08006944507360458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000427159306127578, + "grad_norm": 9.344067573547363, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8671435117721558, + "num_tokens": 784690529.0, + "step": 20563 + }, + { + "epoch": 2.61595216893525, + "ewc_loss": 0.08048157393932343, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043128058314323425, + "grad_norm": 9.42750358581543, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8646997213363647, + "num_tokens": 784731972.0, + "step": 20564 + }, + { + "epoch": 2.6160793792138404, + "ewc_loss": 0.08017697930335999, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042823460535146296, + "grad_norm": 9.363353729248047, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8604190349578857, + "num_tokens": 784773275.0, + "step": 20565 + }, + { + "epoch": 2.616206589492431, + "ewc_loss": 0.08041560649871826, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004306209448259324, + "grad_norm": 9.61092472076416, + "learning_rate": 1e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8495833873748779, + "num_tokens": 784811468.0, + "step": 20566 + }, + { + "epoch": 2.6163337997710214, + "ewc_loss": 0.07977236807346344, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042418853263370693, + "grad_norm": 9.205100059509277, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8668476343154907, + "num_tokens": 784850611.0, + "step": 20567 + }, + { + "epoch": 2.616461010049612, + "ewc_loss": 0.08130308985710144, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004394957795739174, + "grad_norm": 9.606498718261719, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.87005615234375, + "num_tokens": 784886502.0, + "step": 20568 + }, + { + "epoch": 2.6165882203282025, + "ewc_loss": 0.07953114807605743, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004217763780616224, + "grad_norm": 9.251517295837402, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8559924960136414, + "num_tokens": 784924489.0, + "step": 20569 + }, + { + "epoch": 2.616715430606793, + "ewc_loss": 0.08130413293838501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043950622784905136, + "grad_norm": 9.592612266540527, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8698137402534485, + "num_tokens": 784957224.0, + "step": 20570 + }, + { + "epoch": 2.6168426408853835, + "ewc_loss": 0.07959496974945068, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004224145086482167, + "grad_norm": 9.262114524841309, + "learning_rate": 1e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.8403212428092957, + "num_tokens": 784997754.0, + "step": 20571 + }, + { + "epoch": 2.616969851163974, + "ewc_loss": 0.08137015998363495, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044016644824296236, + "grad_norm": 9.594590187072754, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8503038883209229, + "num_tokens": 785033932.0, + "step": 20572 + }, + { + "epoch": 2.6170970614425646, + "ewc_loss": 0.07966121286153793, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004230769700370729, + "grad_norm": 9.241277694702148, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8635162115097046, + "num_tokens": 785073190.0, + "step": 20573 + }, + { + "epoch": 2.617224271721155, + "ewc_loss": 0.08143353462219238, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004408001550473273, + "grad_norm": 9.608983039855957, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8822035789489746, + "num_tokens": 785108397.0, + "step": 20574 + }, + { + "epoch": 2.6173514819997457, + "ewc_loss": 0.08000102639198303, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042403372935950756, + "grad_norm": 9.280755996704102, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8930272459983826, + "num_tokens": 785142671.0, + "step": 20575 + }, + { + "epoch": 2.617478692278336, + "ewc_loss": 0.08134379982948303, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004399027966428548, + "grad_norm": 9.6212158203125, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8636974096298218, + "num_tokens": 785183467.0, + "step": 20576 + }, + { + "epoch": 2.6176059025569267, + "ewc_loss": 0.07993917167186737, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042585658957250416, + "grad_norm": 9.230588912963867, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8548039197921753, + "num_tokens": 785222072.0, + "step": 20577 + }, + { + "epoch": 2.6177331128355172, + "ewc_loss": 0.08148244023323059, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044128927402198315, + "grad_norm": 9.64880084991455, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.855424165725708, + "num_tokens": 785260458.0, + "step": 20578 + }, + { + "epoch": 2.6178603231141078, + "ewc_loss": 0.07986128330230713, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042507765465416014, + "grad_norm": 9.26790714263916, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8516907691955566, + "num_tokens": 785302434.0, + "step": 20579 + }, + { + "epoch": 2.6179875333926983, + "ewc_loss": 0.08140671253204346, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044053199235349894, + "grad_norm": 9.631630897521973, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.850411057472229, + "num_tokens": 785338251.0, + "step": 20580 + }, + { + "epoch": 2.6181147436712884, + "ewc_loss": 0.0797136127948761, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004236009845044464, + "grad_norm": 9.231846809387207, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8618540167808533, + "num_tokens": 785384789.0, + "step": 20581 + }, + { + "epoch": 2.6182419539498794, + "ewc_loss": 0.08161868900060654, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004426517407409847, + "grad_norm": 9.649410247802734, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8652924299240112, + "num_tokens": 785424978.0, + "step": 20582 + }, + { + "epoch": 2.6183691642284694, + "ewc_loss": 0.07971850037574768, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004236498789396137, + "grad_norm": 9.291377067565918, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8584469556808472, + "num_tokens": 785464421.0, + "step": 20583 + }, + { + "epoch": 2.6184963745070604, + "ewc_loss": 0.08133918046951294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043985663796775043, + "grad_norm": 9.637066841125488, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8830925226211548, + "num_tokens": 785496166.0, + "step": 20584 + }, + { + "epoch": 2.6186235847856505, + "ewc_loss": 0.07962427288293839, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042270758422091603, + "grad_norm": 9.252737045288086, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8496608734130859, + "num_tokens": 785528504.0, + "step": 20585 + }, + { + "epoch": 2.6187507950642415, + "ewc_loss": 0.08131612837314606, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004396261356305331, + "grad_norm": 9.600845336914062, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.867304801940918, + "num_tokens": 785570896.0, + "step": 20586 + }, + { + "epoch": 2.6188780053428315, + "ewc_loss": 0.07981045544147491, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004245693562552333, + "grad_norm": 9.236468315124512, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8609877824783325, + "num_tokens": 785612464.0, + "step": 20587 + }, + { + "epoch": 2.619005215621422, + "ewc_loss": 0.08122560381889343, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043872083188034594, + "grad_norm": 9.532255172729492, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8654776215553284, + "num_tokens": 785655500.0, + "step": 20588 + }, + { + "epoch": 2.6191324259000126, + "ewc_loss": 0.07995733618736267, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042603822657838464, + "grad_norm": 9.374485969543457, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8676974773406982, + "num_tokens": 785691186.0, + "step": 20589 + }, + { + "epoch": 2.619259636178603, + "ewc_loss": 0.08092403411865234, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004357051511760801, + "grad_norm": 9.464376449584961, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8795689344406128, + "num_tokens": 785728575.0, + "step": 20590 + }, + { + "epoch": 2.6193868464571937, + "ewc_loss": 0.08018092811107635, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042827409924939275, + "grad_norm": 9.383699417114258, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8625813126564026, + "num_tokens": 785765782.0, + "step": 20591 + }, + { + "epoch": 2.619514056735784, + "ewc_loss": 0.08053742349147797, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004318390565458685, + "grad_norm": 9.427098274230957, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8696730732917786, + "num_tokens": 785800130.0, + "step": 20592 + }, + { + "epoch": 2.6196412670143747, + "ewc_loss": 0.08062379062175751, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000430261337896809, + "grad_norm": 9.337752342224121, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8918615579605103, + "num_tokens": 785836647.0, + "step": 20593 + }, + { + "epoch": 2.6197684772929652, + "ewc_loss": 0.0806223452091217, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004326882481109351, + "grad_norm": 9.3853759765625, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8586041927337646, + "num_tokens": 785873897.0, + "step": 20594 + }, + { + "epoch": 2.6198956875715558, + "ewc_loss": 0.08040046691894531, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304694593884051, + "grad_norm": 9.39159870147705, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8602387309074402, + "num_tokens": 785914296.0, + "step": 20595 + }, + { + "epoch": 2.6200228978501463, + "ewc_loss": 0.08044014871120453, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043086629011668265, + "grad_norm": 9.385103225708008, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8534672260284424, + "num_tokens": 785951426.0, + "step": 20596 + }, + { + "epoch": 2.620150108128737, + "ewc_loss": 0.08048520237207413, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004313168756198138, + "grad_norm": 9.329728126525879, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8689777851104736, + "num_tokens": 785989906.0, + "step": 20597 + }, + { + "epoch": 2.6202773184073274, + "ewc_loss": 0.08082158118486404, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004322392342146486, + "grad_norm": 9.504525184631348, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8599047660827637, + "num_tokens": 786024763.0, + "step": 20598 + }, + { + "epoch": 2.620404528685918, + "ewc_loss": 0.08127456903457642, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00042944494634866714, + "grad_norm": 9.623830795288086, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8610270023345947, + "num_tokens": 786063055.0, + "step": 20599 + }, + { + "epoch": 2.6205317389645084, + "ewc_loss": 0.08011461794376373, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042761105578392744, + "grad_norm": 9.34189224243164, + "learning_rate": 1e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8373317718505859, + "num_tokens": 786097675.0, + "step": 20600 + }, + { + "epoch": 2.620658949243099, + "ewc_loss": 0.0811586081981659, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043560954509302974, + "grad_norm": 9.460335731506348, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8762738108634949, + "num_tokens": 786133721.0, + "step": 20601 + }, + { + "epoch": 2.6207861595216895, + "ewc_loss": 0.07999366521835327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004264014423824847, + "grad_norm": 9.305502891540527, + "learning_rate": 1e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8472605347633362, + "num_tokens": 786179615.0, + "step": 20602 + }, + { + "epoch": 2.62091336980028, + "ewc_loss": 0.0812201127409935, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043622456723824143, + "grad_norm": 9.53480052947998, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8778684735298157, + "num_tokens": 786214945.0, + "step": 20603 + }, + { + "epoch": 2.6210405800788705, + "ewc_loss": 0.07989096641540527, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004253744555171579, + "grad_norm": 9.282495498657227, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8667064905166626, + "num_tokens": 786250880.0, + "step": 20604 + }, + { + "epoch": 2.621167790357461, + "ewc_loss": 0.08114683628082275, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004354917909950018, + "grad_norm": 9.48009967803955, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8661211729049683, + "num_tokens": 786286095.0, + "step": 20605 + }, + { + "epoch": 2.621295000636051, + "ewc_loss": 0.079856276512146, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004250275669619441, + "grad_norm": 9.314152717590332, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8697606325149536, + "num_tokens": 786319547.0, + "step": 20606 + }, + { + "epoch": 2.621422210914642, + "ewc_loss": 0.08077191561460495, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000434183981269598, + "grad_norm": 9.452863693237305, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8642958402633667, + "num_tokens": 786358699.0, + "step": 20607 + }, + { + "epoch": 2.621549421193232, + "ewc_loss": 0.08000411093235016, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004265059542376548, + "grad_norm": 9.260930061340332, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8691962361335754, + "num_tokens": 786399732.0, + "step": 20608 + }, + { + "epoch": 2.621676631471823, + "ewc_loss": 0.08086395263671875, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043510441901162267, + "grad_norm": 9.45606803894043, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8556399941444397, + "num_tokens": 786437375.0, + "step": 20609 + }, + { + "epoch": 2.6218038417504133, + "ewc_loss": 0.08011482656002045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042761306394822896, + "grad_norm": 9.370354652404785, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8544648289680481, + "num_tokens": 786475538.0, + "step": 20610 + }, + { + "epoch": 2.621931052029004, + "ewc_loss": 0.08055868744850159, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004320517473388463, + "grad_norm": 9.380965232849121, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8663221597671509, + "num_tokens": 786516802.0, + "step": 20611 + }, + { + "epoch": 2.6220582623075943, + "ewc_loss": 0.08030954003334045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004295602848287672, + "grad_norm": 9.327239036560059, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8656883239746094, + "num_tokens": 786558470.0, + "step": 20612 + }, + { + "epoch": 2.622185472586185, + "ewc_loss": 0.08065854012966156, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043305029976181686, + "grad_norm": 9.463068008422852, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8593434691429138, + "num_tokens": 786601163.0, + "step": 20613 + }, + { + "epoch": 2.6223126828647754, + "ewc_loss": 0.08033670485019684, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004273905069567263, + "grad_norm": 9.344747543334961, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8723952770233154, + "num_tokens": 786636741.0, + "step": 20614 + }, + { + "epoch": 2.622439893143366, + "ewc_loss": 0.08069520443677902, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004334168916102499, + "grad_norm": 9.503277778625488, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8528221845626831, + "num_tokens": 786672931.0, + "step": 20615 + }, + { + "epoch": 2.6225671034219564, + "ewc_loss": 0.07994440197944641, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042590891825966537, + "grad_norm": 9.337977409362793, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8763603568077087, + "num_tokens": 786709310.0, + "step": 20616 + }, + { + "epoch": 2.622694313700547, + "ewc_loss": 0.08055265247821808, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043199132778681815, + "grad_norm": 9.477019309997559, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8704811334609985, + "num_tokens": 786747113.0, + "step": 20617 + }, + { + "epoch": 2.6228215239791375, + "ewc_loss": 0.07997556775808334, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004262205329723656, + "grad_norm": 9.34100341796875, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8646814823150635, + "num_tokens": 786782033.0, + "step": 20618 + }, + { + "epoch": 2.622948734257728, + "ewc_loss": 0.0804532915353775, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043099778122268617, + "grad_norm": 9.533726692199707, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8651355504989624, + "num_tokens": 786827229.0, + "step": 20619 + }, + { + "epoch": 2.6230759445363185, + "ewc_loss": 0.0800262838602066, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042428632150404155, + "grad_norm": 9.359110832214355, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8741492033004761, + "num_tokens": 786861722.0, + "step": 20620 + }, + { + "epoch": 2.623203154814909, + "ewc_loss": 0.08039940148591995, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043045886559411883, + "grad_norm": 9.477581977844238, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.86785888671875, + "num_tokens": 786896202.0, + "step": 20621 + }, + { + "epoch": 2.6233303650934996, + "ewc_loss": 0.07971271872520447, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042359199142083526, + "grad_norm": 9.288246154785156, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8664764761924744, + "num_tokens": 786932154.0, + "step": 20622 + }, + { + "epoch": 2.62345757537209, + "ewc_loss": 0.08051234483718872, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043158832704648376, + "grad_norm": 9.449804306030273, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.855729341506958, + "num_tokens": 786972184.0, + "step": 20623 + }, + { + "epoch": 2.6235847856506807, + "ewc_loss": 0.07963506877422333, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004228155012242496, + "grad_norm": 9.309918403625488, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8582881689071655, + "num_tokens": 787015514.0, + "step": 20624 + }, + { + "epoch": 2.623711995929271, + "ewc_loss": 0.08049267530441284, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004313916142564267, + "grad_norm": 9.459772109985352, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8638660907745361, + "num_tokens": 787054023.0, + "step": 20625 + }, + { + "epoch": 2.6238392062078617, + "ewc_loss": 0.0797148048877716, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042361285886727273, + "grad_norm": 9.346145629882812, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8598728775978088, + "num_tokens": 787088858.0, + "step": 20626 + }, + { + "epoch": 2.6239664164864522, + "ewc_loss": 0.08020623028278351, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042852709884755313, + "grad_norm": 9.394739151000977, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8673742413520813, + "num_tokens": 787124805.0, + "step": 20627 + }, + { + "epoch": 2.6240936267650428, + "ewc_loss": 0.08023886382579803, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004264120652806014, + "grad_norm": 9.386957168579102, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8783107399940491, + "num_tokens": 787164922.0, + "step": 20628 + }, + { + "epoch": 2.6242208370436333, + "ewc_loss": 0.08039125800132751, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042793602915480733, + "grad_norm": 9.363794326782227, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8768572807312012, + "num_tokens": 787204936.0, + "step": 20629 + }, + { + "epoch": 2.624348047322224, + "ewc_loss": 0.08002530038356781, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004267178592272103, + "grad_norm": 9.277746200561523, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8770703077316284, + "num_tokens": 787238292.0, + "step": 20630 + }, + { + "epoch": 2.624475257600814, + "ewc_loss": 0.08039277791976929, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043039265437982976, + "grad_norm": 9.40516471862793, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8845525979995728, + "num_tokens": 787269054.0, + "step": 20631 + }, + { + "epoch": 2.624602467879405, + "ewc_loss": 0.07985866814851761, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042505151941441, + "grad_norm": 9.31299114227295, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8633546829223633, + "num_tokens": 787310022.0, + "step": 20632 + }, + { + "epoch": 2.624729678157995, + "ewc_loss": 0.08046289533376694, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043109379475936294, + "grad_norm": 9.353829383850098, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8551620244979858, + "num_tokens": 787350917.0, + "step": 20633 + }, + { + "epoch": 2.624856888436586, + "ewc_loss": 0.08023986220359802, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042886342271231115, + "grad_norm": 9.354852676391602, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8731988668441772, + "num_tokens": 787388187.0, + "step": 20634 + }, + { + "epoch": 2.624984098715176, + "ewc_loss": 0.08034119755029678, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004298768180888146, + "grad_norm": 9.312750816345215, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8713351488113403, + "num_tokens": 787424998.0, + "step": 20635 + }, + { + "epoch": 2.6251113089937665, + "ewc_loss": 0.08041179180145264, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004305827897042036, + "grad_norm": 9.3953218460083, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8766137957572937, + "num_tokens": 787465456.0, + "step": 20636 + }, + { + "epoch": 2.625238519272357, + "ewc_loss": 0.08021510392427444, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042861589463427663, + "grad_norm": 9.401776313781738, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8635389804840088, + "num_tokens": 787504797.0, + "step": 20637 + }, + { + "epoch": 2.6253657295509476, + "ewc_loss": 0.0802723839879036, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004291886871214956, + "grad_norm": 9.337546348571777, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8755298852920532, + "num_tokens": 787542911.0, + "step": 20638 + }, + { + "epoch": 2.625492939829538, + "ewc_loss": 0.08041591942310333, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043062400072813034, + "grad_norm": 9.41942310333252, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8555870056152344, + "num_tokens": 787576143.0, + "step": 20639 + }, + { + "epoch": 2.6256201501081287, + "ewc_loss": 0.08018713444471359, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004283362068235874, + "grad_norm": 9.328048706054688, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8670600652694702, + "num_tokens": 787619912.0, + "step": 20640 + }, + { + "epoch": 2.625747360386719, + "ewc_loss": 0.0805729478597641, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004321942979004234, + "grad_norm": 9.40169620513916, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.854812502861023, + "num_tokens": 787658181.0, + "step": 20641 + }, + { + "epoch": 2.6258745706653097, + "ewc_loss": 0.08022108674049377, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004286757030058652, + "grad_norm": 9.381526947021484, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8553633093833923, + "num_tokens": 787694889.0, + "step": 20642 + }, + { + "epoch": 2.6260017809439002, + "ewc_loss": 0.08056193590164185, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004320841981098056, + "grad_norm": 9.396589279174805, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8800647258758545, + "num_tokens": 787737633.0, + "step": 20643 + }, + { + "epoch": 2.6261289912224908, + "ewc_loss": 0.08022232353687286, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042868813034147024, + "grad_norm": 9.357584953308105, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.853405237197876, + "num_tokens": 787779069.0, + "step": 20644 + }, + { + "epoch": 2.6262562015010813, + "ewc_loss": 0.08048857748508453, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004313505778554827, + "grad_norm": 9.486579895019531, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8514828085899353, + "num_tokens": 787815544.0, + "step": 20645 + }, + { + "epoch": 2.626383411779672, + "ewc_loss": 0.08002413809299469, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004267062759026885, + "grad_norm": 9.295357704162598, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8600195646286011, + "num_tokens": 787852684.0, + "step": 20646 + }, + { + "epoch": 2.6265106220582624, + "ewc_loss": 0.08069431036710739, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043340795673429966, + "grad_norm": 9.397656440734863, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8622423410415649, + "num_tokens": 787893452.0, + "step": 20647 + }, + { + "epoch": 2.626637832336853, + "ewc_loss": 0.08012951910495758, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277600673958659, + "grad_norm": 9.391059875488281, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8649008274078369, + "num_tokens": 787929430.0, + "step": 20648 + }, + { + "epoch": 2.6267650426154434, + "ewc_loss": 0.08036929368972778, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004301578155718744, + "grad_norm": 9.38565731048584, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.863095760345459, + "num_tokens": 787969768.0, + "step": 20649 + }, + { + "epoch": 2.626892252894034, + "ewc_loss": 0.08033914864063263, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004298562998883426, + "grad_norm": 9.398064613342285, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8654530048370361, + "num_tokens": 788009605.0, + "step": 20650 + }, + { + "epoch": 2.6270194631726245, + "ewc_loss": 0.08022330701351166, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042869793833233416, + "grad_norm": 9.418477058410645, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8782724142074585, + "num_tokens": 788044652.0, + "step": 20651 + }, + { + "epoch": 2.627146673451215, + "ewc_loss": 0.08023306727409363, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004287954652681947, + "grad_norm": 9.40744400024414, + "learning_rate": 1e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8373579978942871, + "num_tokens": 788085981.0, + "step": 20652 + }, + { + "epoch": 2.6272738837298055, + "ewc_loss": 0.08017008006572723, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042816565837711096, + "grad_norm": 9.356818199157715, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8624798655509949, + "num_tokens": 788132196.0, + "step": 20653 + }, + { + "epoch": 2.6274010940083956, + "ewc_loss": 0.08040709793567657, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043053587432950735, + "grad_norm": 9.517537117004395, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8762231469154358, + "num_tokens": 788160099.0, + "step": 20654 + }, + { + "epoch": 2.6275283042869866, + "ewc_loss": 0.07983973622322083, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042486225720494986, + "grad_norm": 9.37242317199707, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8617753982543945, + "num_tokens": 788197264.0, + "step": 20655 + }, + { + "epoch": 2.6276555145655767, + "ewc_loss": 0.08039707690477371, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304356116335839, + "grad_norm": 9.45377254486084, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8762221336364746, + "num_tokens": 788232900.0, + "step": 20656 + }, + { + "epoch": 2.6277827248441676, + "ewc_loss": 0.0798414796590805, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000424879661295563, + "grad_norm": 9.336841583251953, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8605400919914246, + "num_tokens": 788273720.0, + "step": 20657 + }, + { + "epoch": 2.6279099351227577, + "ewc_loss": 0.08043782413005829, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004308430652599782, + "grad_norm": 9.43808364868164, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8665879368782043, + "num_tokens": 788308425.0, + "step": 20658 + }, + { + "epoch": 2.6280371454013487, + "ewc_loss": 0.07996518164873123, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042611666140146554, + "grad_norm": 9.341485977172852, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.855487048625946, + "num_tokens": 788343701.0, + "step": 20659 + }, + { + "epoch": 2.628164355679939, + "ewc_loss": 0.08032022416591644, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004296670958865434, + "grad_norm": 9.512911796569824, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8498669862747192, + "num_tokens": 788372789.0, + "step": 20660 + }, + { + "epoch": 2.6282915659585293, + "ewc_loss": 0.07985368371009827, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004250016645528376, + "grad_norm": 9.349315643310547, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8697280287742615, + "num_tokens": 788408422.0, + "step": 20661 + }, + { + "epoch": 2.62841877623712, + "ewc_loss": 0.08082655072212219, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004322889435570687, + "grad_norm": 9.589309692382812, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8677977323532104, + "num_tokens": 788446113.0, + "step": 20662 + }, + { + "epoch": 2.6285459865157104, + "ewc_loss": 0.07936649769544601, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042012982885353267, + "grad_norm": 9.244433403015137, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8580710291862488, + "num_tokens": 788485009.0, + "step": 20663 + }, + { + "epoch": 2.628673196794301, + "ewc_loss": 0.08092180639505386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043568291584961116, + "grad_norm": 9.49009895324707, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8684263825416565, + "num_tokens": 788523446.0, + "step": 20664 + }, + { + "epoch": 2.6288004070728914, + "ewc_loss": 0.07962848246097565, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042030829354189336, + "grad_norm": 9.224024772644043, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8740978240966797, + "num_tokens": 788557423.0, + "step": 20665 + }, + { + "epoch": 2.628927617351482, + "ewc_loss": 0.08106894046068192, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043715425999835134, + "grad_norm": 9.58838939666748, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8607053756713867, + "num_tokens": 788588177.0, + "step": 20666 + }, + { + "epoch": 2.6290548276300725, + "ewc_loss": 0.07947757095098495, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042124054743908346, + "grad_norm": 9.205370903015137, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8656305074691772, + "num_tokens": 788624877.0, + "step": 20667 + }, + { + "epoch": 2.629182037908663, + "ewc_loss": 0.08139455318450928, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004379689635243267, + "grad_norm": 9.51738452911377, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8751783967018127, + "num_tokens": 788662532.0, + "step": 20668 + }, + { + "epoch": 2.6293092481872535, + "ewc_loss": 0.0799025148153305, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042304853559471667, + "grad_norm": 9.224164009094238, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8653541803359985, + "num_tokens": 788702734.0, + "step": 20669 + }, + { + "epoch": 2.629436458465844, + "ewc_loss": 0.08122025430202484, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004362259933259338, + "grad_norm": 9.529346466064453, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8642740249633789, + "num_tokens": 788738041.0, + "step": 20670 + }, + { + "epoch": 2.6295636687444346, + "ewc_loss": 0.07991212606430054, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042314475285820663, + "grad_norm": 9.273736953735352, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.865395188331604, + "num_tokens": 788772998.0, + "step": 20671 + }, + { + "epoch": 2.629690879023025, + "ewc_loss": 0.08097866922616959, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043625151738524437, + "grad_norm": 9.565505981445312, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8482388257980347, + "num_tokens": 788814310.0, + "step": 20672 + }, + { + "epoch": 2.6298180893016156, + "ewc_loss": 0.07991104573011398, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042313389712944627, + "grad_norm": 9.263833045959473, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8692014217376709, + "num_tokens": 788852811.0, + "step": 20673 + }, + { + "epoch": 2.629945299580206, + "ewc_loss": 0.08113628625869751, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004353863187134266, + "grad_norm": 9.569690704345703, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8724432587623596, + "num_tokens": 788888756.0, + "step": 20674 + }, + { + "epoch": 2.6300725098587967, + "ewc_loss": 0.0798272043466568, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004222954448778182, + "grad_norm": 9.262579917907715, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8691967725753784, + "num_tokens": 788926292.0, + "step": 20675 + }, + { + "epoch": 2.6301997201373872, + "ewc_loss": 0.08095487952232361, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043601362267509103, + "grad_norm": 9.507133483886719, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8752562999725342, + "num_tokens": 788965625.0, + "step": 20676 + }, + { + "epoch": 2.6303269304159778, + "ewc_loss": 0.07954542338848114, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004219191032461822, + "grad_norm": 9.196890830993652, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8701117038726807, + "num_tokens": 789007390.0, + "step": 20677 + }, + { + "epoch": 2.6304541406945683, + "ewc_loss": 0.08096857368946075, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004361506144050509, + "grad_norm": 9.544465065002441, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8677384853363037, + "num_tokens": 789047940.0, + "step": 20678 + }, + { + "epoch": 2.6305813509731584, + "ewc_loss": 0.07988229393959045, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004228463803883642, + "grad_norm": 9.225049018859863, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8650476932525635, + "num_tokens": 789086321.0, + "step": 20679 + }, + { + "epoch": 2.6307085612517493, + "ewc_loss": 0.08109746873378754, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043743953574448824, + "grad_norm": 9.527005195617676, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8782164454460144, + "num_tokens": 789121757.0, + "step": 20680 + }, + { + "epoch": 2.6308357715303394, + "ewc_loss": 0.07973873615264893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000423852150561288, + "grad_norm": 9.266192436218262, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8795053958892822, + "num_tokens": 789160076.0, + "step": 20681 + }, + { + "epoch": 2.6309629818089304, + "ewc_loss": 0.08126065135002136, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043662998359650373, + "grad_norm": 9.544239044189453, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8563315272331238, + "num_tokens": 789197080.0, + "step": 20682 + }, + { + "epoch": 2.6310901920875205, + "ewc_loss": 0.07961711287498474, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042263593059033155, + "grad_norm": 9.22104549407959, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8601794838905334, + "num_tokens": 789249876.0, + "step": 20683 + }, + { + "epoch": 2.6312174023661115, + "ewc_loss": 0.08092939853668213, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043575887684710324, + "grad_norm": 9.456463813781738, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8642760515213013, + "num_tokens": 789290146.0, + "step": 20684 + }, + { + "epoch": 2.6313446126447015, + "ewc_loss": 0.07994053512811661, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004258702101651579, + "grad_norm": 9.288945198059082, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8696418404579163, + "num_tokens": 789335610.0, + "step": 20685 + }, + { + "epoch": 2.631471822923292, + "ewc_loss": 0.08059839904308319, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043244886910542846, + "grad_norm": 9.500269889831543, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8591314554214478, + "num_tokens": 789371956.0, + "step": 20686 + }, + { + "epoch": 2.6315990332018826, + "ewc_loss": 0.08000995963811874, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042656445293687284, + "grad_norm": 9.250778198242188, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8618285059928894, + "num_tokens": 789415257.0, + "step": 20687 + }, + { + "epoch": 2.631726243480473, + "ewc_loss": 0.08090746402740479, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043553952127695084, + "grad_norm": 9.55337142944336, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8667013645172119, + "num_tokens": 789452533.0, + "step": 20688 + }, + { + "epoch": 2.6318534537590637, + "ewc_loss": 0.07986024767160416, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000425067322794348, + "grad_norm": 9.292258262634277, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.860977828502655, + "num_tokens": 789488780.0, + "step": 20689 + }, + { + "epoch": 2.631980664037654, + "ewc_loss": 0.08082026243209839, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004346674249973148, + "grad_norm": 9.441014289855957, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8621804118156433, + "num_tokens": 789528169.0, + "step": 20690 + }, + { + "epoch": 2.6321078743162447, + "ewc_loss": 0.0800880491733551, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000427345308708027, + "grad_norm": 9.34529972076416, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.858381986618042, + "num_tokens": 789568382.0, + "step": 20691 + }, + { + "epoch": 2.6322350845948352, + "ewc_loss": 0.08044128119945526, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043087766971439123, + "grad_norm": 9.38005542755127, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8616117835044861, + "num_tokens": 789606895.0, + "step": 20692 + }, + { + "epoch": 2.6323622948734258, + "ewc_loss": 0.08035311102867126, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004299959691707045, + "grad_norm": 9.367133140563965, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8499135971069336, + "num_tokens": 789649106.0, + "step": 20693 + }, + { + "epoch": 2.6324895051520163, + "ewc_loss": 0.08039681613445282, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043043302139267325, + "grad_norm": 9.37177848815918, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8698256611824036, + "num_tokens": 789689449.0, + "step": 20694 + }, + { + "epoch": 2.632616715430607, + "ewc_loss": 0.08043384552001953, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043080333853140473, + "grad_norm": 9.376757621765137, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.876469612121582, + "num_tokens": 789726179.0, + "step": 20695 + }, + { + "epoch": 2.6327439257091974, + "ewc_loss": 0.08022942394018173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042875908548012376, + "grad_norm": 9.365412712097168, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8469393849372864, + "num_tokens": 789764858.0, + "step": 20696 + }, + { + "epoch": 2.632871135987788, + "ewc_loss": 0.0805157870054245, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043162269867025316, + "grad_norm": 9.422987937927246, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8523237705230713, + "num_tokens": 789799196.0, + "step": 20697 + }, + { + "epoch": 2.6329983462663784, + "ewc_loss": 0.08016816526651382, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042814650805667043, + "grad_norm": 9.347338676452637, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8637033700942993, + "num_tokens": 789840498.0, + "step": 20698 + }, + { + "epoch": 2.633125556544969, + "ewc_loss": 0.0804503858089447, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004309687064960599, + "grad_norm": 9.372095108032227, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8692852258682251, + "num_tokens": 789874269.0, + "step": 20699 + }, + { + "epoch": 2.6332527668235595, + "ewc_loss": 0.08031225204467773, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004295873804949224, + "grad_norm": 9.395726203918457, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8663133978843689, + "num_tokens": 789914938.0, + "step": 20700 + }, + { + "epoch": 2.63337997710215, + "ewc_loss": 0.08024226129055023, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042888749158009887, + "grad_norm": 9.339383125305176, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8659347891807556, + "num_tokens": 789959219.0, + "step": 20701 + }, + { + "epoch": 2.6335071873807405, + "ewc_loss": 0.0803338810801506, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004298036510590464, + "grad_norm": 9.440251350402832, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8585119247436523, + "num_tokens": 789993707.0, + "step": 20702 + }, + { + "epoch": 2.633634397659331, + "ewc_loss": 0.08028687536716461, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004293335950933397, + "grad_norm": 9.3550443649292, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8453258275985718, + "num_tokens": 790034285.0, + "step": 20703 + }, + { + "epoch": 2.633761607937921, + "ewc_loss": 0.08034375309944153, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042990242945961654, + "grad_norm": 9.375280380249023, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8683237433433533, + "num_tokens": 790069714.0, + "step": 20704 + }, + { + "epoch": 2.633888818216512, + "ewc_loss": 0.08012565970420837, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277214757166803, + "grad_norm": 9.351513862609863, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8645287752151489, + "num_tokens": 790110733.0, + "step": 20705 + }, + { + "epoch": 2.634016028495102, + "ewc_loss": 0.08041524142026901, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004306172486394644, + "grad_norm": 9.407981872558594, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8699308633804321, + "num_tokens": 790144468.0, + "step": 20706 + }, + { + "epoch": 2.634143238773693, + "ewc_loss": 0.08001897484064102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042665458749979734, + "grad_norm": 9.306219100952148, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8625977635383606, + "num_tokens": 790186881.0, + "step": 20707 + }, + { + "epoch": 2.6342704490522832, + "ewc_loss": 0.08051498234272003, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043161469511687756, + "grad_norm": 9.450371742248535, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8705151081085205, + "num_tokens": 790225083.0, + "step": 20708 + }, + { + "epoch": 2.6343976593308738, + "ewc_loss": 0.08013838529586792, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042784868855960667, + "grad_norm": 9.348909378051758, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8647918701171875, + "num_tokens": 790266822.0, + "step": 20709 + }, + { + "epoch": 2.6345248696094643, + "ewc_loss": 0.08052808046340942, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004317456914577633, + "grad_norm": 9.443965911865234, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8617310523986816, + "num_tokens": 790305948.0, + "step": 20710 + }, + { + "epoch": 2.634652079888055, + "ewc_loss": 0.08016479015350342, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004281127476133406, + "grad_norm": 9.332158088684082, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8666803240776062, + "num_tokens": 790341639.0, + "step": 20711 + }, + { + "epoch": 2.6347792901666454, + "ewc_loss": 0.08053953945636749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004318601859267801, + "grad_norm": 9.426874160766602, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8742842078208923, + "num_tokens": 790377442.0, + "step": 20712 + }, + { + "epoch": 2.634906500445236, + "ewc_loss": 0.08031036704778671, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042956852121278644, + "grad_norm": 9.339032173156738, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8631610870361328, + "num_tokens": 790414911.0, + "step": 20713 + }, + { + "epoch": 2.6350337107238264, + "ewc_loss": 0.08072217553853989, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043368659680709243, + "grad_norm": 9.469473838806152, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.863740086555481, + "num_tokens": 790453968.0, + "step": 20714 + }, + { + "epoch": 2.635160921002417, + "ewc_loss": 0.08011054992675781, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000427570310421288, + "grad_norm": 9.312417984008789, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8612912893295288, + "num_tokens": 790493216.0, + "step": 20715 + }, + { + "epoch": 2.6352881312810075, + "ewc_loss": 0.0807601660490036, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043406651820987463, + "grad_norm": 9.470860481262207, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8601815700531006, + "num_tokens": 790529380.0, + "step": 20716 + }, + { + "epoch": 2.635415341559598, + "ewc_loss": 0.08031907677650452, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042721425415948033, + "grad_norm": 9.336200714111328, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8602907657623291, + "num_tokens": 790569568.0, + "step": 20717 + }, + { + "epoch": 2.6355425518381885, + "ewc_loss": 0.08083315193653107, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043479641317389905, + "grad_norm": 9.488450050354004, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8701344132423401, + "num_tokens": 790602269.0, + "step": 20718 + }, + { + "epoch": 2.635669762116779, + "ewc_loss": 0.08014167845249176, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042788160499185324, + "grad_norm": 9.337242126464844, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8528419137001038, + "num_tokens": 790639723.0, + "step": 20719 + }, + { + "epoch": 2.6357969723953696, + "ewc_loss": 0.08055821061134338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004320469161029905, + "grad_norm": 9.499246597290039, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8499163389205933, + "num_tokens": 790676103.0, + "step": 20720 + }, + { + "epoch": 2.63592418267396, + "ewc_loss": 0.08009848743677139, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004274497041478753, + "grad_norm": 9.318501472473145, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8663017749786377, + "num_tokens": 790712118.0, + "step": 20721 + }, + { + "epoch": 2.6360513929525506, + "ewc_loss": 0.08085934817790985, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043505834764800966, + "grad_norm": 9.614465713500977, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.863996148109436, + "num_tokens": 790747079.0, + "step": 20722 + }, + { + "epoch": 2.636178603231141, + "ewc_loss": 0.07964901626110077, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042295505409128964, + "grad_norm": 9.26085376739502, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8544413447380066, + "num_tokens": 790781741.0, + "step": 20723 + }, + { + "epoch": 2.6363058135097317, + "ewc_loss": 0.08111628890037537, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043762775021605194, + "grad_norm": 9.568638801574707, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8640978336334229, + "num_tokens": 790820243.0, + "step": 20724 + }, + { + "epoch": 2.6364330237883222, + "ewc_loss": 0.07964874804019928, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042295228922739625, + "grad_norm": 9.23947525024414, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8791186809539795, + "num_tokens": 790851492.0, + "step": 20725 + }, + { + "epoch": 2.6365602340669128, + "ewc_loss": 0.08120127767324448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000438477611169219, + "grad_norm": 9.556382179260254, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8697234392166138, + "num_tokens": 790882293.0, + "step": 20726 + }, + { + "epoch": 2.636687444345503, + "ewc_loss": 0.07974231988191605, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042388803558424115, + "grad_norm": 9.296818733215332, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8599777817726135, + "num_tokens": 790922188.0, + "step": 20727 + }, + { + "epoch": 2.636814654624094, + "ewc_loss": 0.08124470710754395, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043647049460560083, + "grad_norm": 9.819334030151367, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8740494251251221, + "num_tokens": 790960675.0, + "step": 20728 + }, + { + "epoch": 2.636941864902684, + "ewc_loss": 0.0793188288807869, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041965313721448183, + "grad_norm": 9.149149894714355, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8728765249252319, + "num_tokens": 790999352.0, + "step": 20729 + }, + { + "epoch": 2.637069075181275, + "ewc_loss": 0.08170753717422485, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004435402515809983, + "grad_norm": 9.627110481262207, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8648397326469421, + "num_tokens": 791035898.0, + "step": 20730 + }, + { + "epoch": 2.637196285459865, + "ewc_loss": 0.07938403636217117, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042030520853586495, + "grad_norm": 9.149371147155762, + "learning_rate": 1e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.8409108519554138, + "num_tokens": 791079731.0, + "step": 20731 + }, + { + "epoch": 2.637323495738456, + "ewc_loss": 0.08202680945396423, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004442915378604084, + "grad_norm": 9.609187126159668, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8764612674713135, + "num_tokens": 791118565.0, + "step": 20732 + }, + { + "epoch": 2.637450706017046, + "ewc_loss": 0.07949298620223999, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042139465222135186, + "grad_norm": 9.213399887084961, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8649693131446838, + "num_tokens": 791154876.0, + "step": 20733 + }, + { + "epoch": 2.6375779162956365, + "ewc_loss": 0.08150976896286011, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044156250078231096, + "grad_norm": 9.589465141296387, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8641083240509033, + "num_tokens": 791196003.0, + "step": 20734 + }, + { + "epoch": 2.637705126574227, + "ewc_loss": 0.07973720133304596, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042383684194646776, + "grad_norm": 9.19373893737793, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8632072806358337, + "num_tokens": 791236462.0, + "step": 20735 + }, + { + "epoch": 2.6378323368528176, + "ewc_loss": 0.0815599262714386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004420640761964023, + "grad_norm": 9.697529792785645, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8517538905143738, + "num_tokens": 791271918.0, + "step": 20736 + }, + { + "epoch": 2.637959547131408, + "ewc_loss": 0.08001925051212311, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004242159193381667, + "grad_norm": 9.183242797851562, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8768521547317505, + "num_tokens": 791309813.0, + "step": 20737 + }, + { + "epoch": 2.6380867574099987, + "ewc_loss": 0.08198593556880951, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044632417848333716, + "grad_norm": 9.751304626464844, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8718047738075256, + "num_tokens": 791338854.0, + "step": 20738 + }, + { + "epoch": 2.638213967688589, + "ewc_loss": 0.07970206439495087, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004234854714013636, + "grad_norm": 9.315403938293457, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8703035116195679, + "num_tokens": 791376580.0, + "step": 20739 + }, + { + "epoch": 2.6383411779671797, + "ewc_loss": 0.08158968389034271, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004423616628628224, + "grad_norm": 9.621859550476074, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8619676232337952, + "num_tokens": 791415931.0, + "step": 20740 + }, + { + "epoch": 2.6384683882457702, + "ewc_loss": 0.07995276153087616, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004259924462530762, + "grad_norm": 9.331465721130371, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8610271215438843, + "num_tokens": 791448201.0, + "step": 20741 + }, + { + "epoch": 2.6385955985243608, + "ewc_loss": 0.08162465691566467, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044026997056789696, + "grad_norm": 9.721640586853027, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8569441437721252, + "num_tokens": 791489183.0, + "step": 20742 + }, + { + "epoch": 2.6387228088029513, + "ewc_loss": 0.07983940839767456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004248588811606169, + "grad_norm": 9.338547706604004, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8655604124069214, + "num_tokens": 791528414.0, + "step": 20743 + }, + { + "epoch": 2.638850019081542, + "ewc_loss": 0.08149039000272751, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043892732355743647, + "grad_norm": 9.584654808044434, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8837827444076538, + "num_tokens": 791566379.0, + "step": 20744 + }, + { + "epoch": 2.6389772293601323, + "ewc_loss": 0.08030562847852707, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042707970715127885, + "grad_norm": 9.352005958557129, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8658150434494019, + "num_tokens": 791599789.0, + "step": 20745 + }, + { + "epoch": 2.639104439638723, + "ewc_loss": 0.08114081621170044, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043543160427361727, + "grad_norm": 9.559602737426758, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8501743078231812, + "num_tokens": 791642333.0, + "step": 20746 + }, + { + "epoch": 2.6392316499173134, + "ewc_loss": 0.08044551312923431, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042847858276218176, + "grad_norm": 9.390840530395508, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8660295605659485, + "num_tokens": 791687190.0, + "step": 20747 + }, + { + "epoch": 2.639358860195904, + "ewc_loss": 0.08096076548099518, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004336311249062419, + "grad_norm": 9.518685340881348, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8502122163772583, + "num_tokens": 791722806.0, + "step": 20748 + }, + { + "epoch": 2.6394860704744945, + "ewc_loss": 0.08033902943134308, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042741376091726124, + "grad_norm": 9.411874771118164, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8494284749031067, + "num_tokens": 791757726.0, + "step": 20749 + }, + { + "epoch": 2.639613280753085, + "ewc_loss": 0.08076944202184677, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004317178390920162, + "grad_norm": 9.55305290222168, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8718013167381287, + "num_tokens": 791795160.0, + "step": 20750 + }, + { + "epoch": 2.6397404910316755, + "ewc_loss": 0.08013724535703659, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042539587593637407, + "grad_norm": 9.39331340789795, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8776509165763855, + "num_tokens": 791831682.0, + "step": 20751 + }, + { + "epoch": 2.6398677013102656, + "ewc_loss": 0.0806572288274765, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043059574090875685, + "grad_norm": 9.48154067993164, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8759208917617798, + "num_tokens": 791863230.0, + "step": 20752 + }, + { + "epoch": 2.6399949115888566, + "ewc_loss": 0.08027956634759903, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004268190823495388, + "grad_norm": 9.38381290435791, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8799113035202026, + "num_tokens": 791904323.0, + "step": 20753 + }, + { + "epoch": 2.6401221218674467, + "ewc_loss": 0.08045392483472824, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004285626928322017, + "grad_norm": 9.443103790283203, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.857606053352356, + "num_tokens": 791951836.0, + "step": 20754 + }, + { + "epoch": 2.6402493321460376, + "ewc_loss": 0.08003178238868713, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042678264435380697, + "grad_norm": 9.421101570129395, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8683921694755554, + "num_tokens": 791989782.0, + "step": 20755 + }, + { + "epoch": 2.6403765424246277, + "ewc_loss": 0.0801522508263588, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042798733920790255, + "grad_norm": 9.477728843688965, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.865757405757904, + "num_tokens": 792024038.0, + "step": 20756 + }, + { + "epoch": 2.6405037527032187, + "ewc_loss": 0.07993276417255402, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042579250293783844, + "grad_norm": 9.363080024719238, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8651746511459351, + "num_tokens": 792067893.0, + "step": 20757 + }, + { + "epoch": 2.6406309629818088, + "ewc_loss": 0.08065535128116608, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004305769980419427, + "grad_norm": 9.435661315917969, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8733872771263123, + "num_tokens": 792108593.0, + "step": 20758 + }, + { + "epoch": 2.6407581732603993, + "ewc_loss": 0.0801909938454628, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004259333945810795, + "grad_norm": 9.42529582977295, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8772913813591003, + "num_tokens": 792141201.0, + "step": 20759 + }, + { + "epoch": 2.64088538353899, + "ewc_loss": 0.08048489689826965, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004288724157959223, + "grad_norm": 9.418893814086914, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8851006031036377, + "num_tokens": 792182908.0, + "step": 20760 + }, + { + "epoch": 2.6410125938175804, + "ewc_loss": 0.08006647974252701, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004271296493243426, + "grad_norm": 9.41699504852295, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8667468428611755, + "num_tokens": 792225223.0, + "step": 20761 + }, + { + "epoch": 2.641139804096171, + "ewc_loss": 0.08020716905593872, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004285365284886211, + "grad_norm": 9.5155029296875, + "learning_rate": 1e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8485639095306396, + "num_tokens": 792258131.0, + "step": 20762 + }, + { + "epoch": 2.6412670143747614, + "ewc_loss": 0.08002813160419464, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004243048024363816, + "grad_norm": 9.333362579345703, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8712908625602722, + "num_tokens": 792291429.0, + "step": 20763 + }, + { + "epoch": 2.641394224653352, + "ewc_loss": 0.080381378531456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004302786837797612, + "grad_norm": 9.468303680419922, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8600260019302368, + "num_tokens": 792327777.0, + "step": 20764 + }, + { + "epoch": 2.6415214349319425, + "ewc_loss": 0.07977107167243958, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042417552322149277, + "grad_norm": 9.334643363952637, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8679571747779846, + "num_tokens": 792367918.0, + "step": 20765 + }, + { + "epoch": 2.641648645210533, + "ewc_loss": 0.08054054528474808, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043187031405977905, + "grad_norm": 9.516822814941406, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8687168955802917, + "num_tokens": 792405550.0, + "step": 20766 + }, + { + "epoch": 2.6417758554891235, + "ewc_loss": 0.07987797260284424, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004252446233294904, + "grad_norm": 9.335290908813477, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.86854088306427, + "num_tokens": 792440948.0, + "step": 20767 + }, + { + "epoch": 2.641903065767714, + "ewc_loss": 0.08041056245565414, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004305704787839204, + "grad_norm": 9.446800231933594, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8717007040977478, + "num_tokens": 792488065.0, + "step": 20768 + }, + { + "epoch": 2.6420302760463046, + "ewc_loss": 0.08002496510744095, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042671451228670776, + "grad_norm": 9.373424530029297, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8718630075454712, + "num_tokens": 792522845.0, + "step": 20769 + }, + { + "epoch": 2.642157486324895, + "ewc_loss": 0.08046142756938934, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043107912642881274, + "grad_norm": 9.507303237915039, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8750371932983398, + "num_tokens": 792561118.0, + "step": 20770 + }, + { + "epoch": 2.6422846966034856, + "ewc_loss": 0.07996787130832672, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004261435242369771, + "grad_norm": 9.459589004516602, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8659749031066895, + "num_tokens": 792600125.0, + "step": 20771 + }, + { + "epoch": 2.642411906882076, + "ewc_loss": 0.08074361085891724, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042901813867501915, + "grad_norm": 9.455047607421875, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8694486021995544, + "num_tokens": 792633856.0, + "step": 20772 + }, + { + "epoch": 2.6425391171606667, + "ewc_loss": 0.08016777038574219, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042814252083189785, + "grad_norm": 9.39138126373291, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8611304759979248, + "num_tokens": 792672427.0, + "step": 20773 + }, + { + "epoch": 2.6426663274392572, + "ewc_loss": 0.08024097979068756, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004288746858946979, + "grad_norm": 9.390317916870117, + "learning_rate": 1e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8434514403343201, + "num_tokens": 792713856.0, + "step": 20774 + }, + { + "epoch": 2.6427935377178478, + "ewc_loss": 0.08035218715667725, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004299866850487888, + "grad_norm": 9.395331382751465, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8569164872169495, + "num_tokens": 792757602.0, + "step": 20775 + }, + { + "epoch": 2.6429207479964383, + "ewc_loss": 0.08039109408855438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004303757450543344, + "grad_norm": 9.448149681091309, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8544599413871765, + "num_tokens": 792797067.0, + "step": 20776 + }, + { + "epoch": 2.6430479582750284, + "ewc_loss": 0.0802324190735817, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042878903332166374, + "grad_norm": 9.418139457702637, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8685110807418823, + "num_tokens": 792835748.0, + "step": 20777 + }, + { + "epoch": 2.6431751685536193, + "ewc_loss": 0.08030077815055847, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042947265319526196, + "grad_norm": 9.38547420501709, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8676624894142151, + "num_tokens": 792875039.0, + "step": 20778 + }, + { + "epoch": 2.6433023788322094, + "ewc_loss": 0.08049587160348892, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004314235702622682, + "grad_norm": 9.434110641479492, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8716117739677429, + "num_tokens": 792916580.0, + "step": 20779 + }, + { + "epoch": 2.6434295891108004, + "ewc_loss": 0.080454982817173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004310146614443511, + "grad_norm": 9.445182800292969, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8657491207122803, + "num_tokens": 792952507.0, + "step": 20780 + }, + { + "epoch": 2.6435567993893905, + "ewc_loss": 0.08079862594604492, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043200975051149726, + "grad_norm": 9.44959831237793, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.867151141166687, + "num_tokens": 792994046.0, + "step": 20781 + }, + { + "epoch": 2.6436840096679814, + "ewc_loss": 0.08041940629482269, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004306589253246784, + "grad_norm": 9.412187576293945, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8845931887626648, + "num_tokens": 793033855.0, + "step": 20782 + }, + { + "epoch": 2.6438112199465715, + "ewc_loss": 0.08050627261400223, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004315275582484901, + "grad_norm": 9.467479705810547, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8690452575683594, + "num_tokens": 793072226.0, + "step": 20783 + }, + { + "epoch": 2.643938430225162, + "ewc_loss": 0.08039671182632446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043043200275860727, + "grad_norm": 9.461101531982422, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8681150674819946, + "num_tokens": 793110884.0, + "step": 20784 + }, + { + "epoch": 2.6440656405037526, + "ewc_loss": 0.08055399358272552, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004320048028603196, + "grad_norm": 9.544205665588379, + "learning_rate": 1e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8471633195877075, + "num_tokens": 793151885.0, + "step": 20785 + }, + { + "epoch": 2.644192850782343, + "ewc_loss": 0.08036765456199646, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004277000261936337, + "grad_norm": 9.366454124450684, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8431079387664795, + "num_tokens": 793187253.0, + "step": 20786 + }, + { + "epoch": 2.6443200610609336, + "ewc_loss": 0.08079974353313446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043446224299259484, + "grad_norm": 9.489404678344727, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8702273964881897, + "num_tokens": 793231249.0, + "step": 20787 + }, + { + "epoch": 2.644447271339524, + "ewc_loss": 0.08009381592273712, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004274030216038227, + "grad_norm": 9.338647842407227, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8772643208503723, + "num_tokens": 793269918.0, + "step": 20788 + }, + { + "epoch": 2.6445744816181147, + "ewc_loss": 0.08079557865858078, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344206245150417, + "grad_norm": 9.469450950622559, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8754680156707764, + "num_tokens": 793306856.0, + "step": 20789 + }, + { + "epoch": 2.6447016918967052, + "ewc_loss": 0.08027736842632294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004292384837754071, + "grad_norm": 9.398130416870117, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8544548153877258, + "num_tokens": 793347866.0, + "step": 20790 + }, + { + "epoch": 2.6448289021752958, + "ewc_loss": 0.08063400536775589, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004328048962634057, + "grad_norm": 9.44746208190918, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8634612560272217, + "num_tokens": 793391037.0, + "step": 20791 + }, + { + "epoch": 2.6449561124538863, + "ewc_loss": 0.08032593131065369, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042972411029040813, + "grad_norm": 9.466705322265625, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8517444133758545, + "num_tokens": 793423560.0, + "step": 20792 + }, + { + "epoch": 2.645083322732477, + "ewc_loss": 0.08040004223585129, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304652684368193, + "grad_norm": 9.398931503295898, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8711137771606445, + "num_tokens": 793460215.0, + "step": 20793 + }, + { + "epoch": 2.6452105330110673, + "ewc_loss": 0.08057937026023865, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004322585300542414, + "grad_norm": 9.527922630310059, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8513201475143433, + "num_tokens": 793492671.0, + "step": 20794 + }, + { + "epoch": 2.645337743289658, + "ewc_loss": 0.08009003102779388, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004273650993127376, + "grad_norm": 9.308238983154297, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8643099069595337, + "num_tokens": 793529526.0, + "step": 20795 + }, + { + "epoch": 2.6454649535682484, + "ewc_loss": 0.0810815691947937, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004372804833110422, + "grad_norm": 9.47370719909668, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8739936351776123, + "num_tokens": 793566748.0, + "step": 20796 + }, + { + "epoch": 2.645592163846839, + "ewc_loss": 0.0801251232624054, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277160333003849, + "grad_norm": 9.32159423828125, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.879400372505188, + "num_tokens": 793600703.0, + "step": 20797 + }, + { + "epoch": 2.6457193741254295, + "ewc_loss": 0.08094759285449982, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004359408048912883, + "grad_norm": 9.494412422180176, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8640899658203125, + "num_tokens": 793639226.0, + "step": 20798 + }, + { + "epoch": 2.64584658440402, + "ewc_loss": 0.0802135318517685, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004286001203581691, + "grad_norm": 9.347593307495117, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8790568709373474, + "num_tokens": 793676147.0, + "step": 20799 + }, + { + "epoch": 2.6459737946826105, + "ewc_loss": 0.08090329170227051, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043549772817641497, + "grad_norm": 9.5393648147583, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8764910101890564, + "num_tokens": 793715778.0, + "step": 20800 + }, + { + "epoch": 2.646101004961201, + "ewc_loss": 0.07999511063098907, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042641593609005213, + "grad_norm": 9.293728828430176, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8806295990943909, + "num_tokens": 793752096.0, + "step": 20801 + }, + { + "epoch": 2.646228215239791, + "ewc_loss": 0.081178218126297, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043824705062434077, + "grad_norm": 9.535642623901367, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8601570725440979, + "num_tokens": 793789809.0, + "step": 20802 + }, + { + "epoch": 2.646355425518382, + "ewc_loss": 0.07989515364170074, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042541639413684607, + "grad_norm": 9.271964073181152, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8616589307785034, + "num_tokens": 793826505.0, + "step": 20803 + }, + { + "epoch": 2.646482635796972, + "ewc_loss": 0.08134104311466217, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004398752353154123, + "grad_norm": 9.567510604858398, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8633719086647034, + "num_tokens": 793868360.0, + "step": 20804 + }, + { + "epoch": 2.646609846075563, + "ewc_loss": 0.07985548675060272, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042501967982389033, + "grad_norm": 9.294394493103027, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8653416633605957, + "num_tokens": 793906566.0, + "step": 20805 + }, + { + "epoch": 2.6467370563541532, + "ewc_loss": 0.08129256218671799, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004393904819153249, + "grad_norm": 9.536507606506348, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8609461784362793, + "num_tokens": 793947814.0, + "step": 20806 + }, + { + "epoch": 2.6468642666327438, + "ewc_loss": 0.08004014939069748, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042686634697020054, + "grad_norm": 9.317179679870605, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8701316118240356, + "num_tokens": 793987391.0, + "step": 20807 + }, + { + "epoch": 2.6469914769113343, + "ewc_loss": 0.08120613545179367, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043852621456608176, + "grad_norm": 9.55361270904541, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8760741949081421, + "num_tokens": 794024045.0, + "step": 20808 + }, + { + "epoch": 2.647118687189925, + "ewc_loss": 0.08009441196918488, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004274089587852359, + "grad_norm": 9.356070518493652, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8694463968276978, + "num_tokens": 794063405.0, + "step": 20809 + }, + { + "epoch": 2.6472458974685154, + "ewc_loss": 0.08129215240478516, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004369450034573674, + "grad_norm": 9.5770845413208, + "learning_rate": 1e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.840429961681366, + "num_tokens": 794099729.0, + "step": 20810 + }, + { + "epoch": 2.647373107747106, + "ewc_loss": 0.08009175956249237, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042738247429952025, + "grad_norm": 9.342329978942871, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8576341867446899, + "num_tokens": 794139669.0, + "step": 20811 + }, + { + "epoch": 2.6475003180256964, + "ewc_loss": 0.0809485986828804, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043595084571279585, + "grad_norm": 9.472990036010742, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8685120344161987, + "num_tokens": 794176906.0, + "step": 20812 + }, + { + "epoch": 2.647627528304287, + "ewc_loss": 0.08029846847057343, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042944957385770977, + "grad_norm": 9.432509422302246, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8696156740188599, + "num_tokens": 794216594.0, + "step": 20813 + }, + { + "epoch": 2.6477547385828775, + "ewc_loss": 0.08059850335121155, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004324499168433249, + "grad_norm": 9.497514724731445, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8694779872894287, + "num_tokens": 794248379.0, + "step": 20814 + }, + { + "epoch": 2.647881948861468, + "ewc_loss": 0.08026350289583206, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042909986223094165, + "grad_norm": 9.383112907409668, + "learning_rate": 1e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8408089876174927, + "num_tokens": 794293604.0, + "step": 20815 + }, + { + "epoch": 2.6480091591400585, + "ewc_loss": 0.08045598119497299, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004310246731620282, + "grad_norm": 9.443331718444824, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8569203615188599, + "num_tokens": 794332084.0, + "step": 20816 + }, + { + "epoch": 2.648136369418649, + "ewc_loss": 0.08050140738487244, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043147889664396644, + "grad_norm": 9.397192001342773, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8811554312705994, + "num_tokens": 794369158.0, + "step": 20817 + }, + { + "epoch": 2.6482635796972396, + "ewc_loss": 0.08088500797748566, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004328734939917922, + "grad_norm": 9.44561767578125, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.869284987449646, + "num_tokens": 794410531.0, + "step": 20818 + }, + { + "epoch": 2.64839078997583, + "ewc_loss": 0.08069945871829987, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043101797928102314, + "grad_norm": 9.444238662719727, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8661746382713318, + "num_tokens": 794457556.0, + "step": 20819 + }, + { + "epoch": 2.6485180002544206, + "ewc_loss": 0.08070213347673416, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043104475480504334, + "grad_norm": 9.423065185546875, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8543037176132202, + "num_tokens": 794492737.0, + "step": 20820 + }, + { + "epoch": 2.648645210533011, + "ewc_loss": 0.08086630702018738, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004326865018811077, + "grad_norm": 9.483362197875977, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8756200075149536, + "num_tokens": 794532589.0, + "step": 20821 + }, + { + "epoch": 2.6487724208116017, + "ewc_loss": 0.08023181557655334, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004287829506210983, + "grad_norm": 9.431624412536621, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.854814350605011, + "num_tokens": 794566982.0, + "step": 20822 + }, + { + "epoch": 2.648899631090192, + "ewc_loss": 0.08053430914878845, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043180794455111027, + "grad_norm": 9.520038604736328, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8491805791854858, + "num_tokens": 794604278.0, + "step": 20823 + }, + { + "epoch": 2.6490268413687827, + "ewc_loss": 0.08012671768665314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042773206951096654, + "grad_norm": 9.344825744628906, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8750535845756531, + "num_tokens": 794643785.0, + "step": 20824 + }, + { + "epoch": 2.649154051647373, + "ewc_loss": 0.0809885784983635, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043390924111008644, + "grad_norm": 9.501646995544434, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8741302490234375, + "num_tokens": 794678302.0, + "step": 20825 + }, + { + "epoch": 2.649281261925964, + "ewc_loss": 0.08001174032688141, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042658220627345145, + "grad_norm": 9.353324890136719, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8567891120910645, + "num_tokens": 794715732.0, + "step": 20826 + }, + { + "epoch": 2.649408472204554, + "ewc_loss": 0.08082370460033417, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043470191303640604, + "grad_norm": 9.479537963867188, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8705692291259766, + "num_tokens": 794754359.0, + "step": 20827 + }, + { + "epoch": 2.649535682483145, + "ewc_loss": 0.08063779026269913, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00042795995250344276, + "grad_norm": 9.40125560760498, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8627804517745972, + "num_tokens": 794795770.0, + "step": 20828 + }, + { + "epoch": 2.649662892761735, + "ewc_loss": 0.0808686763048172, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004327102506067604, + "grad_norm": 9.451882362365723, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8581815958023071, + "num_tokens": 794835654.0, + "step": 20829 + }, + { + "epoch": 2.649790103040326, + "ewc_loss": 0.08038601279258728, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043032498797401786, + "grad_norm": 9.352190971374512, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8691595196723938, + "num_tokens": 794872527.0, + "step": 20830 + }, + { + "epoch": 2.649917313318916, + "ewc_loss": 0.08057259023189545, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043219077633693814, + "grad_norm": 9.387510299682617, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8694735765457153, + "num_tokens": 794918577.0, + "step": 20831 + }, + { + "epoch": 2.6500445235975065, + "ewc_loss": 0.08056017011404037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004320665611885488, + "grad_norm": 9.445744514465332, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8763813972473145, + "num_tokens": 794953566.0, + "step": 20832 + }, + { + "epoch": 2.650171733876097, + "ewc_loss": 0.08064772188663483, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004305006586946547, + "grad_norm": 9.51797103881836, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8733525276184082, + "num_tokens": 794990547.0, + "step": 20833 + }, + { + "epoch": 2.6502989441546876, + "ewc_loss": 0.08025820553302765, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042904692236334085, + "grad_norm": 9.391036987304688, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8824544548988342, + "num_tokens": 795028419.0, + "step": 20834 + }, + { + "epoch": 2.650426154433278, + "ewc_loss": 0.08067180961370468, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043318295502103865, + "grad_norm": 9.506539344787598, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8816453814506531, + "num_tokens": 795064303.0, + "step": 20835 + }, + { + "epoch": 2.6505533647118686, + "ewc_loss": 0.08011512458324432, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000427616061642766, + "grad_norm": 9.369303703308105, + "learning_rate": 1e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8428490161895752, + "num_tokens": 795104280.0, + "step": 20836 + }, + { + "epoch": 2.650680574990459, + "ewc_loss": 0.08062013983726501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004326661874074489, + "grad_norm": 9.497453689575195, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.856789767742157, + "num_tokens": 795142547.0, + "step": 20837 + }, + { + "epoch": 2.6508077852690497, + "ewc_loss": 0.08005709946155548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042703578947111964, + "grad_norm": 9.383926391601562, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8630847334861755, + "num_tokens": 795186193.0, + "step": 20838 + }, + { + "epoch": 2.6509349955476402, + "ewc_loss": 0.08055208623409271, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004319856525398791, + "grad_norm": 9.47629165649414, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8828187584877014, + "num_tokens": 795215453.0, + "step": 20839 + }, + { + "epoch": 2.6510622058262308, + "ewc_loss": 0.08010082691907883, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004274731036275625, + "grad_norm": 9.371529579162598, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8620864748954773, + "num_tokens": 795252820.0, + "step": 20840 + }, + { + "epoch": 2.6511894161048213, + "ewc_loss": 0.08074693381786346, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043393418309278786, + "grad_norm": 9.654397010803223, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8640496730804443, + "num_tokens": 795290000.0, + "step": 20841 + }, + { + "epoch": 2.651316626383412, + "ewc_loss": 0.07968027889728546, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042326762923039496, + "grad_norm": 9.25991153717041, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8823736906051636, + "num_tokens": 795330389.0, + "step": 20842 + }, + { + "epoch": 2.6514438366620023, + "ewc_loss": 0.08108076453208923, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043727245065383613, + "grad_norm": 9.530317306518555, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8617620468139648, + "num_tokens": 795366693.0, + "step": 20843 + }, + { + "epoch": 2.651571046940593, + "ewc_loss": 0.07986316084861755, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004226549935992807, + "grad_norm": 9.345784187316895, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8687585592269897, + "num_tokens": 795405748.0, + "step": 20844 + }, + { + "epoch": 2.6516982572191834, + "ewc_loss": 0.08087336272001266, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043519848259165883, + "grad_norm": 9.536653518676758, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8766238689422607, + "num_tokens": 795446179.0, + "step": 20845 + }, + { + "epoch": 2.651825467497774, + "ewc_loss": 0.07979972660541534, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042446207953616977, + "grad_norm": 9.266823768615723, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8610668182373047, + "num_tokens": 795488234.0, + "step": 20846 + }, + { + "epoch": 2.6519526777763645, + "ewc_loss": 0.08123594522476196, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043638289207592607, + "grad_norm": 9.64439582824707, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8624404668807983, + "num_tokens": 795526508.0, + "step": 20847 + }, + { + "epoch": 2.652079888054955, + "ewc_loss": 0.07954393327236176, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042190420208498836, + "grad_norm": 9.257157325744629, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8728532791137695, + "num_tokens": 795566299.0, + "step": 20848 + }, + { + "epoch": 2.6522070983335455, + "ewc_loss": 0.08274783194065094, + "ewc_loss_diag": 3.886222839355469e-05, + "ewc_loss_parallel": 0.00043929475941695273, + "grad_norm": 54.10542297363281, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8608270287513733, + "num_tokens": 795603892.0, + "step": 20849 + }, + { + "epoch": 2.6523343086121356, + "ewc_loss": 0.14103958010673523, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0010319778230041265, + "grad_norm": 15.45719051361084, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8748422265052795, + "num_tokens": 795639622.0, + "step": 20850 + }, + { + "epoch": 2.6524615188907266, + "ewc_loss": 0.07868783175945282, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00041090179001912475, + "grad_norm": 8.141704559326172, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8878356218338013, + "num_tokens": 795677299.0, + "step": 20851 + }, + { + "epoch": 2.6525887291693167, + "ewc_loss": 0.12080869823694229, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0008345518144778907, + "grad_norm": 14.494680404663086, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8649918437004089, + "num_tokens": 795714304.0, + "step": 20852 + }, + { + "epoch": 2.6527159394479076, + "ewc_loss": 0.12716320157051086, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000895655364729464, + "grad_norm": 14.353774070739746, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8609801530838013, + "num_tokens": 795752330.0, + "step": 20853 + }, + { + "epoch": 2.6528431497264977, + "ewc_loss": 0.09313556551933289, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0005553790833801031, + "grad_norm": 9.966201782226562, + "learning_rate": 1e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.844226598739624, + "num_tokens": 795789503.0, + "step": 20854 + }, + { + "epoch": 2.6529703600050887, + "ewc_loss": 0.1006292924284935, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0006303163827396929, + "grad_norm": 12.315954208374023, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8637278079986572, + "num_tokens": 795828449.0, + "step": 20855 + }, + { + "epoch": 2.6530975702836788, + "ewc_loss": 0.10443097352981567, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0006707745487801731, + "grad_norm": 11.89125919342041, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8664547204971313, + "num_tokens": 795868454.0, + "step": 20856 + }, + { + "epoch": 2.6532247805622693, + "ewc_loss": 0.08991210162639618, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005255858995951712, + "grad_norm": 10.27587604522705, + "learning_rate": 1e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.8525320291519165, + "num_tokens": 795904629.0, + "step": 20857 + }, + { + "epoch": 2.65335199084086, + "ewc_loss": 0.09286925196647644, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005551573703996837, + "grad_norm": 11.12113094329834, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8531017303466797, + "num_tokens": 795943197.0, + "step": 20858 + }, + { + "epoch": 2.6534792011194503, + "ewc_loss": 0.09112167358398438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005376815679483116, + "grad_norm": 10.422749519348145, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8623994588851929, + "num_tokens": 795984672.0, + "step": 20859 + }, + { + "epoch": 2.653606411398041, + "ewc_loss": 0.08803616464138031, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005068264435976744, + "grad_norm": 10.38941478729248, + "learning_rate": 1e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8469498753547668, + "num_tokens": 796022599.0, + "step": 20860 + }, + { + "epoch": 2.6537336216766314, + "ewc_loss": 0.08728286623954773, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000499293499160558, + "grad_norm": 10.260656356811523, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.864869236946106, + "num_tokens": 796057798.0, + "step": 20861 + }, + { + "epoch": 2.653860831955222, + "ewc_loss": 0.08621122688055038, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004885771195404232, + "grad_norm": 10.119989395141602, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8643680810928345, + "num_tokens": 796094928.0, + "step": 20862 + }, + { + "epoch": 2.6539880422338125, + "ewc_loss": 0.08443168550729752, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00047078169882297516, + "grad_norm": 9.876032829284668, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8667596578598022, + "num_tokens": 796132614.0, + "step": 20863 + }, + { + "epoch": 2.654115252512403, + "ewc_loss": 0.08502476662397385, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00047427110257558525, + "grad_norm": 9.974586486816406, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8634675741195679, + "num_tokens": 796171653.0, + "step": 20864 + }, + { + "epoch": 2.6542424627909935, + "ewc_loss": 0.08317360281944275, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004582008405122906, + "grad_norm": 9.75112247467041, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8587286472320557, + "num_tokens": 796215218.0, + "step": 20865 + }, + { + "epoch": 2.654369673069584, + "ewc_loss": 0.08346137404441833, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046107859816402197, + "grad_norm": 9.83549976348877, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8784065246582031, + "num_tokens": 796249165.0, + "step": 20866 + }, + { + "epoch": 2.6544968833481746, + "ewc_loss": 0.08215721696615219, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044803699711337686, + "grad_norm": 9.674025535583496, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8610649704933167, + "num_tokens": 796283878.0, + "step": 20867 + }, + { + "epoch": 2.654624093626765, + "ewc_loss": 0.0824713259935379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045117808622308075, + "grad_norm": 9.688568115234375, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8527792692184448, + "num_tokens": 796330760.0, + "step": 20868 + }, + { + "epoch": 2.6547513039053556, + "ewc_loss": 0.0819072276353836, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004455371235962957, + "grad_norm": 9.678750038146973, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8529130220413208, + "num_tokens": 796371547.0, + "step": 20869 + }, + { + "epoch": 2.654878514183946, + "ewc_loss": 0.08153484016656876, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004418132302816957, + "grad_norm": 9.517006874084473, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8499864935874939, + "num_tokens": 796409893.0, + "step": 20870 + }, + { + "epoch": 2.6550057244625367, + "ewc_loss": 0.08181868493556976, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044465166865848005, + "grad_norm": 9.638006210327148, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8505381345748901, + "num_tokens": 796442493.0, + "step": 20871 + }, + { + "epoch": 2.655132934741127, + "ewc_loss": 0.08101428300142288, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004366076609585434, + "grad_norm": 9.501535415649414, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8671991229057312, + "num_tokens": 796480853.0, + "step": 20872 + }, + { + "epoch": 2.6552601450197177, + "ewc_loss": 0.08146499842405319, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004411148256622255, + "grad_norm": 9.634404182434082, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8700435161590576, + "num_tokens": 796520057.0, + "step": 20873 + }, + { + "epoch": 2.6553873552983083, + "ewc_loss": 0.08075153827667236, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004339802253525704, + "grad_norm": 9.487757682800293, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8676332831382751, + "num_tokens": 796549779.0, + "step": 20874 + }, + { + "epoch": 2.6555145655768984, + "ewc_loss": 0.08102722465991974, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004367370856925845, + "grad_norm": 9.55303955078125, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8694854974746704, + "num_tokens": 796590923.0, + "step": 20875 + }, + { + "epoch": 2.6556417758554893, + "ewc_loss": 0.08064395189285278, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043290440225973725, + "grad_norm": 9.453492164611816, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.861966609954834, + "num_tokens": 796629455.0, + "step": 20876 + }, + { + "epoch": 2.6557689861340794, + "ewc_loss": 0.08090290427207947, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004354938573669642, + "grad_norm": 9.463887214660645, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8611106276512146, + "num_tokens": 796669950.0, + "step": 20877 + }, + { + "epoch": 2.6558961964126704, + "ewc_loss": 0.08073883503675461, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043385318713262677, + "grad_norm": 9.417061805725098, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8537921905517578, + "num_tokens": 796712263.0, + "step": 20878 + }, + { + "epoch": 2.6560234066912605, + "ewc_loss": 0.08100473880767822, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004365122294984758, + "grad_norm": 9.497390747070312, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.864639401435852, + "num_tokens": 796753025.0, + "step": 20879 + }, + { + "epoch": 2.6561506169698514, + "ewc_loss": 0.08068221807479858, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004332870012149215, + "grad_norm": 9.425419807434082, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8777139186859131, + "num_tokens": 796788810.0, + "step": 20880 + }, + { + "epoch": 2.6562778272484415, + "ewc_loss": 0.08099396526813507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004364044580142945, + "grad_norm": 9.499858856201172, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8672959804534912, + "num_tokens": 796826275.0, + "step": 20881 + }, + { + "epoch": 2.656405037527032, + "ewc_loss": 0.08064103126525879, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043287515291012824, + "grad_norm": 9.396629333496094, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8602981567382812, + "num_tokens": 796859587.0, + "step": 20882 + }, + { + "epoch": 2.6565322478056226, + "ewc_loss": 0.08106692880392075, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004371341201476753, + "grad_norm": 9.484454154968262, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8714728355407715, + "num_tokens": 796899440.0, + "step": 20883 + }, + { + "epoch": 2.656659458084213, + "ewc_loss": 0.08062189072370529, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043268376612104475, + "grad_norm": 9.408727645874023, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8548500537872314, + "num_tokens": 796938703.0, + "step": 20884 + }, + { + "epoch": 2.6567866683628036, + "ewc_loss": 0.08092983067035675, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004357631260063499, + "grad_norm": 9.445684432983398, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8512079119682312, + "num_tokens": 796982287.0, + "step": 20885 + }, + { + "epoch": 2.656913878641394, + "ewc_loss": 0.08076225966215134, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000434087443863973, + "grad_norm": 9.493450164794922, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8673243522644043, + "num_tokens": 797017640.0, + "step": 20886 + }, + { + "epoch": 2.6570410889199847, + "ewc_loss": 0.08105672895908356, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004345906781964004, + "grad_norm": 9.395269393920898, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8484822511672974, + "num_tokens": 797054050.0, + "step": 20887 + }, + { + "epoch": 2.6571682991985752, + "ewc_loss": 0.08092953264713287, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043576015741564333, + "grad_norm": 9.434483528137207, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8693922758102417, + "num_tokens": 797092429.0, + "step": 20888 + }, + { + "epoch": 2.6572955094771658, + "ewc_loss": 0.08063677698373795, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004328326031100005, + "grad_norm": 9.460247993469238, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.862256646156311, + "num_tokens": 797125646.0, + "step": 20889 + }, + { + "epoch": 2.6574227197557563, + "ewc_loss": 0.0808330625295639, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004347954236436635, + "grad_norm": 9.429816246032715, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.869644820690155, + "num_tokens": 797164560.0, + "step": 20890 + }, + { + "epoch": 2.657549930034347, + "ewc_loss": 0.08076789975166321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043414384708739817, + "grad_norm": 9.457119941711426, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8638392686843872, + "num_tokens": 797194168.0, + "step": 20891 + }, + { + "epoch": 2.6576771403129373, + "ewc_loss": 0.08060336112976074, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043249846203252673, + "grad_norm": 9.36196517944336, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8848305940628052, + "num_tokens": 797227045.0, + "step": 20892 + }, + { + "epoch": 2.657804350591528, + "ewc_loss": 0.08107084035873413, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043717329390347004, + "grad_norm": 9.523125648498535, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8802343606948853, + "num_tokens": 797267607.0, + "step": 20893 + }, + { + "epoch": 2.6579315608701184, + "ewc_loss": 0.08034800738096237, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004299449210520834, + "grad_norm": 9.349267959594727, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8554688692092896, + "num_tokens": 797309320.0, + "step": 20894 + }, + { + "epoch": 2.658058771148709, + "ewc_loss": 0.08110791444778442, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043754398939199746, + "grad_norm": 9.530959129333496, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8620163798332214, + "num_tokens": 797348258.0, + "step": 20895 + }, + { + "epoch": 2.6581859814272994, + "ewc_loss": 0.08056055754423141, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042962899897247553, + "grad_norm": 9.39772891998291, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8647583723068237, + "num_tokens": 797391563.0, + "step": 20896 + }, + { + "epoch": 2.65831319170589, + "ewc_loss": 0.08092181384563446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004356829740572721, + "grad_norm": 9.450114250183105, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8732905387878418, + "num_tokens": 797428314.0, + "step": 20897 + }, + { + "epoch": 2.6584404019844805, + "ewc_loss": 0.08052684366703033, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004317333223298192, + "grad_norm": 9.43869400024414, + "learning_rate": 1e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8438804149627686, + "num_tokens": 797464480.0, + "step": 20898 + }, + { + "epoch": 2.658567612263071, + "ewc_loss": 0.08067937195301056, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043325850856490433, + "grad_norm": 9.453064918518066, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8659322261810303, + "num_tokens": 797504135.0, + "step": 20899 + }, + { + "epoch": 2.658694822541661, + "ewc_loss": 0.08053664863109589, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043183137313462794, + "grad_norm": 9.446919441223145, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8673737049102783, + "num_tokens": 797544222.0, + "step": 20900 + }, + { + "epoch": 2.658822032820252, + "ewc_loss": 0.08068260550498962, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043329093023203313, + "grad_norm": 9.514361381530762, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8703719973564148, + "num_tokens": 797583695.0, + "step": 20901 + }, + { + "epoch": 2.658949243098842, + "ewc_loss": 0.08032586425542831, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042972349910996854, + "grad_norm": 9.40419864654541, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8502712249755859, + "num_tokens": 797625660.0, + "step": 20902 + }, + { + "epoch": 2.659076453377433, + "ewc_loss": 0.08073214441537857, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004337863065302372, + "grad_norm": 9.470730781555176, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8636034727096558, + "num_tokens": 797665807.0, + "step": 20903 + }, + { + "epoch": 2.6592036636560232, + "ewc_loss": 0.08043703436851501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043083514901809394, + "grad_norm": 9.448152542114258, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8461893796920776, + "num_tokens": 797707935.0, + "step": 20904 + }, + { + "epoch": 2.6593308739346138, + "ewc_loss": 0.08083979785442352, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043242136598564684, + "grad_norm": 9.461203575134277, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8567270040512085, + "num_tokens": 797745502.0, + "step": 20905 + }, + { + "epoch": 2.6594580842132043, + "ewc_loss": 0.08062978088855743, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004327626957092434, + "grad_norm": 9.517592430114746, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8638412952423096, + "num_tokens": 797779412.0, + "step": 20906 + }, + { + "epoch": 2.659585294491795, + "ewc_loss": 0.08045872300863266, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000431052059866488, + "grad_norm": 9.437955856323242, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.86913001537323, + "num_tokens": 797816677.0, + "step": 20907 + }, + { + "epoch": 2.6597125047703853, + "ewc_loss": 0.08054867386817932, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004319515428505838, + "grad_norm": 9.520431518554688, + "learning_rate": 1e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8463432788848877, + "num_tokens": 797854260.0, + "step": 20908 + }, + { + "epoch": 2.659839715048976, + "ewc_loss": 0.08021098375320435, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004285746836103499, + "grad_norm": 9.388503074645996, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8521544933319092, + "num_tokens": 797892349.0, + "step": 20909 + }, + { + "epoch": 2.6599669253275664, + "ewc_loss": 0.0805981382727623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004324462206568569, + "grad_norm": 9.486480712890625, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8675387501716614, + "num_tokens": 797928028.0, + "step": 20910 + }, + { + "epoch": 2.660094135606157, + "ewc_loss": 0.08017051964998245, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004281700530555099, + "grad_norm": 9.429973602294922, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8503828644752502, + "num_tokens": 797963885.0, + "step": 20911 + }, + { + "epoch": 2.6602213458847475, + "ewc_loss": 0.08066752552986145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004331400559749454, + "grad_norm": 9.474574089050293, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8503198623657227, + "num_tokens": 798009842.0, + "step": 20912 + }, + { + "epoch": 2.660348556163338, + "ewc_loss": 0.08027702569961548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004292351077310741, + "grad_norm": 9.429062843322754, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8763752579689026, + "num_tokens": 798043207.0, + "step": 20913 + }, + { + "epoch": 2.6604757664419285, + "ewc_loss": 0.08045418560504913, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004310066578909755, + "grad_norm": 9.519784927368164, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8792170286178589, + "num_tokens": 798080887.0, + "step": 20914 + }, + { + "epoch": 2.660602976720519, + "ewc_loss": 0.08019651472568512, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000428429979365319, + "grad_norm": 9.35842514038086, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8733928203582764, + "num_tokens": 798126393.0, + "step": 20915 + }, + { + "epoch": 2.6607301869991096, + "ewc_loss": 0.08074353635311127, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004339002480264753, + "grad_norm": 9.509546279907227, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8806581497192383, + "num_tokens": 798166099.0, + "step": 20916 + }, + { + "epoch": 2.6608573972777, + "ewc_loss": 0.08004892617464066, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004269540950190276, + "grad_norm": 9.40146541595459, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8598741888999939, + "num_tokens": 798202285.0, + "step": 20917 + }, + { + "epoch": 2.6609846075562906, + "ewc_loss": 0.08084285259246826, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043489341624081135, + "grad_norm": 9.476160049438477, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8728625178337097, + "num_tokens": 798243463.0, + "step": 20918 + }, + { + "epoch": 2.661111817834881, + "ewc_loss": 0.08023510873317719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004288158961571753, + "grad_norm": 9.386893272399902, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.866305947303772, + "num_tokens": 798275507.0, + "step": 20919 + }, + { + "epoch": 2.6612390281134717, + "ewc_loss": 0.08073084056377411, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004337732680141926, + "grad_norm": 9.43362808227539, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8837917447090149, + "num_tokens": 798310536.0, + "step": 20920 + }, + { + "epoch": 2.661366238392062, + "ewc_loss": 0.08045358955860138, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043100069160573184, + "grad_norm": 9.457863807678223, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8607900142669678, + "num_tokens": 798347874.0, + "step": 20921 + }, + { + "epoch": 2.6614934486706527, + "ewc_loss": 0.08048452436923981, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043131012353114784, + "grad_norm": 9.394876480102539, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.88833087682724, + "num_tokens": 798381865.0, + "step": 20922 + }, + { + "epoch": 2.661620658949243, + "ewc_loss": 0.08066800236701965, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004331448581069708, + "grad_norm": 9.403426170349121, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.863349974155426, + "num_tokens": 798423378.0, + "step": 20923 + }, + { + "epoch": 2.661747869227834, + "ewc_loss": 0.0805412232875824, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004318771243561059, + "grad_norm": 9.393686294555664, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8720263242721558, + "num_tokens": 798460278.0, + "step": 20924 + }, + { + "epoch": 2.661875079506424, + "ewc_loss": 0.08059054613113403, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004323702887631953, + "grad_norm": 9.365185737609863, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8651975393295288, + "num_tokens": 798496622.0, + "step": 20925 + }, + { + "epoch": 2.662002289785015, + "ewc_loss": 0.08080059289932251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344707413110882, + "grad_norm": 9.457589149475098, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8590259552001953, + "num_tokens": 798533061.0, + "step": 20926 + }, + { + "epoch": 2.662129500063605, + "ewc_loss": 0.08050767332315445, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043154158629477024, + "grad_norm": 9.421651840209961, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8576761484146118, + "num_tokens": 798573689.0, + "step": 20927 + }, + { + "epoch": 2.662256710342196, + "ewc_loss": 0.08091852813959122, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043320871191099286, + "grad_norm": 9.404762268066406, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.873460054397583, + "num_tokens": 798614058.0, + "step": 20928 + }, + { + "epoch": 2.662383920620786, + "ewc_loss": 0.08075834810733795, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004340482992120087, + "grad_norm": 9.442614555358887, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8551340103149414, + "num_tokens": 798657769.0, + "step": 20929 + }, + { + "epoch": 2.6625111308993765, + "ewc_loss": 0.08056613802909851, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043212619493715465, + "grad_norm": 9.402758598327637, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8685304522514343, + "num_tokens": 798701084.0, + "step": 20930 + }, + { + "epoch": 2.662638341177967, + "ewc_loss": 0.08080954104661942, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043456023558974266, + "grad_norm": 9.417557716369629, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8706697225570679, + "num_tokens": 798738029.0, + "step": 20931 + }, + { + "epoch": 2.6627655514565576, + "ewc_loss": 0.08077436685562134, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004342085449025035, + "grad_norm": 9.439123153686523, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8607041835784912, + "num_tokens": 798773537.0, + "step": 20932 + }, + { + "epoch": 2.662892761735148, + "ewc_loss": 0.0807347223162651, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004338120634201914, + "grad_norm": 9.442797660827637, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8628365397453308, + "num_tokens": 798810397.0, + "step": 20933 + }, + { + "epoch": 2.6630199720137386, + "ewc_loss": 0.08104772865772247, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004345007473602891, + "grad_norm": 9.472801208496094, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8610560894012451, + "num_tokens": 798850796.0, + "step": 20934 + }, + { + "epoch": 2.663147182292329, + "ewc_loss": 0.08076949417591095, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043415979598648846, + "grad_norm": 9.454507827758789, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8646660447120667, + "num_tokens": 798892379.0, + "step": 20935 + }, + { + "epoch": 2.6632743925709197, + "ewc_loss": 0.08066324889659882, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004330973024480045, + "grad_norm": 9.481022834777832, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8521011471748352, + "num_tokens": 798929444.0, + "step": 20936 + }, + { + "epoch": 2.66340160284951, + "ewc_loss": 0.08053267747163773, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043179161730222404, + "grad_norm": 9.426207542419434, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8699367642402649, + "num_tokens": 798964086.0, + "step": 20937 + }, + { + "epoch": 2.6635288131281007, + "ewc_loss": 0.0808771401643753, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043523620115593076, + "grad_norm": 9.463157653808594, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8724930286407471, + "num_tokens": 799007408.0, + "step": 20938 + }, + { + "epoch": 2.6636560234066913, + "ewc_loss": 0.08055395632982254, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004320043954066932, + "grad_norm": 9.419325828552246, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8693602085113525, + "num_tokens": 799045930.0, + "step": 20939 + }, + { + "epoch": 2.663783233685282, + "ewc_loss": 0.08087936043739319, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043525840737856925, + "grad_norm": 9.546781539916992, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8756837844848633, + "num_tokens": 799083568.0, + "step": 20940 + }, + { + "epoch": 2.6639104439638723, + "ewc_loss": 0.0803380161523819, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042984497849829495, + "grad_norm": 9.433854103088379, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8619874715805054, + "num_tokens": 799123681.0, + "step": 20941 + }, + { + "epoch": 2.664037654242463, + "ewc_loss": 0.08066119253635406, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004330767260398716, + "grad_norm": 9.55162239074707, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8610367178916931, + "num_tokens": 799164623.0, + "step": 20942 + }, + { + "epoch": 2.6641648645210534, + "ewc_loss": 0.08034701645374298, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042993496754206717, + "grad_norm": 9.426612854003906, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8637864589691162, + "num_tokens": 799200408.0, + "step": 20943 + }, + { + "epoch": 2.664292074799644, + "ewc_loss": 0.08080579340457916, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004345227498561144, + "grad_norm": 9.515984535217285, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8544245958328247, + "num_tokens": 799245106.0, + "step": 20944 + }, + { + "epoch": 2.6644192850782344, + "ewc_loss": 0.08022958040237427, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004287605988793075, + "grad_norm": 9.400449752807617, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8726210594177246, + "num_tokens": 799290429.0, + "step": 20945 + }, + { + "epoch": 2.664546495356825, + "ewc_loss": 0.08093737065792084, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043583859223872423, + "grad_norm": 9.538918495178223, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8627636432647705, + "num_tokens": 799322951.0, + "step": 20946 + }, + { + "epoch": 2.6646737056354155, + "ewc_loss": 0.08034040033817291, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004298689018469304, + "grad_norm": 9.414215087890625, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8619369268417358, + "num_tokens": 799365498.0, + "step": 20947 + }, + { + "epoch": 2.6648009159140056, + "ewc_loss": 0.08084580302238464, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043492286931723356, + "grad_norm": 9.519950866699219, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8596748113632202, + "num_tokens": 799405894.0, + "step": 20948 + }, + { + "epoch": 2.6649281261925966, + "ewc_loss": 0.08023253083229065, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042879011016339064, + "grad_norm": 9.44771671295166, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.863991916179657, + "num_tokens": 799439750.0, + "step": 20949 + }, + { + "epoch": 2.6650553364711866, + "ewc_loss": 0.08058103919029236, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004322752356529236, + "grad_norm": 9.490570068359375, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8565212488174438, + "num_tokens": 799475170.0, + "step": 20950 + }, + { + "epoch": 2.6651825467497776, + "ewc_loss": 0.0804339349269867, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430804182542488, + "grad_norm": 9.397358894348145, + "learning_rate": 1e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8512078523635864, + "num_tokens": 799514231.0, + "step": 20951 + }, + { + "epoch": 2.6653097570283677, + "ewc_loss": 0.08080384135246277, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004345033084973693, + "grad_norm": 9.585477828979492, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8645614385604858, + "num_tokens": 799549512.0, + "step": 20952 + }, + { + "epoch": 2.6654369673069587, + "ewc_loss": 0.0805940181016922, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004275222308933735, + "grad_norm": 9.307684898376465, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8599516749382019, + "num_tokens": 799592144.0, + "step": 20953 + }, + { + "epoch": 2.6655641775855488, + "ewc_loss": 0.08113481104373932, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043781293788924813, + "grad_norm": 9.515070915222168, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8501225113868713, + "num_tokens": 799634779.0, + "step": 20954 + }, + { + "epoch": 2.6656913878641393, + "ewc_loss": 0.0802035853266716, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004285007016733289, + "grad_norm": 9.354593276977539, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.87647944688797, + "num_tokens": 799677548.0, + "step": 20955 + }, + { + "epoch": 2.66581859814273, + "ewc_loss": 0.08109115064144135, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004373763222247362, + "grad_norm": 9.531189918518066, + "learning_rate": 1e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8472472429275513, + "num_tokens": 799715679.0, + "step": 20956 + }, + { + "epoch": 2.6659458084213203, + "ewc_loss": 0.0804150402545929, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004306152695789933, + "grad_norm": 9.393348693847656, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8564586639404297, + "num_tokens": 799755701.0, + "step": 20957 + }, + { + "epoch": 2.666073018699911, + "ewc_loss": 0.08089000731706619, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043536489829421043, + "grad_norm": 9.489572525024414, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8593021035194397, + "num_tokens": 799795151.0, + "step": 20958 + }, + { + "epoch": 2.6662002289785014, + "ewc_loss": 0.0806233137845993, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004326979396864772, + "grad_norm": 9.42003345489502, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8702797889709473, + "num_tokens": 799832249.0, + "step": 20959 + }, + { + "epoch": 2.666327439257092, + "ewc_loss": 0.0808013454079628, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043447830830700696, + "grad_norm": 9.50434684753418, + "learning_rate": 1e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8418521881103516, + "num_tokens": 799867762.0, + "step": 20960 + }, + { + "epoch": 2.6664546495356825, + "ewc_loss": 0.0805821642279625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004322864697314799, + "grad_norm": 9.380168914794922, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8617685437202454, + "num_tokens": 799909012.0, + "step": 20961 + }, + { + "epoch": 2.666581859814273, + "ewc_loss": 0.08098866045475006, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004363514599390328, + "grad_norm": 9.53203296661377, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8603149652481079, + "num_tokens": 799950316.0, + "step": 20962 + }, + { + "epoch": 2.6667090700928635, + "ewc_loss": 0.08025675266981125, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290323704481125, + "grad_norm": 9.338434219360352, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8793025612831116, + "num_tokens": 799990037.0, + "step": 20963 + }, + { + "epoch": 2.666836280371454, + "ewc_loss": 0.08102138340473175, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043667867430485785, + "grad_norm": 9.599417686462402, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8745052814483643, + "num_tokens": 800023935.0, + "step": 20964 + }, + { + "epoch": 2.6669634906500446, + "ewc_loss": 0.0800817608833313, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042728244443424046, + "grad_norm": 9.260672569274902, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8715871572494507, + "num_tokens": 800066357.0, + "step": 20965 + }, + { + "epoch": 2.667090700928635, + "ewc_loss": 0.08147040009498596, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044116887147538364, + "grad_norm": 9.587433815002441, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8492937684059143, + "num_tokens": 800100667.0, + "step": 20966 + }, + { + "epoch": 2.6672179112072256, + "ewc_loss": 0.08001512289047241, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042661611223593354, + "grad_norm": 9.344953536987305, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8684393167495728, + "num_tokens": 800135101.0, + "step": 20967 + }, + { + "epoch": 2.667345121485816, + "ewc_loss": 0.08140558004379272, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004405206418596208, + "grad_norm": 9.548312187194824, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8794804811477661, + "num_tokens": 800176308.0, + "step": 20968 + }, + { + "epoch": 2.6674723317644067, + "ewc_loss": 0.08027514815330505, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004292163357604295, + "grad_norm": 9.467281341552734, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8756389021873474, + "num_tokens": 800205585.0, + "step": 20969 + }, + { + "epoch": 2.667599542042997, + "ewc_loss": 0.08093836903572083, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004358484875410795, + "grad_norm": 9.535806655883789, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8634471297264099, + "num_tokens": 800241400.0, + "step": 20970 + }, + { + "epoch": 2.6677267523215877, + "ewc_loss": 0.08028367161750793, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042930160998366773, + "grad_norm": 9.415294647216797, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8720187544822693, + "num_tokens": 800279060.0, + "step": 20971 + }, + { + "epoch": 2.6678539626001783, + "ewc_loss": 0.08085616677999496, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043502650805749, + "grad_norm": 9.52418041229248, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8733294010162354, + "num_tokens": 800318606.0, + "step": 20972 + }, + { + "epoch": 2.6679811728787683, + "ewc_loss": 0.08080370724201202, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004296190745662898, + "grad_norm": 9.451082229614258, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8645599484443665, + "num_tokens": 800356795.0, + "step": 20973 + }, + { + "epoch": 2.6681083831573593, + "ewc_loss": 0.08077284693717957, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004341932653915137, + "grad_norm": 9.535924911499023, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8735386729240417, + "num_tokens": 800393786.0, + "step": 20974 + }, + { + "epoch": 2.6682355934359494, + "ewc_loss": 0.08030912280082703, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042955606477335095, + "grad_norm": 9.327629089355469, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8617729544639587, + "num_tokens": 800440153.0, + "step": 20975 + }, + { + "epoch": 2.6683628037145404, + "ewc_loss": 0.08114303648471832, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004378952144179493, + "grad_norm": 9.560319900512695, + "learning_rate": 1e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8464276194572449, + "num_tokens": 800476343.0, + "step": 20976 + }, + { + "epoch": 2.6684900139931305, + "ewc_loss": 0.08002415299415588, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004267064214218408, + "grad_norm": 9.30319881439209, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8776766061782837, + "num_tokens": 800514227.0, + "step": 20977 + }, + { + "epoch": 2.668617224271721, + "ewc_loss": 0.08128742128610611, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004393390554469079, + "grad_norm": 9.581731796264648, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8630989789962769, + "num_tokens": 800552357.0, + "step": 20978 + }, + { + "epoch": 2.6687444345503115, + "ewc_loss": 0.08012548089027405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004277196421753615, + "grad_norm": 9.422842979431152, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8736419081687927, + "num_tokens": 800587624.0, + "step": 20979 + }, + { + "epoch": 2.668871644828902, + "ewc_loss": 0.08086726069450378, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043513745185919106, + "grad_norm": 9.457536697387695, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8581056594848633, + "num_tokens": 800631183.0, + "step": 20980 + }, + { + "epoch": 2.6689988551074926, + "ewc_loss": 0.08049291372299194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043139394256286323, + "grad_norm": 9.384900093078613, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8683326244354248, + "num_tokens": 800670334.0, + "step": 20981 + }, + { + "epoch": 2.669126065386083, + "ewc_loss": 0.08080458641052246, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004345107590779662, + "grad_norm": 9.508668899536133, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.861167311668396, + "num_tokens": 800703500.0, + "step": 20982 + }, + { + "epoch": 2.6692532756646736, + "ewc_loss": 0.08024394512176514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042890425538644195, + "grad_norm": 9.38257884979248, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8566349744796753, + "num_tokens": 800742904.0, + "step": 20983 + }, + { + "epoch": 2.669380485943264, + "ewc_loss": 0.08075831830501556, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004340480372775346, + "grad_norm": 9.590178489685059, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8612837791442871, + "num_tokens": 800777556.0, + "step": 20984 + }, + { + "epoch": 2.6695076962218547, + "ewc_loss": 0.08016419410705566, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042810681043192744, + "grad_norm": 9.373181343078613, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8557918071746826, + "num_tokens": 800817532.0, + "step": 20985 + }, + { + "epoch": 2.669634906500445, + "ewc_loss": 0.08102954924106598, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043676033965311944, + "grad_norm": 9.514315605163574, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8744675517082214, + "num_tokens": 800856567.0, + "step": 20986 + }, + { + "epoch": 2.6697621167790357, + "ewc_loss": 0.08034884929656982, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042995333205908537, + "grad_norm": 9.588082313537598, + "learning_rate": 1e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8420355319976807, + "num_tokens": 800901234.0, + "step": 20987 + }, + { + "epoch": 2.6698893270576263, + "ewc_loss": 0.08041761815547943, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430641055572778, + "grad_norm": 9.394794464111328, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8608410358428955, + "num_tokens": 800942647.0, + "step": 20988 + }, + { + "epoch": 2.670016537336217, + "ewc_loss": 0.081128790974617, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004353113181423396, + "grad_norm": 9.39916706085205, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8717972040176392, + "num_tokens": 800980194.0, + "step": 20989 + }, + { + "epoch": 2.6701437476148073, + "ewc_loss": 0.08082295954227448, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000432253087637946, + "grad_norm": 9.486207008361816, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8694425821304321, + "num_tokens": 801016978.0, + "step": 20990 + }, + { + "epoch": 2.670270957893398, + "ewc_loss": 0.0808294415473938, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004323178145568818, + "grad_norm": 9.426383018493652, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.849976122379303, + "num_tokens": 801051921.0, + "step": 20991 + }, + { + "epoch": 2.6703981681719884, + "ewc_loss": 0.08101020008325577, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043412542436271906, + "grad_norm": 9.528420448303223, + "learning_rate": 1e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8386977910995483, + "num_tokens": 801087189.0, + "step": 20992 + }, + { + "epoch": 2.670525378450579, + "ewc_loss": 0.08070848882198334, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043110837577842176, + "grad_norm": 9.45050048828125, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8710767030715942, + "num_tokens": 801121574.0, + "step": 20993 + }, + { + "epoch": 2.6706525887291694, + "ewc_loss": 0.08131587505340576, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004347407375462353, + "grad_norm": 9.52380657196045, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8701449632644653, + "num_tokens": 801160497.0, + "step": 20994 + }, + { + "epoch": 2.67077979900776, + "ewc_loss": 0.08062104135751724, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043023383477702737, + "grad_norm": 9.479442596435547, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8667389154434204, + "num_tokens": 801198556.0, + "step": 20995 + }, + { + "epoch": 2.6709070092863505, + "ewc_loss": 0.08085981011390686, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004326215130276978, + "grad_norm": 9.49256706237793, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8908985257148743, + "num_tokens": 801230616.0, + "step": 20996 + }, + { + "epoch": 2.671034219564941, + "ewc_loss": 0.08057596534490585, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004297830746509135, + "grad_norm": 9.444002151489258, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.863399863243103, + "num_tokens": 801266582.0, + "step": 20997 + }, + { + "epoch": 2.671161429843531, + "ewc_loss": 0.08083789050579071, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043240233208052814, + "grad_norm": 9.510705947875977, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.87604820728302, + "num_tokens": 801305127.0, + "step": 20998 + }, + { + "epoch": 2.671288640122122, + "ewc_loss": 0.08048734813928604, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042889692122116685, + "grad_norm": 9.374224662780762, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8682742118835449, + "num_tokens": 801348284.0, + "step": 20999 + }, + { + "epoch": 2.671415850400712, + "ewc_loss": 0.08108319342136383, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043485540663823485, + "grad_norm": 9.576542854309082, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8852464556694031, + "num_tokens": 801385298.0, + "step": 21000 + }, + { + "epoch": 2.671543060679303, + "ewc_loss": 0.08017244935035706, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004257479158695787, + "grad_norm": 9.440077781677246, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8560988903045654, + "num_tokens": 801417020.0, + "step": 21001 + }, + { + "epoch": 2.6716702709578932, + "ewc_loss": 0.08097809553146362, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043380438000895083, + "grad_norm": 9.531964302062988, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8587424755096436, + "num_tokens": 801455328.0, + "step": 21002 + }, + { + "epoch": 2.6717974812364838, + "ewc_loss": 0.08018344640731812, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042585792834870517, + "grad_norm": 9.322566986083984, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.875434398651123, + "num_tokens": 801494342.0, + "step": 21003 + }, + { + "epoch": 2.6719246915150743, + "ewc_loss": 0.08101870119571686, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043421046575531363, + "grad_norm": 9.589113235473633, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8680692315101624, + "num_tokens": 801528182.0, + "step": 21004 + }, + { + "epoch": 2.672051901793665, + "ewc_loss": 0.08011838793754578, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004252073704265058, + "grad_norm": 9.452452659606934, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8632769584655762, + "num_tokens": 801569424.0, + "step": 21005 + }, + { + "epoch": 2.6721791120722553, + "ewc_loss": 0.08091270923614502, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004331505624577403, + "grad_norm": 9.53035831451416, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8638655543327332, + "num_tokens": 801608809.0, + "step": 21006 + }, + { + "epoch": 2.672306322350846, + "ewc_loss": 0.08024272322654724, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000426450656959787, + "grad_norm": 9.523152351379395, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8551040887832642, + "num_tokens": 801651910.0, + "step": 21007 + }, + { + "epoch": 2.6724335326294364, + "ewc_loss": 0.08058157563209534, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004298392159398645, + "grad_norm": 9.636970520019531, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8639068603515625, + "num_tokens": 801692961.0, + "step": 21008 + }, + { + "epoch": 2.672560742908027, + "ewc_loss": 0.07992400974035263, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004257049586158246, + "grad_norm": 9.350057601928711, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8519377112388611, + "num_tokens": 801731278.0, + "step": 21009 + }, + { + "epoch": 2.6726879531866174, + "ewc_loss": 0.0805804431438446, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043226932757534087, + "grad_norm": 9.76453971862793, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8601829409599304, + "num_tokens": 801769562.0, + "step": 21010 + }, + { + "epoch": 2.672815163465208, + "ewc_loss": 0.07940304279327393, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042049522744491696, + "grad_norm": 9.224164962768555, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8615900278091431, + "num_tokens": 801811046.0, + "step": 21011 + }, + { + "epoch": 2.6729423737437985, + "ewc_loss": 0.0816294252872467, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044275904656387866, + "grad_norm": 9.910807609558105, + "learning_rate": 1e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8419089317321777, + "num_tokens": 801847833.0, + "step": 21012 + }, + { + "epoch": 2.673069584022389, + "ewc_loss": 0.07905252277851105, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004169900785200298, + "grad_norm": 9.193887710571289, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8590710163116455, + "num_tokens": 801887819.0, + "step": 21013 + }, + { + "epoch": 2.6731967943009796, + "ewc_loss": 0.08231601119041443, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044962496031075716, + "grad_norm": 10.175442695617676, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8771415948867798, + "num_tokens": 801922535.0, + "step": 21014 + }, + { + "epoch": 2.67332400457957, + "ewc_loss": 0.07882636785507202, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004147285653743893, + "grad_norm": 9.093310356140137, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8687793016433716, + "num_tokens": 801958025.0, + "step": 21015 + }, + { + "epoch": 2.6734512148581606, + "ewc_loss": 0.08335389941930771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004600038519129157, + "grad_norm": 10.432807922363281, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8677513599395752, + "num_tokens": 801991291.0, + "step": 21016 + }, + { + "epoch": 2.673578425136751, + "ewc_loss": 0.07922206819057465, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00041868555126711726, + "grad_norm": 9.13823413848877, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.867074728012085, + "num_tokens": 802024952.0, + "step": 21017 + }, + { + "epoch": 2.6737056354153417, + "ewc_loss": 0.08400574326515198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004665223241318017, + "grad_norm": 10.097943305969238, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8628189563751221, + "num_tokens": 802065741.0, + "step": 21018 + }, + { + "epoch": 2.673832845693932, + "ewc_loss": 0.07975354790687561, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004240003472659737, + "grad_norm": 9.401633262634277, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8624343872070312, + "num_tokens": 802107153.0, + "step": 21019 + }, + { + "epoch": 2.6739600559725227, + "ewc_loss": 0.08289302885532379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004553950857371092, + "grad_norm": 9.872549057006836, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8689498901367188, + "num_tokens": 802148177.0, + "step": 21020 + }, + { + "epoch": 2.674087266251113, + "ewc_loss": 0.08043771237134933, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004308419593144208, + "grad_norm": 9.5021390914917, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8654035329818726, + "num_tokens": 802186144.0, + "step": 21021 + }, + { + "epoch": 2.674214476529704, + "ewc_loss": 0.08174550533294678, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004439198528416455, + "grad_norm": 9.726055145263672, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8788663148880005, + "num_tokens": 802224054.0, + "step": 21022 + }, + { + "epoch": 2.674341686808294, + "ewc_loss": 0.08064734935760498, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043293830822221935, + "grad_norm": 9.532526969909668, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8481225967407227, + "num_tokens": 802264483.0, + "step": 21023 + }, + { + "epoch": 2.674468897086885, + "ewc_loss": 0.08097455650568008, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043621042277663946, + "grad_norm": 9.55399227142334, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8721444606781006, + "num_tokens": 802302933.0, + "step": 21024 + }, + { + "epoch": 2.674596107365475, + "ewc_loss": 0.08102347701787949, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004366996290627867, + "grad_norm": 9.601759910583496, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8575372099876404, + "num_tokens": 802347354.0, + "step": 21025 + }, + { + "epoch": 2.674723317644066, + "ewc_loss": 0.08057281374931335, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004321929591242224, + "grad_norm": 9.464004516601562, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8739166855812073, + "num_tokens": 802392159.0, + "step": 21026 + }, + { + "epoch": 2.674850527922656, + "ewc_loss": 0.08131370693445206, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043716051732189953, + "grad_norm": 9.530089378356934, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8675627708435059, + "num_tokens": 802432629.0, + "step": 21027 + }, + { + "epoch": 2.6749777382012465, + "ewc_loss": 0.08050058782100677, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043147074757143855, + "grad_norm": 9.497169494628906, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8751944303512573, + "num_tokens": 802471776.0, + "step": 21028 + }, + { + "epoch": 2.675104948479837, + "ewc_loss": 0.08100295066833496, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043649435974657536, + "grad_norm": 9.671111106872559, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8572938442230225, + "num_tokens": 802507314.0, + "step": 21029 + }, + { + "epoch": 2.6752321587584276, + "ewc_loss": 0.08035533875226974, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004300182336010039, + "grad_norm": 9.440997123718262, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8735208511352539, + "num_tokens": 802550587.0, + "step": 21030 + }, + { + "epoch": 2.675359369037018, + "ewc_loss": 0.08104385435581207, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043690335587598383, + "grad_norm": 9.57914924621582, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8647407293319702, + "num_tokens": 802591544.0, + "step": 21031 + }, + { + "epoch": 2.6754865793156086, + "ewc_loss": 0.08029931783676147, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004294580430723727, + "grad_norm": 9.427412033081055, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8727436065673828, + "num_tokens": 802627336.0, + "step": 21032 + }, + { + "epoch": 2.675613789594199, + "ewc_loss": 0.08094902336597443, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043595509487204254, + "grad_norm": 9.523468017578125, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8651697635650635, + "num_tokens": 802668358.0, + "step": 21033 + }, + { + "epoch": 2.6757409998727897, + "ewc_loss": 0.08050307631492615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004314956604503095, + "grad_norm": 9.507675170898438, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8719645142555237, + "num_tokens": 802708023.0, + "step": 21034 + }, + { + "epoch": 2.67586821015138, + "ewc_loss": 0.08075080811977386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004339729784987867, + "grad_norm": 9.497838020324707, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.887459933757782, + "num_tokens": 802745856.0, + "step": 21035 + }, + { + "epoch": 2.6759954204299707, + "ewc_loss": 0.08061140775680542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004325788759160787, + "grad_norm": 9.500216484069824, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8760624527931213, + "num_tokens": 802779032.0, + "step": 21036 + }, + { + "epoch": 2.6761226307085613, + "ewc_loss": 0.08095154166221619, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004359802696853876, + "grad_norm": 9.542287826538086, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8645649552345276, + "num_tokens": 802815736.0, + "step": 21037 + }, + { + "epoch": 2.676249840987152, + "ewc_loss": 0.08059203624725342, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043238516082055867, + "grad_norm": 9.488946914672852, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8598369359970093, + "num_tokens": 802854133.0, + "step": 21038 + }, + { + "epoch": 2.6763770512657423, + "ewc_loss": 0.08067399263381958, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004332047246862203, + "grad_norm": 9.4635009765625, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8715861439704895, + "num_tokens": 802892969.0, + "step": 21039 + }, + { + "epoch": 2.676504261544333, + "ewc_loss": 0.08078770339488983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004343418695498258, + "grad_norm": 9.463022232055664, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8807330131530762, + "num_tokens": 802936998.0, + "step": 21040 + }, + { + "epoch": 2.6766314718229234, + "ewc_loss": 0.08073312044143677, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043379608541727066, + "grad_norm": 9.507534980773926, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.873253345489502, + "num_tokens": 802979040.0, + "step": 21041 + }, + { + "epoch": 2.676758682101514, + "ewc_loss": 0.0807752013206482, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043421683949418366, + "grad_norm": 9.478283882141113, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8749775290489197, + "num_tokens": 803016516.0, + "step": 21042 + }, + { + "epoch": 2.6768858923801044, + "ewc_loss": 0.08104123175144196, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004368771333247423, + "grad_norm": 9.53354263305664, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8627738356590271, + "num_tokens": 803055566.0, + "step": 21043 + }, + { + "epoch": 2.677013102658695, + "ewc_loss": 0.08074603974819183, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043392519000917673, + "grad_norm": 9.4625825881958, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8673524856567383, + "num_tokens": 803093683.0, + "step": 21044 + }, + { + "epoch": 2.6771403129372855, + "ewc_loss": 0.08100660890340805, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000436530914157629, + "grad_norm": 9.497747421264648, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8733301162719727, + "num_tokens": 803133691.0, + "step": 21045 + }, + { + "epoch": 2.6772675232158756, + "ewc_loss": 0.08082146942615509, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043467950308695436, + "grad_norm": 9.468175888061523, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8742513656616211, + "num_tokens": 803169870.0, + "step": 21046 + }, + { + "epoch": 2.6773947334944665, + "ewc_loss": 0.08094785362482071, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004359433660283685, + "grad_norm": 9.521294593811035, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8583582639694214, + "num_tokens": 803209038.0, + "step": 21047 + }, + { + "epoch": 2.6775219437730566, + "ewc_loss": 0.08088544011116028, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043531920528039336, + "grad_norm": 9.525474548339844, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8510581851005554, + "num_tokens": 803239725.0, + "step": 21048 + }, + { + "epoch": 2.6776491540516476, + "ewc_loss": 0.0808267891407013, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004347327630966902, + "grad_norm": 9.445002555847168, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8657706379890442, + "num_tokens": 803273580.0, + "step": 21049 + }, + { + "epoch": 2.6777763643302377, + "ewc_loss": 0.0809285119175911, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004357499710749835, + "grad_norm": 9.445723533630371, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8710684180259705, + "num_tokens": 803313583.0, + "step": 21050 + }, + { + "epoch": 2.6779035746088287, + "ewc_loss": 0.08095972239971161, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004360620805528015, + "grad_norm": 9.493346214294434, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8725688457489014, + "num_tokens": 803347517.0, + "step": 21051 + }, + { + "epoch": 2.6780307848874187, + "ewc_loss": 0.08087003231048584, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004351652169134468, + "grad_norm": 9.485584259033203, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8521408438682556, + "num_tokens": 803388183.0, + "step": 21052 + }, + { + "epoch": 2.6781579951660093, + "ewc_loss": 0.08095940947532654, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004360589664429426, + "grad_norm": 9.46694564819336, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8661757707595825, + "num_tokens": 803428377.0, + "step": 21053 + }, + { + "epoch": 2.6782852054446, + "ewc_loss": 0.08089286834001541, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043539353646337986, + "grad_norm": 9.555908203125, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8755324482917786, + "num_tokens": 803460930.0, + "step": 21054 + }, + { + "epoch": 2.6784124157231903, + "ewc_loss": 0.08068963885307312, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004333611868787557, + "grad_norm": 9.39592170715332, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8746504187583923, + "num_tokens": 803497971.0, + "step": 21055 + }, + { + "epoch": 2.678539626001781, + "ewc_loss": 0.0811554342508316, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004380191385280341, + "grad_norm": 9.549447059631348, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8802672624588013, + "num_tokens": 803535514.0, + "step": 21056 + }, + { + "epoch": 2.6786668362803714, + "ewc_loss": 0.08036869764328003, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043015184928663075, + "grad_norm": 9.387861251831055, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8598513603210449, + "num_tokens": 803580767.0, + "step": 21057 + }, + { + "epoch": 2.678794046558962, + "ewc_loss": 0.08133306354284286, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043979549081996083, + "grad_norm": 9.539276123046875, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8673118948936462, + "num_tokens": 803619168.0, + "step": 21058 + }, + { + "epoch": 2.6789212568375524, + "ewc_loss": 0.08049920201301575, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043145683594048023, + "grad_norm": 9.413528442382812, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8504999876022339, + "num_tokens": 803654785.0, + "step": 21059 + }, + { + "epoch": 2.679048467116143, + "ewc_loss": 0.08112073689699173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004376722208689898, + "grad_norm": 9.5181303024292, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8743616342544556, + "num_tokens": 803692505.0, + "step": 21060 + }, + { + "epoch": 2.6791756773947335, + "ewc_loss": 0.08061076700687408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004325725603848696, + "grad_norm": 9.382400512695312, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8729446530342102, + "num_tokens": 803732592.0, + "step": 21061 + }, + { + "epoch": 2.679302887673324, + "ewc_loss": 0.08128780126571655, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043934289715252817, + "grad_norm": 9.581198692321777, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8539922833442688, + "num_tokens": 803767601.0, + "step": 21062 + }, + { + "epoch": 2.6794300979519146, + "ewc_loss": 0.08024800568819046, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042894491343759, + "grad_norm": 9.334325790405273, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.856393575668335, + "num_tokens": 803804329.0, + "step": 21063 + }, + { + "epoch": 2.679557308230505, + "ewc_loss": 0.08152160793542862, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004416809242684394, + "grad_norm": 9.580911636352539, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8608995079994202, + "num_tokens": 803846182.0, + "step": 21064 + }, + { + "epoch": 2.6796845185090956, + "ewc_loss": 0.08023439347743988, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042880879482254386, + "grad_norm": 9.397212028503418, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8606373071670532, + "num_tokens": 803883561.0, + "step": 21065 + }, + { + "epoch": 2.679811728787686, + "ewc_loss": 0.08126363158226013, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004391011316329241, + "grad_norm": 9.582318305969238, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8844172358512878, + "num_tokens": 803916860.0, + "step": 21066 + }, + { + "epoch": 2.6799389390662767, + "ewc_loss": 0.08035319298505783, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004299967549741268, + "grad_norm": 9.378040313720703, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8658795356750488, + "num_tokens": 803951949.0, + "step": 21067 + }, + { + "epoch": 2.680066149344867, + "ewc_loss": 0.08115988969802856, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043806369649246335, + "grad_norm": 9.525968551635742, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8602371215820312, + "num_tokens": 803992094.0, + "step": 21068 + }, + { + "epoch": 2.6801933596234577, + "ewc_loss": 0.08044859766960144, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430950807640329, + "grad_norm": 9.401008605957031, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8643589615821838, + "num_tokens": 804034058.0, + "step": 21069 + }, + { + "epoch": 2.6803205699020483, + "ewc_loss": 0.08098344504833221, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004362992476671934, + "grad_norm": 9.489301681518555, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8568578362464905, + "num_tokens": 804072722.0, + "step": 21070 + }, + { + "epoch": 2.6804477801806383, + "ewc_loss": 0.08060012757778168, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043246615678071976, + "grad_norm": 9.383572578430176, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8730617165565491, + "num_tokens": 804114253.0, + "step": 21071 + }, + { + "epoch": 2.6805749904592293, + "ewc_loss": 0.0810273140668869, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004367379588074982, + "grad_norm": 9.469141960144043, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8682895302772522, + "num_tokens": 804154877.0, + "step": 21072 + }, + { + "epoch": 2.6807022007378194, + "ewc_loss": 0.08079427480697632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344075859989971, + "grad_norm": 9.530146598815918, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8602952361106873, + "num_tokens": 804191060.0, + "step": 21073 + }, + { + "epoch": 2.6808294110164104, + "ewc_loss": 0.08054469525814056, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043191175791434944, + "grad_norm": 9.412409782409668, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8573393821716309, + "num_tokens": 804231574.0, + "step": 21074 + }, + { + "epoch": 2.6809566212950005, + "ewc_loss": 0.08094419538974762, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043590678251348436, + "grad_norm": 9.54990291595459, + "learning_rate": 1e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8449482917785645, + "num_tokens": 804264027.0, + "step": 21075 + }, + { + "epoch": 2.681083831573591, + "ewc_loss": 0.08042822778224945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043074708082713187, + "grad_norm": 9.397150993347168, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8671524524688721, + "num_tokens": 804302906.0, + "step": 21076 + }, + { + "epoch": 2.6812110418521815, + "ewc_loss": 0.08116324245929718, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043809725320897996, + "grad_norm": 9.558972358703613, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8600645065307617, + "num_tokens": 804339107.0, + "step": 21077 + }, + { + "epoch": 2.681338252130772, + "ewc_loss": 0.08025334775447845, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042899834807030857, + "grad_norm": 9.420802116394043, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8696191906929016, + "num_tokens": 804374292.0, + "step": 21078 + }, + { + "epoch": 2.6814654624093626, + "ewc_loss": 0.08115963637828827, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000438061251770705, + "grad_norm": 9.534116744995117, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8678293228149414, + "num_tokens": 804420287.0, + "step": 21079 + }, + { + "epoch": 2.681592672687953, + "ewc_loss": 0.08053676038980484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043183244997635484, + "grad_norm": 9.438875198364258, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8712260723114014, + "num_tokens": 804453977.0, + "step": 21080 + }, + { + "epoch": 2.6817198829665436, + "ewc_loss": 0.08096728473901749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004361376923043281, + "grad_norm": 9.54160213470459, + "learning_rate": 1e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8498039841651917, + "num_tokens": 804494958.0, + "step": 21081 + }, + { + "epoch": 2.681847093245134, + "ewc_loss": 0.08048193156719208, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043128416291438043, + "grad_norm": 9.472777366638184, + "learning_rate": 1e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8455204963684082, + "num_tokens": 804532209.0, + "step": 21082 + }, + { + "epoch": 2.6819743035237247, + "ewc_loss": 0.08087144792079926, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043517936137504876, + "grad_norm": 9.455336570739746, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8671066761016846, + "num_tokens": 804572937.0, + "step": 21083 + }, + { + "epoch": 2.682101513802315, + "ewc_loss": 0.08076845109462738, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004341494059190154, + "grad_norm": 9.501445770263672, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8639627695083618, + "num_tokens": 804610263.0, + "step": 21084 + }, + { + "epoch": 2.6822287240809057, + "ewc_loss": 0.08074550330638885, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043391986400820315, + "grad_norm": 9.469446182250977, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8691624402999878, + "num_tokens": 804645516.0, + "step": 21085 + }, + { + "epoch": 2.6823559343594963, + "ewc_loss": 0.08073057979345322, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004337706486694515, + "grad_norm": 9.449278831481934, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8766195178031921, + "num_tokens": 804684540.0, + "step": 21086 + }, + { + "epoch": 2.682483144638087, + "ewc_loss": 0.08066949248313904, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004331597883719951, + "grad_norm": 9.457978248596191, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8675594329833984, + "num_tokens": 804723030.0, + "step": 21087 + }, + { + "epoch": 2.6826103549166773, + "ewc_loss": 0.08099022507667542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004363671177998185, + "grad_norm": 9.491917610168457, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.873674750328064, + "num_tokens": 804759202.0, + "step": 21088 + }, + { + "epoch": 2.682737565195268, + "ewc_loss": 0.0806872695684433, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004333375836722553, + "grad_norm": 9.477739334106445, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8622722029685974, + "num_tokens": 804803199.0, + "step": 21089 + }, + { + "epoch": 2.6828647754738584, + "ewc_loss": 0.08091318607330322, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004355967103037983, + "grad_norm": 9.619196891784668, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.862823486328125, + "num_tokens": 804835675.0, + "step": 21090 + }, + { + "epoch": 2.682991985752449, + "ewc_loss": 0.08048320561647415, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004312969103921205, + "grad_norm": 9.46357536315918, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8521697521209717, + "num_tokens": 804876583.0, + "step": 21091 + }, + { + "epoch": 2.6831191960310394, + "ewc_loss": 0.08083741366863251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004348389629740268, + "grad_norm": 9.563684463500977, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8554337024688721, + "num_tokens": 804906110.0, + "step": 21092 + }, + { + "epoch": 2.68324640630963, + "ewc_loss": 0.08022521436214447, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004287169431336224, + "grad_norm": 9.340682029724121, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8569296598434448, + "num_tokens": 804942196.0, + "step": 21093 + }, + { + "epoch": 2.6833736165882205, + "ewc_loss": 0.0812540277838707, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004390051180962473, + "grad_norm": 9.586920738220215, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8643418550491333, + "num_tokens": 804978760.0, + "step": 21094 + }, + { + "epoch": 2.683500826866811, + "ewc_loss": 0.08013784140348434, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042784324614331126, + "grad_norm": 9.343441009521484, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8566533923149109, + "num_tokens": 805014212.0, + "step": 21095 + }, + { + "epoch": 2.683628037145401, + "ewc_loss": 0.0812767967581749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004392328264657408, + "grad_norm": 9.586759567260742, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8890169858932495, + "num_tokens": 805050528.0, + "step": 21096 + }, + { + "epoch": 2.683755247423992, + "ewc_loss": 0.08002692461013794, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042673404095694423, + "grad_norm": 9.294264793395996, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.868165910243988, + "num_tokens": 805087355.0, + "step": 21097 + }, + { + "epoch": 2.683882457702582, + "ewc_loss": 0.08148764818906784, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004413413116708398, + "grad_norm": 9.663300514221191, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8515824675559998, + "num_tokens": 805117837.0, + "step": 21098 + }, + { + "epoch": 2.684009667981173, + "ewc_loss": 0.07994446903467178, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042590952944010496, + "grad_norm": 9.289234161376953, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8646060228347778, + "num_tokens": 805157734.0, + "step": 21099 + }, + { + "epoch": 2.684136878259763, + "ewc_loss": 0.08157962560653687, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044226108002476394, + "grad_norm": 9.641180992126465, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8715864419937134, + "num_tokens": 805190657.0, + "step": 21100 + }, + { + "epoch": 2.6842640885383537, + "ewc_loss": 0.08015946298837662, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004280594876036048, + "grad_norm": 9.36021900177002, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8565084934234619, + "num_tokens": 805229654.0, + "step": 21101 + }, + { + "epoch": 2.6843912988169443, + "ewc_loss": 0.0813269168138504, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004397340235300362, + "grad_norm": 9.53147029876709, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8640122413635254, + "num_tokens": 805269199.0, + "step": 21102 + }, + { + "epoch": 2.684518509095535, + "ewc_loss": 0.08032259345054626, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042969081550836563, + "grad_norm": 9.35260009765625, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8592003583908081, + "num_tokens": 805308918.0, + "step": 21103 + }, + { + "epoch": 2.6846457193741253, + "ewc_loss": 0.08137638866901398, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044022875954397023, + "grad_norm": 9.625044822692871, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8610059022903442, + "num_tokens": 805345014.0, + "step": 21104 + }, + { + "epoch": 2.684772929652716, + "ewc_loss": 0.08024962246417999, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004289610660634935, + "grad_norm": 9.359930992126465, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8784776926040649, + "num_tokens": 805383674.0, + "step": 21105 + }, + { + "epoch": 2.6849001399313064, + "ewc_loss": 0.08138813078403473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044034613529220223, + "grad_norm": 9.627151489257812, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8680540323257446, + "num_tokens": 805419765.0, + "step": 21106 + }, + { + "epoch": 2.685027350209897, + "ewc_loss": 0.08016742765903473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042813908657990396, + "grad_norm": 9.42007064819336, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8659512400627136, + "num_tokens": 805461346.0, + "step": 21107 + }, + { + "epoch": 2.6851545604884874, + "ewc_loss": 0.08115717768669128, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004380365717224777, + "grad_norm": 9.605997085571289, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8727912902832031, + "num_tokens": 805494519.0, + "step": 21108 + }, + { + "epoch": 2.685281770767078, + "ewc_loss": 0.0803053230047226, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042951805517077446, + "grad_norm": 9.412044525146484, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8538012504577637, + "num_tokens": 805530542.0, + "step": 21109 + }, + { + "epoch": 2.6854089810456685, + "ewc_loss": 0.08105702698230743, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043703510891646147, + "grad_norm": 9.614559173583984, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8614926934242249, + "num_tokens": 805564531.0, + "step": 21110 + }, + { + "epoch": 2.685536191324259, + "ewc_loss": 0.08018702268600464, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042833510087803006, + "grad_norm": 9.367724418640137, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8609142899513245, + "num_tokens": 805606011.0, + "step": 21111 + }, + { + "epoch": 2.6856634016028496, + "ewc_loss": 0.08117084205150604, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004381732433103025, + "grad_norm": 9.592706680297852, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8606582880020142, + "num_tokens": 805643078.0, + "step": 21112 + }, + { + "epoch": 2.68579061188144, + "ewc_loss": 0.08009356260299683, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042740051867440343, + "grad_norm": 9.403509140014648, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8696897029876709, + "num_tokens": 805678554.0, + "step": 21113 + }, + { + "epoch": 2.6859178221600306, + "ewc_loss": 0.08101768791675568, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043664174154400826, + "grad_norm": 9.549583435058594, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8749130964279175, + "num_tokens": 805713736.0, + "step": 21114 + }, + { + "epoch": 2.686045032438621, + "ewc_loss": 0.08031570166349411, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042962186853401363, + "grad_norm": 9.431001663208008, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8587898015975952, + "num_tokens": 805751798.0, + "step": 21115 + }, + { + "epoch": 2.6861722427172117, + "ewc_loss": 0.08084709942340851, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043493584962561727, + "grad_norm": 9.508133888244629, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8730319738388062, + "num_tokens": 805789205.0, + "step": 21116 + }, + { + "epoch": 2.686299452995802, + "ewc_loss": 0.0805588960647583, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042961243889294565, + "grad_norm": 9.445143699645996, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.868291974067688, + "num_tokens": 805824910.0, + "step": 21117 + }, + { + "epoch": 2.6864266632743927, + "ewc_loss": 0.0808214321732521, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043467918294481933, + "grad_norm": 9.624723434448242, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8606297969818115, + "num_tokens": 805861335.0, + "step": 21118 + }, + { + "epoch": 2.686553873552983, + "ewc_loss": 0.0800894945859909, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042735980241559446, + "grad_norm": 9.381708145141602, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8608284592628479, + "num_tokens": 805901488.0, + "step": 21119 + }, + { + "epoch": 2.686681083831574, + "ewc_loss": 0.08102939277887344, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004367587680462748, + "grad_norm": 9.599861145019531, + "learning_rate": 1e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8264040946960449, + "num_tokens": 805941336.0, + "step": 21120 + }, + { + "epoch": 2.686808294110164, + "ewc_loss": 0.0800199955701828, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042666474473662674, + "grad_norm": 9.41281795501709, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.873268723487854, + "num_tokens": 805974222.0, + "step": 21121 + }, + { + "epoch": 2.686935504388755, + "ewc_loss": 0.08109241724014282, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004373889823909849, + "grad_norm": 9.603316307067871, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8613364696502686, + "num_tokens": 806014942.0, + "step": 21122 + }, + { + "epoch": 2.687062714667345, + "ewc_loss": 0.0801563635468483, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042802849202416837, + "grad_norm": 9.411846160888672, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8760920166969299, + "num_tokens": 806048894.0, + "step": 21123 + }, + { + "epoch": 2.687189924945936, + "ewc_loss": 0.08094586431980133, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043592345900833607, + "grad_norm": 9.527185440063477, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8636371493339539, + "num_tokens": 806089714.0, + "step": 21124 + }, + { + "epoch": 2.687317135224526, + "ewc_loss": 0.08035948127508163, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004300596483517438, + "grad_norm": 9.384269714355469, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8525791168212891, + "num_tokens": 806128974.0, + "step": 21125 + }, + { + "epoch": 2.6874443455031165, + "ewc_loss": 0.08107239753007889, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043718883534893394, + "grad_norm": 9.613553047180176, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8624210357666016, + "num_tokens": 806169857.0, + "step": 21126 + }, + { + "epoch": 2.687571555781707, + "ewc_loss": 0.0802379846572876, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000428844679845497, + "grad_norm": 9.439959526062012, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8672575354576111, + "num_tokens": 806202826.0, + "step": 21127 + }, + { + "epoch": 2.6876987660602976, + "ewc_loss": 0.08110406994819641, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043750557233579457, + "grad_norm": 9.626100540161133, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8635359406471252, + "num_tokens": 806242850.0, + "step": 21128 + }, + { + "epoch": 2.687825976338888, + "ewc_loss": 0.08031587302684784, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042718221084214747, + "grad_norm": 9.420671463012695, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8675094842910767, + "num_tokens": 806279024.0, + "step": 21129 + }, + { + "epoch": 2.6879531866174786, + "ewc_loss": 0.08079619705677032, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344267654232681, + "grad_norm": 9.537263870239258, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8500150442123413, + "num_tokens": 806313669.0, + "step": 21130 + }, + { + "epoch": 2.688080396896069, + "ewc_loss": 0.08008772134780884, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004273420199751854, + "grad_norm": 9.323214530944824, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8677190542221069, + "num_tokens": 806353128.0, + "step": 21131 + }, + { + "epoch": 2.6882076071746597, + "ewc_loss": 0.08079653233289719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344301705714315, + "grad_norm": 9.578512191772461, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8659070730209351, + "num_tokens": 806385044.0, + "step": 21132 + }, + { + "epoch": 2.68833481745325, + "ewc_loss": 0.07999622821807861, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042642708285711706, + "grad_norm": 9.32912826538086, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8560280799865723, + "num_tokens": 806422157.0, + "step": 21133 + }, + { + "epoch": 2.6884620277318407, + "ewc_loss": 0.0810588151216507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043705294956453145, + "grad_norm": 9.563323974609375, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8647676706314087, + "num_tokens": 806463180.0, + "step": 21134 + }, + { + "epoch": 2.6885892380104313, + "ewc_loss": 0.07996691763401031, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042613406549207866, + "grad_norm": 9.333504676818848, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8612169623374939, + "num_tokens": 806502031.0, + "step": 21135 + }, + { + "epoch": 2.688716448289022, + "ewc_loss": 0.08108429610729218, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043730775360018015, + "grad_norm": 9.481575012207031, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8672866821289062, + "num_tokens": 806539585.0, + "step": 21136 + }, + { + "epoch": 2.6888436585676123, + "ewc_loss": 0.08014063537120819, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004278711858205497, + "grad_norm": 9.384109497070312, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.884079098701477, + "num_tokens": 806573147.0, + "step": 21137 + }, + { + "epoch": 2.688970868846203, + "ewc_loss": 0.08087421208620071, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004352069809101522, + "grad_norm": 9.48522663116455, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8767415881156921, + "num_tokens": 806609785.0, + "step": 21138 + }, + { + "epoch": 2.6890980791247934, + "ewc_loss": 0.08045901358127594, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004310549411457032, + "grad_norm": 9.422558784484863, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8711718320846558, + "num_tokens": 806647709.0, + "step": 21139 + }, + { + "epoch": 2.689225289403384, + "ewc_loss": 0.08081123232841492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000434577144915238, + "grad_norm": 9.53289794921875, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8697651624679565, + "num_tokens": 806684448.0, + "step": 21140 + }, + { + "epoch": 2.6893524996819744, + "ewc_loss": 0.0803886204957962, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004303510067984462, + "grad_norm": 9.448599815368652, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8695759177207947, + "num_tokens": 806725326.0, + "step": 21141 + }, + { + "epoch": 2.689479709960565, + "ewc_loss": 0.08078104257583618, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043427522177807987, + "grad_norm": 9.480111122131348, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8629506826400757, + "num_tokens": 806759797.0, + "step": 21142 + }, + { + "epoch": 2.6896069202391555, + "ewc_loss": 0.08055034279823303, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043196830665692687, + "grad_norm": 9.466158866882324, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8723248243331909, + "num_tokens": 806799986.0, + "step": 21143 + }, + { + "epoch": 2.6897341305177456, + "ewc_loss": 0.08056799322366714, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043214476318098605, + "grad_norm": 9.385944366455078, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8607962727546692, + "num_tokens": 806843055.0, + "step": 21144 + }, + { + "epoch": 2.6898613407963365, + "ewc_loss": 0.08063657581806183, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004328305658418685, + "grad_norm": 9.449413299560547, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8794984817504883, + "num_tokens": 806875897.0, + "step": 21145 + }, + { + "epoch": 2.6899885510749266, + "ewc_loss": 0.08057846128940582, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043224950786679983, + "grad_norm": 9.426080703735352, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8724782466888428, + "num_tokens": 806915583.0, + "step": 21146 + }, + { + "epoch": 2.6901157613535176, + "ewc_loss": 0.08068045973777771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043326945160515606, + "grad_norm": 9.475238800048828, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8693051338195801, + "num_tokens": 806953851.0, + "step": 21147 + }, + { + "epoch": 2.6902429716321077, + "ewc_loss": 0.08040530979633331, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004305178881622851, + "grad_norm": 9.387588500976562, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8606515526771545, + "num_tokens": 806988519.0, + "step": 21148 + }, + { + "epoch": 2.6903701819106987, + "ewc_loss": 0.08079281449317932, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004343930340837687, + "grad_norm": 9.420759201049805, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8683146834373474, + "num_tokens": 807029612.0, + "step": 21149 + }, + { + "epoch": 2.6904973921892887, + "ewc_loss": 0.08061262965202332, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043259112862870097, + "grad_norm": 9.44138240814209, + "learning_rate": 1e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8491325378417969, + "num_tokens": 807064009.0, + "step": 21150 + }, + { + "epoch": 2.6906246024678793, + "ewc_loss": 0.08073297142982483, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043379460112191737, + "grad_norm": 9.408429145812988, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8624777793884277, + "num_tokens": 807104694.0, + "step": 21151 + }, + { + "epoch": 2.69075181274647, + "ewc_loss": 0.08077868819236755, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043425176409073174, + "grad_norm": 9.45419692993164, + "learning_rate": 1e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8423123359680176, + "num_tokens": 807142500.0, + "step": 21152 + }, + { + "epoch": 2.6908790230250603, + "ewc_loss": 0.0805397778749466, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004318626597523689, + "grad_norm": 9.393142700195312, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8615641593933105, + "num_tokens": 807172123.0, + "step": 21153 + }, + { + "epoch": 2.691006233303651, + "ewc_loss": 0.08104678988456726, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004369327798485756, + "grad_norm": 9.475374221801758, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8676758408546448, + "num_tokens": 807206422.0, + "step": 21154 + }, + { + "epoch": 2.6911334435822414, + "ewc_loss": 0.08071106672286987, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004335754783824086, + "grad_norm": 9.405466079711914, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8505231738090515, + "num_tokens": 807240867.0, + "step": 21155 + }, + { + "epoch": 2.691260653860832, + "ewc_loss": 0.08130452781915665, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000437068723840639, + "grad_norm": 9.48193073272705, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8753381967544556, + "num_tokens": 807275838.0, + "step": 21156 + }, + { + "epoch": 2.6913878641394224, + "ewc_loss": 0.08060604333877563, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043252529576420784, + "grad_norm": 9.354280471801758, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8725137710571289, + "num_tokens": 807315164.0, + "step": 21157 + }, + { + "epoch": 2.691515074418013, + "ewc_loss": 0.08153516054153442, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043937505688518286, + "grad_norm": 9.46267318725586, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8571834564208984, + "num_tokens": 807358016.0, + "step": 21158 + }, + { + "epoch": 2.6916422846966035, + "ewc_loss": 0.08056138455867767, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043207869748584926, + "grad_norm": 9.363466262817383, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8640350699424744, + "num_tokens": 807402751.0, + "step": 21159 + }, + { + "epoch": 2.691769494975194, + "ewc_loss": 0.08113524317741394, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043781724525615573, + "grad_norm": 9.468926429748535, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8685152530670166, + "num_tokens": 807442761.0, + "step": 21160 + }, + { + "epoch": 2.6918967052537845, + "ewc_loss": 0.08076652884483337, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043413008097559214, + "grad_norm": 9.41034984588623, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8713243007659912, + "num_tokens": 807478550.0, + "step": 21161 + }, + { + "epoch": 2.692023915532375, + "ewc_loss": 0.08123940974473953, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004388589586596936, + "grad_norm": 9.479459762573242, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8729095458984375, + "num_tokens": 807522756.0, + "step": 21162 + }, + { + "epoch": 2.6921511258109656, + "ewc_loss": 0.08085167407989502, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043498160084709525, + "grad_norm": 9.455063819885254, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8706516027450562, + "num_tokens": 807561701.0, + "step": 21163 + }, + { + "epoch": 2.692278336089556, + "ewc_loss": 0.08096295595169067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004360944149084389, + "grad_norm": 9.487793922424316, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8638200759887695, + "num_tokens": 807604058.0, + "step": 21164 + }, + { + "epoch": 2.6924055463681467, + "ewc_loss": 0.08085722476243973, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004350370727479458, + "grad_norm": 9.467110633850098, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8709748983383179, + "num_tokens": 807637344.0, + "step": 21165 + }, + { + "epoch": 2.692532756646737, + "ewc_loss": 0.08088327944278717, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043529761023819447, + "grad_norm": 9.515362739562988, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.857978343963623, + "num_tokens": 807676867.0, + "step": 21166 + }, + { + "epoch": 2.6926599669253277, + "ewc_loss": 0.0806291401386261, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000432756234658882, + "grad_norm": 9.432334899902344, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8682529926300049, + "num_tokens": 807707183.0, + "step": 21167 + }, + { + "epoch": 2.6927871772039182, + "ewc_loss": 0.08094201982021332, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043588507105596364, + "grad_norm": 9.53548526763916, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8551863431930542, + "num_tokens": 807737617.0, + "step": 21168 + }, + { + "epoch": 2.6929143874825083, + "ewc_loss": 0.08076182007789612, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004316416452638805, + "grad_norm": 9.455802917480469, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8576245903968811, + "num_tokens": 807774141.0, + "step": 21169 + }, + { + "epoch": 2.6930415977610993, + "ewc_loss": 0.08088816702365875, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043534653377719223, + "grad_norm": 9.440987586975098, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8706402778625488, + "num_tokens": 807813375.0, + "step": 21170 + }, + { + "epoch": 2.6931688080396894, + "ewc_loss": 0.0809863954782486, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043388744234107435, + "grad_norm": 9.476483345031738, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8507081270217896, + "num_tokens": 807852317.0, + "step": 21171 + }, + { + "epoch": 2.6932960183182804, + "ewc_loss": 0.08069939911365509, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043345888843759894, + "grad_norm": 9.475125312805176, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8659945130348206, + "num_tokens": 807895266.0, + "step": 21172 + }, + { + "epoch": 2.6934232285968704, + "ewc_loss": 0.081108458340168, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004351080278865993, + "grad_norm": 9.479205131530762, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8668742775917053, + "num_tokens": 807938600.0, + "step": 21173 + }, + { + "epoch": 2.693550438875461, + "ewc_loss": 0.08060631155967712, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043252797331660986, + "grad_norm": 9.51841926574707, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8853505849838257, + "num_tokens": 807974329.0, + "step": 21174 + }, + { + "epoch": 2.6936776491540515, + "ewc_loss": 0.0805782675743103, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004322475288063288, + "grad_norm": 9.434333801269531, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8765451908111572, + "num_tokens": 808015666.0, + "step": 21175 + }, + { + "epoch": 2.693804859432642, + "ewc_loss": 0.08086133003234863, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043507819646038115, + "grad_norm": 9.551688194274902, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8771021962165833, + "num_tokens": 808050176.0, + "step": 21176 + }, + { + "epoch": 2.6939320697112326, + "ewc_loss": 0.08038818091154099, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004303466703277081, + "grad_norm": 9.446210861206055, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8579690456390381, + "num_tokens": 808091910.0, + "step": 21177 + }, + { + "epoch": 2.694059279989823, + "ewc_loss": 0.0808335542678833, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043480037129484117, + "grad_norm": 9.523691177368164, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8616943955421448, + "num_tokens": 808128646.0, + "step": 21178 + }, + { + "epoch": 2.6941864902684136, + "ewc_loss": 0.08048047125339508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043126955279149115, + "grad_norm": 9.546857833862305, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8693532943725586, + "num_tokens": 808166544.0, + "step": 21179 + }, + { + "epoch": 2.694313700547004, + "ewc_loss": 0.08045922219753265, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043105706572532654, + "grad_norm": 9.443094253540039, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8539842367172241, + "num_tokens": 808200051.0, + "step": 21180 + }, + { + "epoch": 2.6944409108255947, + "ewc_loss": 0.08060809969902039, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004325458430685103, + "grad_norm": 9.528417587280273, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8669959306716919, + "num_tokens": 808239251.0, + "step": 21181 + }, + { + "epoch": 2.694568121104185, + "ewc_loss": 0.08037985861301422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004302634624764323, + "grad_norm": 9.423432350158691, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8597421646118164, + "num_tokens": 808280639.0, + "step": 21182 + }, + { + "epoch": 2.6946953313827757, + "ewc_loss": 0.08068417757749557, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004333066171966493, + "grad_norm": 9.464781761169434, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8728266954421997, + "num_tokens": 808316141.0, + "step": 21183 + }, + { + "epoch": 2.6948225416613663, + "ewc_loss": 0.0806509405374527, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004305328766349703, + "grad_norm": 9.460311889648438, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8635095357894897, + "num_tokens": 808359172.0, + "step": 21184 + }, + { + "epoch": 2.694949751939957, + "ewc_loss": 0.08062437176704407, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004327085625845939, + "grad_norm": 9.476275444030762, + "learning_rate": 1e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8365026712417603, + "num_tokens": 808397823.0, + "step": 21185 + }, + { + "epoch": 2.6950769622185473, + "ewc_loss": 0.08046787977218628, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004311436496209353, + "grad_norm": 9.411872863769531, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8744227886199951, + "num_tokens": 808435355.0, + "step": 21186 + }, + { + "epoch": 2.695204172497138, + "ewc_loss": 0.08083339035511017, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043479877058416605, + "grad_norm": 9.545451164245605, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8714205026626587, + "num_tokens": 808470792.0, + "step": 21187 + }, + { + "epoch": 2.6953313827757284, + "ewc_loss": 0.08027887344360352, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004292535304557532, + "grad_norm": 9.434205055236816, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8827962279319763, + "num_tokens": 808504591.0, + "step": 21188 + }, + { + "epoch": 2.695458593054319, + "ewc_loss": 0.08074512332677841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004339160805102438, + "grad_norm": 9.56705379486084, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.877210259437561, + "num_tokens": 808537167.0, + "step": 21189 + }, + { + "epoch": 2.6955858033329094, + "ewc_loss": 0.08054730296134949, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042949646012857556, + "grad_norm": 9.445393562316895, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8812090158462524, + "num_tokens": 808572055.0, + "step": 21190 + }, + { + "epoch": 2.6957130136115, + "ewc_loss": 0.0811334103345871, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00043291610199958086, + "grad_norm": 9.506104469299316, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8782625198364258, + "num_tokens": 808605239.0, + "step": 21191 + }, + { + "epoch": 2.6958402238900905, + "ewc_loss": 0.080586276948452, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042988621862605214, + "grad_norm": 9.503921508789062, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.843191385269165, + "num_tokens": 808643227.0, + "step": 21192 + }, + { + "epoch": 2.695967434168681, + "ewc_loss": 0.08056662231683731, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004321310843806714, + "grad_norm": 9.514786720275879, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8586385250091553, + "num_tokens": 808683855.0, + "step": 21193 + }, + { + "epoch": 2.696094644447271, + "ewc_loss": 0.08046504855155945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004311153606977314, + "grad_norm": 9.459850311279297, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.871611475944519, + "num_tokens": 808723467.0, + "step": 21194 + }, + { + "epoch": 2.696221854725862, + "ewc_loss": 0.0804363340139389, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043082813499495387, + "grad_norm": 9.525397300720215, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8532881736755371, + "num_tokens": 808759805.0, + "step": 21195 + }, + { + "epoch": 2.696349065004452, + "ewc_loss": 0.08022308349609375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004286956391297281, + "grad_norm": 9.37883186340332, + "learning_rate": 1e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8487558960914612, + "num_tokens": 808802024.0, + "step": 21196 + }, + { + "epoch": 2.696476275283043, + "ewc_loss": 0.08088742196559906, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043533911230042577, + "grad_norm": 9.592606544494629, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8555507659912109, + "num_tokens": 808837086.0, + "step": 21197 + }, + { + "epoch": 2.696603485561633, + "ewc_loss": 0.08038030564785004, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042782651144079864, + "grad_norm": 9.379402160644531, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8646969795227051, + "num_tokens": 808875261.0, + "step": 21198 + }, + { + "epoch": 2.6967306958402237, + "ewc_loss": 0.08113313466310501, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004377962031867355, + "grad_norm": 9.570765495300293, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8727169632911682, + "num_tokens": 808913044.0, + "step": 21199 + }, + { + "epoch": 2.6968579061188143, + "ewc_loss": 0.08008839190006256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042734877206385136, + "grad_norm": 9.382012367248535, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8744441270828247, + "num_tokens": 808952374.0, + "step": 21200 + }, + { + "epoch": 2.696985116397405, + "ewc_loss": 0.08115407824516296, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004380056634545326, + "grad_norm": 9.60767650604248, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8661878108978271, + "num_tokens": 808994397.0, + "step": 21201 + }, + { + "epoch": 2.6971123266759953, + "ewc_loss": 0.08042971789836884, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042832063627429307, + "grad_norm": 9.322093963623047, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.868405818939209, + "num_tokens": 809036401.0, + "step": 21202 + }, + { + "epoch": 2.697239536954586, + "ewc_loss": 0.08163447678089142, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004403682250995189, + "grad_norm": 9.601099967956543, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8699396848678589, + "num_tokens": 809073073.0, + "step": 21203 + }, + { + "epoch": 2.6973667472331764, + "ewc_loss": 0.08019569516181946, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004284217720851302, + "grad_norm": 9.388484001159668, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8649053573608398, + "num_tokens": 809117348.0, + "step": 21204 + }, + { + "epoch": 2.697493957511767, + "ewc_loss": 0.08176787197589874, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004417021118570119, + "grad_norm": 9.605957984924316, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8783088326454163, + "num_tokens": 809151357.0, + "step": 21205 + }, + { + "epoch": 2.6976211677903574, + "ewc_loss": 0.08029118180274963, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004293766978662461, + "grad_norm": 9.446711540222168, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8597877025604248, + "num_tokens": 809189000.0, + "step": 21206 + }, + { + "epoch": 2.697748378068948, + "ewc_loss": 0.0812397301197052, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043886216008104384, + "grad_norm": 9.672310829162598, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8681736588478088, + "num_tokens": 809227380.0, + "step": 21207 + }, + { + "epoch": 2.6978755883475385, + "ewc_loss": 0.08025240153074265, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042898886022157967, + "grad_norm": 9.348126411437988, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8652033805847168, + "num_tokens": 809265131.0, + "step": 21208 + }, + { + "epoch": 2.698002798626129, + "ewc_loss": 0.08136409521102905, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000440105824964121, + "grad_norm": 9.705643653869629, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8639363050460815, + "num_tokens": 809297190.0, + "step": 21209 + }, + { + "epoch": 2.6981300089047195, + "ewc_loss": 0.08004515618085861, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004269164055585861, + "grad_norm": 9.408122062683105, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8732976913452148, + "num_tokens": 809336774.0, + "step": 21210 + }, + { + "epoch": 2.69825721918331, + "ewc_loss": 0.08142155408859253, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004406803927849978, + "grad_norm": 9.621288299560547, + "learning_rate": 1e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.8470849394798279, + "num_tokens": 809376612.0, + "step": 21211 + }, + { + "epoch": 2.6983844294619006, + "ewc_loss": 0.0802321583032608, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042878641397692263, + "grad_norm": 9.358785629272461, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8884403705596924, + "num_tokens": 809416058.0, + "step": 21212 + }, + { + "epoch": 2.698511639740491, + "ewc_loss": 0.08139709383249283, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000440435775090009, + "grad_norm": 9.61241340637207, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8777979016304016, + "num_tokens": 809449580.0, + "step": 21213 + }, + { + "epoch": 2.6986388500190817, + "ewc_loss": 0.08038246631622314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004302895104046911, + "grad_norm": 9.385138511657715, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8676767945289612, + "num_tokens": 809484983.0, + "step": 21214 + }, + { + "epoch": 2.698766060297672, + "ewc_loss": 0.0815695971250534, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004421608173288405, + "grad_norm": 9.655132293701172, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8692116141319275, + "num_tokens": 809521086.0, + "step": 21215 + }, + { + "epoch": 2.6988932705762627, + "ewc_loss": 0.080254927277565, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290140641387552, + "grad_norm": 9.358051300048828, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8636205196380615, + "num_tokens": 809562475.0, + "step": 21216 + }, + { + "epoch": 2.699020480854853, + "ewc_loss": 0.08181575685739517, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044462241930887103, + "grad_norm": 9.669951438903809, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8516958355903625, + "num_tokens": 809596686.0, + "step": 21217 + }, + { + "epoch": 2.6991476911334438, + "ewc_loss": 0.08028894662857056, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042935425881296396, + "grad_norm": 9.454614639282227, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8678957223892212, + "num_tokens": 809636614.0, + "step": 21218 + }, + { + "epoch": 2.699274901412034, + "ewc_loss": 0.08143778145313263, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004408426466397941, + "grad_norm": 9.655821800231934, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8535294532775879, + "num_tokens": 809674542.0, + "step": 21219 + }, + { + "epoch": 2.699402111690625, + "ewc_loss": 0.080643430352211, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043045778875239193, + "grad_norm": 9.338325500488281, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8829416036605835, + "num_tokens": 809716726.0, + "step": 21220 + }, + { + "epoch": 2.699529321969215, + "ewc_loss": 0.08165337890386581, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004429986292961985, + "grad_norm": 9.681591033935547, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8750653266906738, + "num_tokens": 809756790.0, + "step": 21221 + }, + { + "epoch": 2.699656532247806, + "ewc_loss": 0.08042307198047638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043069550883956254, + "grad_norm": 9.44878101348877, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8627291321754456, + "num_tokens": 809794616.0, + "step": 21222 + }, + { + "epoch": 2.699783742526396, + "ewc_loss": 0.08159372210502625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004424020880833268, + "grad_norm": 9.576526641845703, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.871906578540802, + "num_tokens": 809836662.0, + "step": 21223 + }, + { + "epoch": 2.6999109528049865, + "ewc_loss": 0.08060440421104431, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043250893941149116, + "grad_norm": 9.452170372009277, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8624261617660522, + "num_tokens": 809873812.0, + "step": 21224 + }, + { + "epoch": 2.700038163083577, + "ewc_loss": 0.08139778673648834, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004404426726978272, + "grad_norm": 9.624524116516113, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8657366633415222, + "num_tokens": 809913139.0, + "step": 21225 + }, + { + "epoch": 2.7001653733621676, + "ewc_loss": 0.08068430423736572, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004333078977651894, + "grad_norm": 9.552614212036133, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8677877187728882, + "num_tokens": 809947741.0, + "step": 21226 + }, + { + "epoch": 2.700292583640758, + "ewc_loss": 0.08093468844890594, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004358117002993822, + "grad_norm": 9.512961387634277, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8575356602668762, + "num_tokens": 809987120.0, + "step": 21227 + }, + { + "epoch": 2.7004197939193486, + "ewc_loss": 0.0809798389673233, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043626324622891843, + "grad_norm": 9.499652862548828, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8575239777565002, + "num_tokens": 810026972.0, + "step": 21228 + }, + { + "epoch": 2.700547004197939, + "ewc_loss": 0.08097715675830841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004362364124972373, + "grad_norm": 9.57337760925293, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8653128147125244, + "num_tokens": 810062767.0, + "step": 21229 + }, + { + "epoch": 2.7006742144765297, + "ewc_loss": 0.0808030515909195, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344953049439937, + "grad_norm": 9.475363731384277, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8741927146911621, + "num_tokens": 810096930.0, + "step": 21230 + }, + { + "epoch": 2.70080142475512, + "ewc_loss": 0.08120135217905045, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004384783678688109, + "grad_norm": 9.59403133392334, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8515279293060303, + "num_tokens": 810134600.0, + "step": 21231 + }, + { + "epoch": 2.7009286350337107, + "ewc_loss": 0.08058926463127136, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004323575412854552, + "grad_norm": 9.436331748962402, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8674156069755554, + "num_tokens": 810169535.0, + "step": 21232 + }, + { + "epoch": 2.7010558453123013, + "ewc_loss": 0.0815042108297348, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043906556675210595, + "grad_norm": 9.59317398071289, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8538516759872437, + "num_tokens": 810205288.0, + "step": 21233 + }, + { + "epoch": 2.701183055590892, + "ewc_loss": 0.08062015473842621, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004326663911342621, + "grad_norm": 9.52013111114502, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8626057505607605, + "num_tokens": 810242868.0, + "step": 21234 + }, + { + "epoch": 2.7013102658694823, + "ewc_loss": 0.08114586770534515, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004379235324449837, + "grad_norm": 9.587676048278809, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8537288308143616, + "num_tokens": 810286539.0, + "step": 21235 + }, + { + "epoch": 2.701437476148073, + "ewc_loss": 0.08083389699459076, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004348038346506655, + "grad_norm": 9.536616325378418, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8683613538742065, + "num_tokens": 810331598.0, + "step": 21236 + }, + { + "epoch": 2.7015646864266634, + "ewc_loss": 0.08092458546161652, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043571071000769734, + "grad_norm": 9.555347442626953, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8739683628082275, + "num_tokens": 810369501.0, + "step": 21237 + }, + { + "epoch": 2.701691896705254, + "ewc_loss": 0.0807780921459198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004342457978054881, + "grad_norm": 9.52453327178955, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8626527190208435, + "num_tokens": 810410727.0, + "step": 21238 + }, + { + "epoch": 2.7018191069838444, + "ewc_loss": 0.08089424669742584, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043540727347135544, + "grad_norm": 9.588663101196289, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.857538104057312, + "num_tokens": 810450315.0, + "step": 21239 + }, + { + "epoch": 2.701946317262435, + "ewc_loss": 0.08059439063072205, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043240879313088953, + "grad_norm": 9.553003311157227, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8562714457511902, + "num_tokens": 810493972.0, + "step": 21240 + }, + { + "epoch": 2.7020735275410255, + "ewc_loss": 0.08070249855518341, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043348988401703537, + "grad_norm": 9.478106498718262, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8776940703392029, + "num_tokens": 810526038.0, + "step": 21241 + }, + { + "epoch": 2.7022007378196156, + "ewc_loss": 0.08074215054512024, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043388630729168653, + "grad_norm": 9.51060676574707, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8700382709503174, + "num_tokens": 810560772.0, + "step": 21242 + }, + { + "epoch": 2.7023279480982065, + "ewc_loss": 0.08062516152858734, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004327164788264781, + "grad_norm": 9.468133926391602, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8717211484909058, + "num_tokens": 810598322.0, + "step": 21243 + }, + { + "epoch": 2.7024551583767966, + "ewc_loss": 0.08084972202777863, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004349620430730283, + "grad_norm": 9.515810012817383, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8854237794876099, + "num_tokens": 810638652.0, + "step": 21244 + }, + { + "epoch": 2.7025823686553876, + "ewc_loss": 0.08066065609455109, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043307137093506753, + "grad_norm": 9.420008659362793, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.862882137298584, + "num_tokens": 810680318.0, + "step": 21245 + }, + { + "epoch": 2.7027095789339777, + "ewc_loss": 0.08107535541057587, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043721834663301706, + "grad_norm": 9.528425216674805, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8680086731910706, + "num_tokens": 810715001.0, + "step": 21246 + }, + { + "epoch": 2.7028367892125686, + "ewc_loss": 0.08056750148534775, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004321398737374693, + "grad_norm": 9.404412269592285, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8565355539321899, + "num_tokens": 810748897.0, + "step": 21247 + }, + { + "epoch": 2.7029639994911587, + "ewc_loss": 0.08128677308559418, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043933253618888557, + "grad_norm": 9.565948486328125, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8600742220878601, + "num_tokens": 810788954.0, + "step": 21248 + }, + { + "epoch": 2.7030912097697493, + "ewc_loss": 0.08054797351360321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004319446161389351, + "grad_norm": 9.393972396850586, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8680319786071777, + "num_tokens": 810828897.0, + "step": 21249 + }, + { + "epoch": 2.70321842004834, + "ewc_loss": 0.08133093267679214, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004397741868160665, + "grad_norm": 9.596327781677246, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8825348615646362, + "num_tokens": 810866698.0, + "step": 21250 + }, + { + "epoch": 2.7033456303269303, + "ewc_loss": 0.08049266040325165, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004313914105296135, + "grad_norm": 9.395750999450684, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8579084277153015, + "num_tokens": 810903140.0, + "step": 21251 + }, + { + "epoch": 2.703472840605521, + "ewc_loss": 0.08140765130519867, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044054139289073646, + "grad_norm": 9.563666343688965, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8678869009017944, + "num_tokens": 810940664.0, + "step": 21252 + }, + { + "epoch": 2.7036000508841114, + "ewc_loss": 0.08053076267242432, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043177249608561397, + "grad_norm": 9.371267318725586, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8637255430221558, + "num_tokens": 810981818.0, + "step": 21253 + }, + { + "epoch": 2.703727261162702, + "ewc_loss": 0.08146390318870544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004411038535181433, + "grad_norm": 9.528340339660645, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.86466383934021, + "num_tokens": 811019897.0, + "step": 21254 + }, + { + "epoch": 2.7038544714412924, + "ewc_loss": 0.08075497299432755, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043401456787250936, + "grad_norm": 9.459820747375488, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8678310513496399, + "num_tokens": 811056310.0, + "step": 21255 + }, + { + "epoch": 2.703981681719883, + "ewc_loss": 0.08128315210342407, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043929641833528876, + "grad_norm": 9.538932800292969, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8559702634811401, + "num_tokens": 811091505.0, + "step": 21256 + }, + { + "epoch": 2.7041088919984735, + "ewc_loss": 0.08079857379198074, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344505723565817, + "grad_norm": 9.380807876586914, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.878468930721283, + "num_tokens": 811134861.0, + "step": 21257 + }, + { + "epoch": 2.704236102277064, + "ewc_loss": 0.08149661123752594, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044143100967630744, + "grad_norm": 9.642951011657715, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8545503616333008, + "num_tokens": 811170001.0, + "step": 21258 + }, + { + "epoch": 2.7043633125556545, + "ewc_loss": 0.08044782280921936, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043094303691759706, + "grad_norm": 9.384499549865723, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8704767227172852, + "num_tokens": 811204203.0, + "step": 21259 + }, + { + "epoch": 2.704490522834245, + "ewc_loss": 0.08173888176679611, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044385367073118687, + "grad_norm": 9.633116722106934, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8682347536087036, + "num_tokens": 811244598.0, + "step": 21260 + }, + { + "epoch": 2.7046177331128356, + "ewc_loss": 0.08040176331996918, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043048246880061924, + "grad_norm": 9.371883392333984, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8758879899978638, + "num_tokens": 811283352.0, + "step": 21261 + }, + { + "epoch": 2.704744943391426, + "ewc_loss": 0.08165577799081802, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044302261085249484, + "grad_norm": 9.693026542663574, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8538336753845215, + "num_tokens": 811317605.0, + "step": 21262 + }, + { + "epoch": 2.7048721536700167, + "ewc_loss": 0.08026288449764252, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290936340112239, + "grad_norm": 9.346067428588867, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8802973031997681, + "num_tokens": 811352117.0, + "step": 21263 + }, + { + "epoch": 2.704999363948607, + "ewc_loss": 0.08206984400749207, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044472189620137215, + "grad_norm": 9.663637161254883, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8590652942657471, + "num_tokens": 811386118.0, + "step": 21264 + }, + { + "epoch": 2.7051265742271977, + "ewc_loss": 0.08036583662033081, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043012318201363087, + "grad_norm": 9.423872947692871, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8743529915809631, + "num_tokens": 811424479.0, + "step": 21265 + }, + { + "epoch": 2.7052537845057882, + "ewc_loss": 0.08147238194942474, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004411886620800942, + "grad_norm": 9.658267974853516, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8532363176345825, + "num_tokens": 811464683.0, + "step": 21266 + }, + { + "epoch": 2.7053809947843783, + "ewc_loss": 0.08080040663480759, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043202750384807587, + "grad_norm": 9.460149765014648, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.88164222240448, + "num_tokens": 811504801.0, + "step": 21267 + }, + { + "epoch": 2.7055082050629693, + "ewc_loss": 0.08133083581924438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043977322638966143, + "grad_norm": 9.685989379882812, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8571511507034302, + "num_tokens": 811546278.0, + "step": 21268 + }, + { + "epoch": 2.7056354153415594, + "ewc_loss": 0.08035320043563843, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042999687138944864, + "grad_norm": 9.48859691619873, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8510919809341431, + "num_tokens": 811587019.0, + "step": 21269 + }, + { + "epoch": 2.7057626256201504, + "ewc_loss": 0.08120487630367279, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043851358350366354, + "grad_norm": 9.610774993896484, + "learning_rate": 1e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8412240743637085, + "num_tokens": 811619843.0, + "step": 21270 + }, + { + "epoch": 2.7058898358987404, + "ewc_loss": 0.0803772509098053, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004302372981328517, + "grad_norm": 9.419464111328125, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8710685968399048, + "num_tokens": 811655797.0, + "step": 21271 + }, + { + "epoch": 2.706017046177331, + "ewc_loss": 0.08134377747774124, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043990262201987207, + "grad_norm": 9.66199779510498, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8555301427841187, + "num_tokens": 811691626.0, + "step": 21272 + }, + { + "epoch": 2.7061442564559215, + "ewc_loss": 0.08024351298809052, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042890000622719526, + "grad_norm": 9.402803421020508, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8668297529220581, + "num_tokens": 811726046.0, + "step": 21273 + }, + { + "epoch": 2.706271466734512, + "ewc_loss": 0.0814744234085083, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412091220729053, + "grad_norm": 9.690655708312988, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8562558889389038, + "num_tokens": 811765686.0, + "step": 21274 + }, + { + "epoch": 2.7063986770131025, + "ewc_loss": 0.08027850091457367, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004292498924769461, + "grad_norm": 9.50104808807373, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8734254240989685, + "num_tokens": 811801143.0, + "step": 21275 + }, + { + "epoch": 2.706525887291693, + "ewc_loss": 0.08125783503055573, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004390431859064847, + "grad_norm": 9.78508186340332, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8732713460922241, + "num_tokens": 811834807.0, + "step": 21276 + }, + { + "epoch": 2.7066530975702836, + "ewc_loss": 0.08005829155445099, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042704775114543736, + "grad_norm": 9.40092945098877, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8834457397460938, + "num_tokens": 811865676.0, + "step": 21277 + }, + { + "epoch": 2.706780307848874, + "ewc_loss": 0.08175895363092422, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044161296682432294, + "grad_norm": 9.893970489501953, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8608815670013428, + "num_tokens": 811901259.0, + "step": 21278 + }, + { + "epoch": 2.7069075181274647, + "ewc_loss": 0.07966107130050659, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042307551484555006, + "grad_norm": 9.281867980957031, + "learning_rate": 1e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8508355617523193, + "num_tokens": 811937768.0, + "step": 21279 + }, + { + "epoch": 2.707034728406055, + "ewc_loss": 0.08204367756843567, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004469016566872597, + "grad_norm": 10.099509239196777, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8647221922874451, + "num_tokens": 811967805.0, + "step": 21280 + }, + { + "epoch": 2.7071619386846457, + "ewc_loss": 0.07919921725988388, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000418457027990371, + "grad_norm": 9.16425895690918, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.860941469669342, + "num_tokens": 812007468.0, + "step": 21281 + }, + { + "epoch": 2.7072891489632362, + "ewc_loss": 0.08311071991920471, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045757205225527287, + "grad_norm": 10.102729797363281, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8663614392280579, + "num_tokens": 812048127.0, + "step": 21282 + }, + { + "epoch": 2.7074163592418268, + "ewc_loss": 0.07944045960903168, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042086938628926873, + "grad_norm": 9.198037147521973, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8653377294540405, + "num_tokens": 812083904.0, + "step": 21283 + }, + { + "epoch": 2.7075435695204173, + "ewc_loss": 0.08309514820575714, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004574163176584989, + "grad_norm": 9.983114242553711, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8691732287406921, + "num_tokens": 812119643.0, + "step": 21284 + }, + { + "epoch": 2.707670779799008, + "ewc_loss": 0.08007949590682983, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004272598307579756, + "grad_norm": 9.375870704650879, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8631874322891235, + "num_tokens": 812155338.0, + "step": 21285 + }, + { + "epoch": 2.7077979900775984, + "ewc_loss": 0.08280199766159058, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004520433722063899, + "grad_norm": 9.935407638549805, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8644462823867798, + "num_tokens": 812193075.0, + "step": 21286 + }, + { + "epoch": 2.707925200356189, + "ewc_loss": 0.08024948835372925, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042895975639112294, + "grad_norm": 9.53141975402832, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8742270469665527, + "num_tokens": 812228163.0, + "step": 21287 + }, + { + "epoch": 2.7080524106347794, + "ewc_loss": 0.081700898706913, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044347383663989604, + "grad_norm": 9.763650894165039, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8598600029945374, + "num_tokens": 812268565.0, + "step": 21288 + }, + { + "epoch": 2.70817962091337, + "ewc_loss": 0.08033491671085358, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042981404112651944, + "grad_norm": 9.477554321289062, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8708645105361938, + "num_tokens": 812307292.0, + "step": 21289 + }, + { + "epoch": 2.7083068311919605, + "ewc_loss": 0.0811760425567627, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043822528095915914, + "grad_norm": 9.681167602539062, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8754621148109436, + "num_tokens": 812346180.0, + "step": 21290 + }, + { + "epoch": 2.708434041470551, + "ewc_loss": 0.0803549587726593, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004300144501030445, + "grad_norm": 9.511560440063477, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.861298143863678, + "num_tokens": 812376033.0, + "step": 21291 + }, + { + "epoch": 2.708561251749141, + "ewc_loss": 0.08099165558815002, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043638134957291186, + "grad_norm": 9.658369064331055, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8834964632987976, + "num_tokens": 812407925.0, + "step": 21292 + }, + { + "epoch": 2.708688462027732, + "ewc_loss": 0.08038712292909622, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004303360765334219, + "grad_norm": 9.51386833190918, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8699324131011963, + "num_tokens": 812452354.0, + "step": 21293 + }, + { + "epoch": 2.708815672306322, + "ewc_loss": 0.08075025677680969, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043396741966716945, + "grad_norm": 9.594350814819336, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8639064431190491, + "num_tokens": 812490015.0, + "step": 21294 + }, + { + "epoch": 2.708942882584913, + "ewc_loss": 0.08035974204540253, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004300622676964849, + "grad_norm": 9.483780860900879, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8627846240997314, + "num_tokens": 812529919.0, + "step": 21295 + }, + { + "epoch": 2.709070092863503, + "ewc_loss": 0.08078143745660782, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043427920900285244, + "grad_norm": 9.542652130126953, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8780810832977295, + "num_tokens": 812568541.0, + "step": 21296 + }, + { + "epoch": 2.7091973031420937, + "ewc_loss": 0.08044975996017456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004309623909648508, + "grad_norm": 9.4580078125, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8569451570510864, + "num_tokens": 812612098.0, + "step": 21297 + }, + { + "epoch": 2.7093245134206843, + "ewc_loss": 0.08095042407512665, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004335277189966291, + "grad_norm": 9.62397289276123, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.861303985118866, + "num_tokens": 812648843.0, + "step": 21298 + }, + { + "epoch": 2.709451723699275, + "ewc_loss": 0.08045460283756256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004310108779463917, + "grad_norm": 9.447498321533203, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.866834819316864, + "num_tokens": 812692080.0, + "step": 21299 + }, + { + "epoch": 2.7095789339778653, + "ewc_loss": 0.0809209868311882, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043567470856942236, + "grad_norm": 9.578375816345215, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8796258568763733, + "num_tokens": 812729441.0, + "step": 21300 + }, + { + "epoch": 2.709706144256456, + "ewc_loss": 0.08036737143993378, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430138548836112, + "grad_norm": 9.527323722839355, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8597098588943481, + "num_tokens": 812768908.0, + "step": 21301 + }, + { + "epoch": 2.7098333545350464, + "ewc_loss": 0.08078667521476746, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000434331625001505, + "grad_norm": 9.50390625, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8644942045211792, + "num_tokens": 812808384.0, + "step": 21302 + }, + { + "epoch": 2.709960564813637, + "ewc_loss": 0.08068402111530304, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043330504558980465, + "grad_norm": 9.519570350646973, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8575552701950073, + "num_tokens": 812847658.0, + "step": 21303 + }, + { + "epoch": 2.7100877750922274, + "ewc_loss": 0.08058875799179077, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043235241901129484, + "grad_norm": 9.515893936157227, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8656261563301086, + "num_tokens": 812889394.0, + "step": 21304 + }, + { + "epoch": 2.710214985370818, + "ewc_loss": 0.08065448701381683, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043300967081449926, + "grad_norm": 9.527214050292969, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8511326313018799, + "num_tokens": 812924202.0, + "step": 21305 + }, + { + "epoch": 2.7103421956494085, + "ewc_loss": 0.08068782836198807, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043334311340004206, + "grad_norm": 9.514595031738281, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8849791288375854, + "num_tokens": 812954391.0, + "step": 21306 + }, + { + "epoch": 2.710469405927999, + "ewc_loss": 0.08061259984970093, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043259080848656595, + "grad_norm": 9.460882186889648, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8560091853141785, + "num_tokens": 812989498.0, + "step": 21307 + }, + { + "epoch": 2.7105966162065895, + "ewc_loss": 0.08088591694831848, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043532406562007964, + "grad_norm": 9.521464347839355, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.871080756187439, + "num_tokens": 813030968.0, + "step": 21308 + }, + { + "epoch": 2.71072382648518, + "ewc_loss": 0.08067977428436279, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004332625831011683, + "grad_norm": 9.506316184997559, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8888130784034729, + "num_tokens": 813065843.0, + "step": 21309 + }, + { + "epoch": 2.7108510367637706, + "ewc_loss": 0.08075208961963654, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043398572597652674, + "grad_norm": 9.473854064941406, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8705294132232666, + "num_tokens": 813107410.0, + "step": 21310 + }, + { + "epoch": 2.710978247042361, + "ewc_loss": 0.08070992678403854, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004335641278885305, + "grad_norm": 9.427583694458008, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8731703758239746, + "num_tokens": 813155959.0, + "step": 21311 + }, + { + "epoch": 2.7111054573209517, + "ewc_loss": 0.08097843080759048, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004362491599749774, + "grad_norm": 9.545042037963867, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8720505833625793, + "num_tokens": 813194133.0, + "step": 21312 + }, + { + "epoch": 2.711232667599542, + "ewc_loss": 0.08058636635541916, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004323284956626594, + "grad_norm": 9.514471054077148, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8541200160980225, + "num_tokens": 813231015.0, + "step": 21313 + }, + { + "epoch": 2.7113598778781327, + "ewc_loss": 0.08095714449882507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043603626545518637, + "grad_norm": 9.508102416992188, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8815118670463562, + "num_tokens": 813271497.0, + "step": 21314 + }, + { + "epoch": 2.711487088156723, + "ewc_loss": 0.08077952265739441, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043426008778624237, + "grad_norm": 9.486763954162598, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.866068959236145, + "num_tokens": 813305758.0, + "step": 21315 + }, + { + "epoch": 2.7116142984353138, + "ewc_loss": 0.08098315447568893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043629639549180865, + "grad_norm": 9.543289184570312, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8640627861022949, + "num_tokens": 813345958.0, + "step": 21316 + }, + { + "epoch": 2.711741508713904, + "ewc_loss": 0.08070211857557297, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004334860423114151, + "grad_norm": 9.502198219299316, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8684881925582886, + "num_tokens": 813381630.0, + "step": 21317 + }, + { + "epoch": 2.711868718992495, + "ewc_loss": 0.0808774083852768, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043523890781216323, + "grad_norm": 9.514873504638672, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8645197153091431, + "num_tokens": 813421320.0, + "step": 21318 + }, + { + "epoch": 2.711995929271085, + "ewc_loss": 0.080784872174263, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043431358062662184, + "grad_norm": 9.493000030517578, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8615434169769287, + "num_tokens": 813458323.0, + "step": 21319 + }, + { + "epoch": 2.712123139549676, + "ewc_loss": 0.08099763095378876, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043644115794450045, + "grad_norm": 9.544589042663574, + "learning_rate": 1e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8473140597343445, + "num_tokens": 813501436.0, + "step": 21320 + }, + { + "epoch": 2.712250349828266, + "ewc_loss": 0.08055703341960907, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000432035158155486, + "grad_norm": 9.477560043334961, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8660418391227722, + "num_tokens": 813545942.0, + "step": 21321 + }, + { + "epoch": 2.7123775601068565, + "ewc_loss": 0.08106046915054321, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004370695096440613, + "grad_norm": 9.515830039978027, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8604907989501953, + "num_tokens": 813582202.0, + "step": 21322 + }, + { + "epoch": 2.712504770385447, + "ewc_loss": 0.0806596428155899, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043306127190589905, + "grad_norm": 9.488870620727539, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8512274026870728, + "num_tokens": 813621711.0, + "step": 21323 + }, + { + "epoch": 2.7126319806640375, + "ewc_loss": 0.0809195265173912, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043566012755036354, + "grad_norm": 9.49781322479248, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8605849742889404, + "num_tokens": 813665404.0, + "step": 21324 + }, + { + "epoch": 2.712759190942628, + "ewc_loss": 0.08078433573246002, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004343082255218178, + "grad_norm": 9.505335807800293, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8714556694030762, + "num_tokens": 813697470.0, + "step": 21325 + }, + { + "epoch": 2.7128864012212186, + "ewc_loss": 0.0808374434709549, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004348392831161618, + "grad_norm": 9.520866394042969, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8745549917221069, + "num_tokens": 813733308.0, + "step": 21326 + }, + { + "epoch": 2.713013611499809, + "ewc_loss": 0.08070747554302216, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000433539564255625, + "grad_norm": 9.425387382507324, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8528875112533569, + "num_tokens": 813770881.0, + "step": 21327 + }, + { + "epoch": 2.7131408217783997, + "ewc_loss": 0.08093380928039551, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004358028818387538, + "grad_norm": 9.502737998962402, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8697636127471924, + "num_tokens": 813806495.0, + "step": 21328 + }, + { + "epoch": 2.71326803205699, + "ewc_loss": 0.08083945512771606, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004348593938630074, + "grad_norm": 9.481720924377441, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8697295188903809, + "num_tokens": 813842408.0, + "step": 21329 + }, + { + "epoch": 2.7133952423355807, + "ewc_loss": 0.08093579113483429, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043582275975495577, + "grad_norm": 9.435091018676758, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8623006939888, + "num_tokens": 813883041.0, + "step": 21330 + }, + { + "epoch": 2.7135224526141712, + "ewc_loss": 0.081067755818367, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000437142385635525, + "grad_norm": 9.548909187316895, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8611791133880615, + "num_tokens": 813922901.0, + "step": 21331 + }, + { + "epoch": 2.7136496628927618, + "ewc_loss": 0.08060944080352783, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004325592308305204, + "grad_norm": 9.430438041687012, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8671935796737671, + "num_tokens": 813961948.0, + "step": 21332 + }, + { + "epoch": 2.7137768731713523, + "ewc_loss": 0.08153878897428513, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004369699163362384, + "grad_norm": 9.980287551879883, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8802523016929626, + "num_tokens": 813999194.0, + "step": 21333 + }, + { + "epoch": 2.713904083449943, + "ewc_loss": 0.08000990748405457, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004241225542500615, + "grad_norm": 9.245888710021973, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8554205298423767, + "num_tokens": 814038986.0, + "step": 21334 + }, + { + "epoch": 2.7140312937285334, + "ewc_loss": 0.08226511627435684, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044911602162756026, + "grad_norm": 9.706635475158691, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.880649745464325, + "num_tokens": 814074351.0, + "step": 21335 + }, + { + "epoch": 2.714158504007124, + "ewc_loss": 0.07975491881370544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042401408427394927, + "grad_norm": 9.260069847106934, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8659393787384033, + "num_tokens": 814109255.0, + "step": 21336 + }, + { + "epoch": 2.7142857142857144, + "ewc_loss": 0.08217164874076843, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044818129390478134, + "grad_norm": 9.685196876525879, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.861295223236084, + "num_tokens": 814146601.0, + "step": 21337 + }, + { + "epoch": 2.714412924564305, + "ewc_loss": 0.08035637438297272, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000427587132435292, + "grad_norm": 9.367033958435059, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8712369203567505, + "num_tokens": 814181299.0, + "step": 21338 + }, + { + "epoch": 2.7145401348428955, + "ewc_loss": 0.0819568783044815, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044603360584005713, + "grad_norm": 9.64848804473877, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8759595155715942, + "num_tokens": 814218178.0, + "step": 21339 + }, + { + "epoch": 2.7146673451214856, + "ewc_loss": 0.08031708002090454, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004296356928534806, + "grad_norm": 9.331311225891113, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8682383298873901, + "num_tokens": 814254161.0, + "step": 21340 + }, + { + "epoch": 2.7147945554000765, + "ewc_loss": 0.0818263590335846, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004447284445632249, + "grad_norm": 9.681236267089844, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8657702207565308, + "num_tokens": 814290164.0, + "step": 21341 + }, + { + "epoch": 2.7149217656786666, + "ewc_loss": 0.08026447147130966, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042910955380648375, + "grad_norm": 9.33953857421875, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8445666432380676, + "num_tokens": 814334459.0, + "step": 21342 + }, + { + "epoch": 2.7150489759572576, + "ewc_loss": 0.08201372623443604, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044660212006419897, + "grad_norm": 9.723730087280273, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8574217557907104, + "num_tokens": 814374291.0, + "step": 21343 + }, + { + "epoch": 2.7151761862358477, + "ewc_loss": 0.0802222192287445, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042868705349974334, + "grad_norm": 9.36540699005127, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8619379997253418, + "num_tokens": 814413196.0, + "step": 21344 + }, + { + "epoch": 2.7153033965144386, + "ewc_loss": 0.08204793184995651, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044694417738355696, + "grad_norm": 9.701391220092773, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8633532524108887, + "num_tokens": 814455312.0, + "step": 21345 + }, + { + "epoch": 2.7154306067930287, + "ewc_loss": 0.08013655990362167, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004278304404579103, + "grad_norm": 9.362678527832031, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8616263270378113, + "num_tokens": 814496586.0, + "step": 21346 + }, + { + "epoch": 2.7155578170716193, + "ewc_loss": 0.08170516788959503, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004435165028553456, + "grad_norm": 9.68397045135498, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8718609809875488, + "num_tokens": 814531566.0, + "step": 21347 + }, + { + "epoch": 2.71568502735021, + "ewc_loss": 0.08026167005300522, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290815268177539, + "grad_norm": 9.384663581848145, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8727465867996216, + "num_tokens": 814566168.0, + "step": 21348 + }, + { + "epoch": 2.7158122376288003, + "ewc_loss": 0.08179876208305359, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004444525111466646, + "grad_norm": 9.773506164550781, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8663395643234253, + "num_tokens": 814604927.0, + "step": 21349 + }, + { + "epoch": 2.715939447907391, + "ewc_loss": 0.07997909188270569, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004262557195033878, + "grad_norm": 9.32314395904541, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.879889726638794, + "num_tokens": 814641401.0, + "step": 21350 + }, + { + "epoch": 2.7160666581859814, + "ewc_loss": 0.08203176409006119, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044678247650153935, + "grad_norm": 9.7586669921875, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8716760873794556, + "num_tokens": 814674252.0, + "step": 21351 + }, + { + "epoch": 2.716193868464572, + "ewc_loss": 0.0800311416387558, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004267762997187674, + "grad_norm": 9.38287353515625, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8654658794403076, + "num_tokens": 814704965.0, + "step": 21352 + }, + { + "epoch": 2.7163210787431624, + "ewc_loss": 0.08170881122350693, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044355294085107744, + "grad_norm": 9.69739818572998, + "learning_rate": 1e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.836581826210022, + "num_tokens": 814743740.0, + "step": 21353 + }, + { + "epoch": 2.716448289021753, + "ewc_loss": 0.08036056160926819, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004300705040805042, + "grad_norm": 9.433101654052734, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8721840381622314, + "num_tokens": 814784598.0, + "step": 21354 + }, + { + "epoch": 2.7165754993003435, + "ewc_loss": 0.08139921724796295, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004404569626785815, + "grad_norm": 9.645760536193848, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8593943119049072, + "num_tokens": 814821437.0, + "step": 21355 + }, + { + "epoch": 2.716702709578934, + "ewc_loss": 0.08025974035263062, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004290622309781611, + "grad_norm": 9.409679412841797, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8634711503982544, + "num_tokens": 814861638.0, + "step": 21356 + }, + { + "epoch": 2.7168299198575245, + "ewc_loss": 0.08154994994401932, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004419643373694271, + "grad_norm": 9.686210632324219, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8669414520263672, + "num_tokens": 814893799.0, + "step": 21357 + }, + { + "epoch": 2.716957130136115, + "ewc_loss": 0.08034913241863251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042995615513063967, + "grad_norm": 9.444930076599121, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8736911416053772, + "num_tokens": 814930726.0, + "step": 21358 + }, + { + "epoch": 2.7170843404147056, + "ewc_loss": 0.08120232820510864, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043848808854818344, + "grad_norm": 9.567906379699707, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.864997923374176, + "num_tokens": 814970041.0, + "step": 21359 + }, + { + "epoch": 2.717211550693296, + "ewc_loss": 0.08040317893028259, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304966132622212, + "grad_norm": 9.435157775878906, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8685335516929626, + "num_tokens": 815008587.0, + "step": 21360 + }, + { + "epoch": 2.7173387609718866, + "ewc_loss": 0.08109060674905777, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043737090891227126, + "grad_norm": 9.57049560546875, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.85260009765625, + "num_tokens": 815047659.0, + "step": 21361 + }, + { + "epoch": 2.717465971250477, + "ewc_loss": 0.08051314949989319, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043159633059985936, + "grad_norm": 9.427508354187012, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8656669855117798, + "num_tokens": 815082631.0, + "step": 21362 + }, + { + "epoch": 2.7175931815290677, + "ewc_loss": 0.08096680790185928, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004361329192761332, + "grad_norm": 9.54775619506836, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8837018013000488, + "num_tokens": 815117568.0, + "step": 21363 + }, + { + "epoch": 2.7177203918076582, + "ewc_loss": 0.08058807998895645, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004323456378187984, + "grad_norm": 9.43281364440918, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8659826517105103, + "num_tokens": 815155019.0, + "step": 21364 + }, + { + "epoch": 2.7178476020862483, + "ewc_loss": 0.08098478615283966, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004363126936368644, + "grad_norm": 9.444171905517578, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8655841946601868, + "num_tokens": 815195386.0, + "step": 21365 + }, + { + "epoch": 2.7179748123648393, + "ewc_loss": 0.08087264001369476, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004351912357378751, + "grad_norm": 9.518768310546875, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8710048198699951, + "num_tokens": 815228304.0, + "step": 21366 + }, + { + "epoch": 2.7181020226434294, + "ewc_loss": 0.0808187946677208, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004346527857705951, + "grad_norm": 9.456090927124023, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8849060535430908, + "num_tokens": 815260111.0, + "step": 21367 + }, + { + "epoch": 2.7182292329220203, + "ewc_loss": 0.08115090429782867, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043797388207167387, + "grad_norm": 9.504027366638184, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8634904623031616, + "num_tokens": 815297272.0, + "step": 21368 + }, + { + "epoch": 2.7183564432006104, + "ewc_loss": 0.0807643011212349, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004341078456491232, + "grad_norm": 9.49843978881836, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8648982048034668, + "num_tokens": 815336016.0, + "step": 21369 + }, + { + "epoch": 2.718483653479201, + "ewc_loss": 0.08099369704723358, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043640186777338386, + "grad_norm": 9.488370895385742, + "learning_rate": 1e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8384597301483154, + "num_tokens": 815373940.0, + "step": 21370 + }, + { + "epoch": 2.7186108637577915, + "ewc_loss": 0.08083844184875488, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043484929483383894, + "grad_norm": 9.577824592590332, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8540410995483398, + "num_tokens": 815410836.0, + "step": 21371 + }, + { + "epoch": 2.718738074036382, + "ewc_loss": 0.08072544634342194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043371928040869534, + "grad_norm": 9.47769546508789, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8714104890823364, + "num_tokens": 815449577.0, + "step": 21372 + }, + { + "epoch": 2.7188652843149725, + "ewc_loss": 0.08105644583702087, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043702934635803103, + "grad_norm": 9.543407440185547, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8681729435920715, + "num_tokens": 815485869.0, + "step": 21373 + }, + { + "epoch": 2.718992494593563, + "ewc_loss": 0.08075670897960663, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004340318846516311, + "grad_norm": 9.422298431396484, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8459635972976685, + "num_tokens": 815523614.0, + "step": 21374 + }, + { + "epoch": 2.7191197048721536, + "ewc_loss": 0.08125107735395432, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043897563591599464, + "grad_norm": 9.58414077758789, + "learning_rate": 1e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8442935943603516, + "num_tokens": 815561835.0, + "step": 21375 + }, + { + "epoch": 2.719246915150744, + "ewc_loss": 0.08060353994369507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043250020826235414, + "grad_norm": 9.387417793273926, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.849331259727478, + "num_tokens": 815605596.0, + "step": 21376 + }, + { + "epoch": 2.7193741254293347, + "ewc_loss": 0.08148332685232162, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000441298121586442, + "grad_norm": 9.57722282409668, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8591257929801941, + "num_tokens": 815638720.0, + "step": 21377 + }, + { + "epoch": 2.719501335707925, + "ewc_loss": 0.08060655742883682, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004325304180383682, + "grad_norm": 9.393912315368652, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8799173831939697, + "num_tokens": 815673996.0, + "step": 21378 + }, + { + "epoch": 2.7196285459865157, + "ewc_loss": 0.08160147070884705, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004424795333761722, + "grad_norm": 9.61229133605957, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8521460294723511, + "num_tokens": 815716571.0, + "step": 21379 + }, + { + "epoch": 2.7197557562651062, + "ewc_loss": 0.0806194394826889, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004326592606958002, + "grad_norm": 9.386066436767578, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8723859190940857, + "num_tokens": 815748973.0, + "step": 21380 + }, + { + "epoch": 2.7198829665436968, + "ewc_loss": 0.08145390450954437, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044100394006818533, + "grad_norm": 9.595985412597656, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8678576946258545, + "num_tokens": 815794754.0, + "step": 21381 + }, + { + "epoch": 2.7200101768222873, + "ewc_loss": 0.08064407110214233, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000432905595516786, + "grad_norm": 9.39551067352295, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8672166466712952, + "num_tokens": 815831051.0, + "step": 21382 + }, + { + "epoch": 2.720137387100878, + "ewc_loss": 0.08156216889619827, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004420865443535149, + "grad_norm": 9.579442024230957, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8670133352279663, + "num_tokens": 815874291.0, + "step": 21383 + }, + { + "epoch": 2.7202645973794684, + "ewc_loss": 0.08060094714164734, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043247430585324764, + "grad_norm": 9.388262748718262, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8617790341377258, + "num_tokens": 815912484.0, + "step": 21384 + }, + { + "epoch": 2.720391807658059, + "ewc_loss": 0.08167210221290588, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004431858251336962, + "grad_norm": 9.61617374420166, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8495815992355347, + "num_tokens": 815948626.0, + "step": 21385 + }, + { + "epoch": 2.7205190179366494, + "ewc_loss": 0.08092460036277771, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004332694225013256, + "grad_norm": 9.382882118225098, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.876193642616272, + "num_tokens": 815983154.0, + "step": 21386 + }, + { + "epoch": 2.72064622821524, + "ewc_loss": 0.08170897513628006, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044355459976941347, + "grad_norm": 9.595358848571777, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8680561780929565, + "num_tokens": 816017524.0, + "step": 21387 + }, + { + "epoch": 2.7207734384938305, + "ewc_loss": 0.08115102350711823, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000433092238381505, + "grad_norm": 9.461234092712402, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8606788516044617, + "num_tokens": 816055204.0, + "step": 21388 + }, + { + "epoch": 2.720900648772421, + "ewc_loss": 0.08142416179180145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044070641160942614, + "grad_norm": 9.649757385253906, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8734065890312195, + "num_tokens": 816086671.0, + "step": 21389 + }, + { + "epoch": 2.721027859051011, + "ewc_loss": 0.08049269020557404, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000431391759775579, + "grad_norm": 9.52685260772705, + "learning_rate": 1e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8451151847839355, + "num_tokens": 816114173.0, + "step": 21390 + }, + { + "epoch": 2.721155069329602, + "ewc_loss": 0.08113762736320496, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004378411395009607, + "grad_norm": 9.489944458007812, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8774669766426086, + "num_tokens": 816156434.0, + "step": 21391 + }, + { + "epoch": 2.721282279608192, + "ewc_loss": 0.08083216845989227, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004347864887677133, + "grad_norm": 9.513401985168457, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8601266145706177, + "num_tokens": 816191037.0, + "step": 21392 + }, + { + "epoch": 2.721409489886783, + "ewc_loss": 0.08084392547607422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004349041264504194, + "grad_norm": 9.530618667602539, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.87962806224823, + "num_tokens": 816227031.0, + "step": 21393 + }, + { + "epoch": 2.721536700165373, + "ewc_loss": 0.080908864736557, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043555349111557007, + "grad_norm": 9.455779075622559, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8694502711296082, + "num_tokens": 816264308.0, + "step": 21394 + }, + { + "epoch": 2.7216639104439637, + "ewc_loss": 0.08092303574085236, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004356951976660639, + "grad_norm": 9.516681671142578, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8630561232566833, + "num_tokens": 816300955.0, + "step": 21395 + }, + { + "epoch": 2.7217911207225542, + "ewc_loss": 0.08074352145195007, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043390001519583166, + "grad_norm": 9.530491828918457, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.864540696144104, + "num_tokens": 816332881.0, + "step": 21396 + }, + { + "epoch": 2.7219183310011448, + "ewc_loss": 0.08103617280721664, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043438514694571495, + "grad_norm": 9.446449279785156, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8849028944969177, + "num_tokens": 816367446.0, + "step": 21397 + }, + { + "epoch": 2.7220455412797353, + "ewc_loss": 0.08097003400325775, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043616522452794015, + "grad_norm": 9.496939659118652, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8491318821907043, + "num_tokens": 816407286.0, + "step": 21398 + }, + { + "epoch": 2.722172751558326, + "ewc_loss": 0.08058768510818481, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043234165059402585, + "grad_norm": 9.443985939025879, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8754399418830872, + "num_tokens": 816445277.0, + "step": 21399 + }, + { + "epoch": 2.7222999618369164, + "ewc_loss": 0.08073662966489792, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043383112642914057, + "grad_norm": 9.405573844909668, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8723386526107788, + "num_tokens": 816481638.0, + "step": 21400 + }, + { + "epoch": 2.722427172115507, + "ewc_loss": 0.08130063116550446, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043702972470782697, + "grad_norm": 9.540143966674805, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8680756688117981, + "num_tokens": 816521386.0, + "step": 21401 + }, + { + "epoch": 2.7225543823940974, + "ewc_loss": 0.08042031526565552, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043066797661595047, + "grad_norm": 9.38037395477295, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8770803809165955, + "num_tokens": 816554760.0, + "step": 21402 + }, + { + "epoch": 2.722681592672688, + "ewc_loss": 0.08106978237628937, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004371627001091838, + "grad_norm": 9.473246574401855, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8741987943649292, + "num_tokens": 816595195.0, + "step": 21403 + }, + { + "epoch": 2.7228088029512785, + "ewc_loss": 0.08061861991882324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043265099520795047, + "grad_norm": 9.508054733276367, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8680993914604187, + "num_tokens": 816633046.0, + "step": 21404 + }, + { + "epoch": 2.722936013229869, + "ewc_loss": 0.08085322380065918, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004349971131887287, + "grad_norm": 9.472200393676758, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.880449116230011, + "num_tokens": 816675283.0, + "step": 21405 + }, + { + "epoch": 2.7230632235084595, + "ewc_loss": 0.08067244291305542, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004331892414484173, + "grad_norm": 9.447081565856934, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8572894334793091, + "num_tokens": 816716499.0, + "step": 21406 + }, + { + "epoch": 2.72319043378705, + "ewc_loss": 0.08071661740541458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043363100849092007, + "grad_norm": 9.572099685668945, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8602908849716187, + "num_tokens": 816761825.0, + "step": 21407 + }, + { + "epoch": 2.7233176440656406, + "ewc_loss": 0.08045005798339844, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004309653886593878, + "grad_norm": 9.405248641967773, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8730743527412415, + "num_tokens": 816792080.0, + "step": 21408 + }, + { + "epoch": 2.723444854344231, + "ewc_loss": 0.08098595589399338, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043632439337670803, + "grad_norm": 9.672080039978027, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8554822206497192, + "num_tokens": 816832121.0, + "step": 21409 + }, + { + "epoch": 2.7235720646228216, + "ewc_loss": 0.08006884157657623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042715322342701256, + "grad_norm": 9.33462142944336, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8581717014312744, + "num_tokens": 816874032.0, + "step": 21410 + }, + { + "epoch": 2.723699274901412, + "ewc_loss": 0.08127879351377487, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004392527916934341, + "grad_norm": 9.598881721496582, + "learning_rate": 1e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8471688032150269, + "num_tokens": 816920471.0, + "step": 21411 + }, + { + "epoch": 2.7238264851800027, + "ewc_loss": 0.08009974658489227, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000427462364314124, + "grad_norm": 9.428253173828125, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.865485668182373, + "num_tokens": 816958042.0, + "step": 21412 + }, + { + "epoch": 2.723953695458593, + "ewc_loss": 0.08122357726097107, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043870063382200897, + "grad_norm": 9.610936164855957, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8573221564292908, + "num_tokens": 816992158.0, + "step": 21413 + }, + { + "epoch": 2.7240809057371838, + "ewc_loss": 0.08013332635164261, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042779810610227287, + "grad_norm": 9.379831314086914, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8537925481796265, + "num_tokens": 817032552.0, + "step": 21414 + }, + { + "epoch": 2.724208116015774, + "ewc_loss": 0.08229145407676697, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.000439613766502589, + "grad_norm": 9.855467796325684, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8642337322235107, + "num_tokens": 817072240.0, + "step": 21415 + }, + { + "epoch": 2.724335326294365, + "ewc_loss": 0.07984992116689682, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042252265848219395, + "grad_norm": 9.27226448059082, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8558480739593506, + "num_tokens": 817104741.0, + "step": 21416 + }, + { + "epoch": 2.724462536572955, + "ewc_loss": 0.0820392519235611, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044685733155347407, + "grad_norm": 9.726905822753906, + "learning_rate": 1e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8533565998077393, + "num_tokens": 817143876.0, + "step": 21417 + }, + { + "epoch": 2.724589746851546, + "ewc_loss": 0.07955383509397507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004220031842123717, + "grad_norm": 9.233429908752441, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8889862298965454, + "num_tokens": 817183647.0, + "step": 21418 + }, + { + "epoch": 2.724716957130136, + "ewc_loss": 0.08212189376354218, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044768379302695394, + "grad_norm": 9.821743965148926, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8601300120353699, + "num_tokens": 817214520.0, + "step": 21419 + }, + { + "epoch": 2.7248441674087265, + "ewc_loss": 0.07999519258737564, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00042397534707561135, + "grad_norm": 9.251450538635254, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8550853729248047, + "num_tokens": 817256936.0, + "step": 21420 + }, + { + "epoch": 2.724971377687317, + "ewc_loss": 0.082212433218956, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044858912588097155, + "grad_norm": 9.729619979858398, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8668168783187866, + "num_tokens": 817292840.0, + "step": 21421 + }, + { + "epoch": 2.7250985879659075, + "ewc_loss": 0.07986815273761749, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004251463687978685, + "grad_norm": 9.283080101013184, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.861012876033783, + "num_tokens": 817332872.0, + "step": 21422 + }, + { + "epoch": 2.725225798244498, + "ewc_loss": 0.08251097053289413, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004491331346798688, + "grad_norm": 9.742990493774414, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8725768327713013, + "num_tokens": 817367668.0, + "step": 21423 + }, + { + "epoch": 2.7253530085230886, + "ewc_loss": 0.08016873151063919, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042815215419977903, + "grad_norm": 9.32038402557373, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.884459912776947, + "num_tokens": 817407726.0, + "step": 21424 + }, + { + "epoch": 2.725480218801679, + "ewc_loss": 0.08206033706665039, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000447068247012794, + "grad_norm": 9.715872764587402, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8756651282310486, + "num_tokens": 817441003.0, + "step": 21425 + }, + { + "epoch": 2.7256074290802697, + "ewc_loss": 0.08044086396694183, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043087347876280546, + "grad_norm": 9.342278480529785, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8716185688972473, + "num_tokens": 817482587.0, + "step": 21426 + }, + { + "epoch": 2.72573463935886, + "ewc_loss": 0.0818481594324112, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004449464613571763, + "grad_norm": 9.652162551879883, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8700210452079773, + "num_tokens": 817519727.0, + "step": 21427 + }, + { + "epoch": 2.7258618496374507, + "ewc_loss": 0.08045890927314758, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043105389340780675, + "grad_norm": 9.334078788757324, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8621470332145691, + "num_tokens": 817553809.0, + "step": 21428 + }, + { + "epoch": 2.7259890599160412, + "ewc_loss": 0.08176504820585251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004441153141669929, + "grad_norm": 9.623732566833496, + "learning_rate": 1e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8490381240844727, + "num_tokens": 817588893.0, + "step": 21429 + }, + { + "epoch": 2.7261162701946318, + "ewc_loss": 0.08058153837919235, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004322802124079317, + "grad_norm": 9.368181228637695, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8732558488845825, + "num_tokens": 817627789.0, + "step": 21430 + }, + { + "epoch": 2.7262434804732223, + "ewc_loss": 0.081711545586586, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004435802693478763, + "grad_norm": 9.599302291870117, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8550658822059631, + "num_tokens": 817670048.0, + "step": 21431 + }, + { + "epoch": 2.726370690751813, + "ewc_loss": 0.08070644736289978, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043352931970730424, + "grad_norm": 9.434112548828125, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8707550764083862, + "num_tokens": 817707108.0, + "step": 21432 + }, + { + "epoch": 2.7264979010304033, + "ewc_loss": 0.08140530437231064, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004405178769957274, + "grad_norm": 9.523796081542969, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8631695508956909, + "num_tokens": 817748126.0, + "step": 21433 + }, + { + "epoch": 2.726625111308994, + "ewc_loss": 0.08089283108711243, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004353931581135839, + "grad_norm": 9.419403076171875, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8679468035697937, + "num_tokens": 817781578.0, + "step": 21434 + }, + { + "epoch": 2.7267523215875844, + "ewc_loss": 0.08119776099920273, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004384424537420273, + "grad_norm": 9.569164276123047, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8722389936447144, + "num_tokens": 817822143.0, + "step": 21435 + }, + { + "epoch": 2.726879531866175, + "ewc_loss": 0.0807717889547348, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004341827880125493, + "grad_norm": 9.364530563354492, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8765195608139038, + "num_tokens": 817858456.0, + "step": 21436 + }, + { + "epoch": 2.7270067421447655, + "ewc_loss": 0.0815734937787056, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044219978735782206, + "grad_norm": 9.621127128601074, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8730813264846802, + "num_tokens": 817892863.0, + "step": 21437 + }, + { + "epoch": 2.7271339524233555, + "ewc_loss": 0.08041761815547943, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430641055572778, + "grad_norm": 9.328006744384766, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8688395619392395, + "num_tokens": 817931120.0, + "step": 21438 + }, + { + "epoch": 2.7272611627019465, + "ewc_loss": 0.08175843209028244, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044404916116036475, + "grad_norm": 9.706653594970703, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8668351173400879, + "num_tokens": 817963076.0, + "step": 21439 + }, + { + "epoch": 2.7273883729805366, + "ewc_loss": 0.08025665581226349, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042903143912553787, + "grad_norm": 9.297576904296875, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8657703995704651, + "num_tokens": 818003844.0, + "step": 21440 + }, + { + "epoch": 2.7275155832591276, + "ewc_loss": 0.0819673016667366, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044613785576075315, + "grad_norm": 9.633657455444336, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8679624795913696, + "num_tokens": 818048533.0, + "step": 21441 + }, + { + "epoch": 2.7276427935377177, + "ewc_loss": 0.08034736663103104, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004299385182093829, + "grad_norm": 9.363992691040039, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8624590635299683, + "num_tokens": 818083290.0, + "step": 21442 + }, + { + "epoch": 2.7277700038163086, + "ewc_loss": 0.081746906042099, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044393385178409517, + "grad_norm": 9.606152534484863, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8758088946342468, + "num_tokens": 818120037.0, + "step": 21443 + }, + { + "epoch": 2.7278972140948987, + "ewc_loss": 0.08069328218698502, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000433397653978318, + "grad_norm": 9.414944648742676, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8655188083648682, + "num_tokens": 818164122.0, + "step": 21444 + }, + { + "epoch": 2.7280244243734892, + "ewc_loss": 0.08183972537517548, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004424207145348191, + "grad_norm": 9.672276496887207, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8685939311981201, + "num_tokens": 818198065.0, + "step": 21445 + }, + { + "epoch": 2.7281516346520798, + "ewc_loss": 0.08040033280849457, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043046820792369545, + "grad_norm": 9.384659767150879, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8670685887336731, + "num_tokens": 818236843.0, + "step": 21446 + }, + { + "epoch": 2.7282788449306703, + "ewc_loss": 0.0815398246049881, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044186305603943765, + "grad_norm": 9.657751083374023, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8674907684326172, + "num_tokens": 818271300.0, + "step": 21447 + }, + { + "epoch": 2.728406055209261, + "ewc_loss": 0.08028728514909744, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004293376987334341, + "grad_norm": 9.357400894165039, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8507484197616577, + "num_tokens": 818311799.0, + "step": 21448 + }, + { + "epoch": 2.7285332654878514, + "ewc_loss": 0.08170394599437714, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004435042792465538, + "grad_norm": 9.633585929870605, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8721520900726318, + "num_tokens": 818346662.0, + "step": 21449 + }, + { + "epoch": 2.728660475766442, + "ewc_loss": 0.08041627705097198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000430627609603107, + "grad_norm": 9.428088188171387, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8706281185150146, + "num_tokens": 818384465.0, + "step": 21450 + }, + { + "epoch": 2.7287876860450324, + "ewc_loss": 0.08145298808813095, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044099471415393054, + "grad_norm": 9.584450721740723, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8629361987113953, + "num_tokens": 818425625.0, + "step": 21451 + }, + { + "epoch": 2.728914896323623, + "ewc_loss": 0.08053994178771973, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004318642895668745, + "grad_norm": 9.364872932434082, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8714749217033386, + "num_tokens": 818458544.0, + "step": 21452 + }, + { + "epoch": 2.7290421066022135, + "ewc_loss": 0.08157749474048615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004422398342285305, + "grad_norm": 9.67163372039795, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.860919713973999, + "num_tokens": 818503507.0, + "step": 21453 + }, + { + "epoch": 2.729169316880804, + "ewc_loss": 0.08033350110054016, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000429799867561087, + "grad_norm": 9.345470428466797, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8720184564590454, + "num_tokens": 818545411.0, + "step": 21454 + }, + { + "epoch": 2.7292965271593945, + "ewc_loss": 0.08173008263111115, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044376568985171616, + "grad_norm": 9.663101196289062, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8625770211219788, + "num_tokens": 818582100.0, + "step": 21455 + }, + { + "epoch": 2.729423737437985, + "ewc_loss": 0.08042003959417343, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043066524085588753, + "grad_norm": 9.456221580505371, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8633332252502441, + "num_tokens": 818615549.0, + "step": 21456 + }, + { + "epoch": 2.7295509477165756, + "ewc_loss": 0.08142611384391785, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004407259402796626, + "grad_norm": 9.648969650268555, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8703829646110535, + "num_tokens": 818652476.0, + "step": 21457 + }, + { + "epoch": 2.729678157995166, + "ewc_loss": 0.08070725202560425, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004310959775466472, + "grad_norm": 9.395194053649902, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8703064918518066, + "num_tokens": 818692435.0, + "step": 21458 + }, + { + "epoch": 2.7298053682737566, + "ewc_loss": 0.08151876926422119, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004416525480337441, + "grad_norm": 9.673312187194824, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8651819229125977, + "num_tokens": 818733206.0, + "step": 21459 + }, + { + "epoch": 2.729932578552347, + "ewc_loss": 0.08029556274414062, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042942047002725303, + "grad_norm": 9.350274085998535, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8679110407829285, + "num_tokens": 818771717.0, + "step": 21460 + }, + { + "epoch": 2.7300597888309377, + "ewc_loss": 0.08164773881435394, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004429421969689429, + "grad_norm": 9.66064453125, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8645481467247009, + "num_tokens": 818806999.0, + "step": 21461 + }, + { + "epoch": 2.7301869991095282, + "ewc_loss": 0.08039829134941101, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304477188270539, + "grad_norm": 9.42027473449707, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8725406527519226, + "num_tokens": 818846226.0, + "step": 21462 + }, + { + "epoch": 2.7303142093881183, + "ewc_loss": 0.08155134320259094, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044197827810421586, + "grad_norm": 9.644255638122559, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8713384866714478, + "num_tokens": 818885667.0, + "step": 21463 + }, + { + "epoch": 2.7304414196667093, + "ewc_loss": 0.0805431678891182, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004318965075071901, + "grad_norm": 9.458288192749023, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8731444478034973, + "num_tokens": 818924009.0, + "step": 21464 + }, + { + "epoch": 2.7305686299452994, + "ewc_loss": 0.08125185966491699, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004389834648463875, + "grad_norm": 9.568331718444824, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8725665807723999, + "num_tokens": 818962614.0, + "step": 21465 + }, + { + "epoch": 2.7306958402238903, + "ewc_loss": 0.08070862293243408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043355103116482496, + "grad_norm": 9.457863807678223, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8673728108406067, + "num_tokens": 819004486.0, + "step": 21466 + }, + { + "epoch": 2.7308230505024804, + "ewc_loss": 0.08124391734600067, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043890406959690154, + "grad_norm": 9.595563888549805, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8595422506332397, + "num_tokens": 819048364.0, + "step": 21467 + }, + { + "epoch": 2.730950260781071, + "ewc_loss": 0.08058922737836838, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004323571047279984, + "grad_norm": 9.531475067138672, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8626755475997925, + "num_tokens": 819086250.0, + "step": 21468 + }, + { + "epoch": 2.7310774710596615, + "ewc_loss": 0.08122637122869492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000438728544395417, + "grad_norm": 9.610962867736816, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8771975040435791, + "num_tokens": 819126705.0, + "step": 21469 + }, + { + "epoch": 2.731204681338252, + "ewc_loss": 0.08082668483257294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043473171535879374, + "grad_norm": 9.511629104614258, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8668595552444458, + "num_tokens": 819165967.0, + "step": 21470 + }, + { + "epoch": 2.7313318916168425, + "ewc_loss": 0.08101397752761841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004366046632640064, + "grad_norm": 9.528894424438477, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8630735874176025, + "num_tokens": 819208066.0, + "step": 21471 + }, + { + "epoch": 2.731459101895433, + "ewc_loss": 0.08092494308948517, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004357142315711826, + "grad_norm": 9.573610305786133, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.86448734998703, + "num_tokens": 819246269.0, + "step": 21472 + }, + { + "epoch": 2.7315863121740236, + "ewc_loss": 0.08093941956758499, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043585902312770486, + "grad_norm": 9.624164581298828, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8686918020248413, + "num_tokens": 819286027.0, + "step": 21473 + }, + { + "epoch": 2.731713522452614, + "ewc_loss": 0.08062781393527985, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004327429924160242, + "grad_norm": 9.528334617614746, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8742967844009399, + "num_tokens": 819319994.0, + "step": 21474 + }, + { + "epoch": 2.7318407327312046, + "ewc_loss": 0.08099332451820374, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004363980842754245, + "grad_norm": 9.545345306396484, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8615885376930237, + "num_tokens": 819360113.0, + "step": 21475 + }, + { + "epoch": 2.731967943009795, + "ewc_loss": 0.08079893887042999, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004344541812315583, + "grad_norm": 9.538323402404785, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8737830519676208, + "num_tokens": 819394575.0, + "step": 21476 + }, + { + "epoch": 2.7320951532883857, + "ewc_loss": 0.08082219213247299, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043468677904456854, + "grad_norm": 9.50832462310791, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8597754240036011, + "num_tokens": 819434545.0, + "step": 21477 + }, + { + "epoch": 2.7322223635669762, + "ewc_loss": 0.08082437515258789, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004347086069174111, + "grad_norm": 9.554306983947754, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8750725984573364, + "num_tokens": 819470699.0, + "step": 21478 + }, + { + "epoch": 2.7323495738455668, + "ewc_loss": 0.08066589385271072, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004331237869337201, + "grad_norm": 9.491202354431152, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8598648309707642, + "num_tokens": 819514447.0, + "step": 21479 + }, + { + "epoch": 2.7324767841241573, + "ewc_loss": 0.08092157542705536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004356806166470051, + "grad_norm": 9.501169204711914, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.86048823595047, + "num_tokens": 819553434.0, + "step": 21480 + }, + { + "epoch": 2.732603994402748, + "ewc_loss": 0.08096645772457123, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004361293977126479, + "grad_norm": 9.597041130065918, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.858677327632904, + "num_tokens": 819591594.0, + "step": 21481 + }, + { + "epoch": 2.7327312046813383, + "ewc_loss": 0.08077875524759293, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004342524043750018, + "grad_norm": 9.491579055786133, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8632618188858032, + "num_tokens": 819634530.0, + "step": 21482 + }, + { + "epoch": 2.732858414959929, + "ewc_loss": 0.08123588562011719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043882368481718004, + "grad_norm": 9.563036918640137, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8563963770866394, + "num_tokens": 819669358.0, + "step": 21483 + }, + { + "epoch": 2.7329856252385194, + "ewc_loss": 0.08078369498252869, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004343017644714564, + "grad_norm": 9.493888854980469, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8641864061355591, + "num_tokens": 819711336.0, + "step": 21484 + }, + { + "epoch": 2.73311283551711, + "ewc_loss": 0.08151291310787201, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044159393291920424, + "grad_norm": 9.594221115112305, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8555641174316406, + "num_tokens": 819746973.0, + "step": 21485 + }, + { + "epoch": 2.7332400457957005, + "ewc_loss": 0.08090630173683167, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004355278506409377, + "grad_norm": 9.476568222045898, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.875907301902771, + "num_tokens": 819780305.0, + "step": 21486 + }, + { + "epoch": 2.733367256074291, + "ewc_loss": 0.08141417056322098, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004406065563671291, + "grad_norm": 9.613210678100586, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8743003010749817, + "num_tokens": 819817326.0, + "step": 21487 + }, + { + "epoch": 2.733494466352881, + "ewc_loss": 0.0810006856918335, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004364716587588191, + "grad_norm": 9.47628402709961, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.85967618227005, + "num_tokens": 819857597.0, + "step": 21488 + }, + { + "epoch": 2.733621676631472, + "ewc_loss": 0.08147530257701874, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412178823258728, + "grad_norm": 9.66610336303711, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8668665885925293, + "num_tokens": 819893471.0, + "step": 21489 + }, + { + "epoch": 2.733748886910062, + "ewc_loss": 0.08069267123937607, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004333915712777525, + "grad_norm": 9.487950325012207, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8487346172332764, + "num_tokens": 819934175.0, + "step": 21490 + }, + { + "epoch": 2.733876097188653, + "ewc_loss": 0.08163490891456604, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004428139072842896, + "grad_norm": 9.649137496948242, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8744568228721619, + "num_tokens": 819971451.0, + "step": 21491 + }, + { + "epoch": 2.734003307467243, + "ewc_loss": 0.08054088056087494, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043187369010411203, + "grad_norm": 9.409055709838867, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.861598551273346, + "num_tokens": 820011566.0, + "step": 21492 + }, + { + "epoch": 2.7341305177458337, + "ewc_loss": 0.08195540308952332, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044601887930184603, + "grad_norm": 9.716592788696289, + "learning_rate": 1e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8417190313339233, + "num_tokens": 820047944.0, + "step": 21493 + }, + { + "epoch": 2.7342577280244242, + "ewc_loss": 0.08035099506378174, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004299747524783015, + "grad_norm": 9.45989990234375, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.852716326713562, + "num_tokens": 820083416.0, + "step": 21494 + }, + { + "epoch": 2.7343849383030148, + "ewc_loss": 0.08190084993839264, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044547338620759547, + "grad_norm": 9.649121284484863, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8582413196563721, + "num_tokens": 820122476.0, + "step": 21495 + }, + { + "epoch": 2.7345121485816053, + "ewc_loss": 0.08052399754524231, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004317048587836325, + "grad_norm": 9.400456428527832, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8711995482444763, + "num_tokens": 820159866.0, + "step": 21496 + }, + { + "epoch": 2.734639358860196, + "ewc_loss": 0.08163457363843918, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044281058944761753, + "grad_norm": 9.595481872558594, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8663001656532288, + "num_tokens": 820202710.0, + "step": 21497 + }, + { + "epoch": 2.7347665691387864, + "ewc_loss": 0.08069166541099548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004333814431447536, + "grad_norm": 9.443944931030273, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8649067282676697, + "num_tokens": 820239156.0, + "step": 21498 + }, + { + "epoch": 2.734893779417377, + "ewc_loss": 0.0815366804599762, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004418316239025444, + "grad_norm": 9.626285552978516, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8841443657875061, + "num_tokens": 820274010.0, + "step": 21499 + }, + { + "epoch": 2.7350209896959674, + "ewc_loss": 0.08076262474060059, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043409105273894966, + "grad_norm": 9.396169662475586, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8643046617507935, + "num_tokens": 820319171.0, + "step": 21500 + }, + { + "epoch": 2.735148199974558, + "ewc_loss": 0.0817287415266037, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044375224388204515, + "grad_norm": 9.64687728881836, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8502606153488159, + "num_tokens": 820355038.0, + "step": 21501 + }, + { + "epoch": 2.7352754102531485, + "ewc_loss": 0.08059819787740707, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004324468318372965, + "grad_norm": 9.453983306884766, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.861804723739624, + "num_tokens": 820390973.0, + "step": 21502 + }, + { + "epoch": 2.735402620531739, + "ewc_loss": 0.08157870173454285, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044225185411050916, + "grad_norm": 9.680904388427734, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8587832450866699, + "num_tokens": 820424543.0, + "step": 21503 + }, + { + "epoch": 2.7355298308103295, + "ewc_loss": 0.08050207793712616, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004314855905249715, + "grad_norm": 9.433442115783691, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8657013177871704, + "num_tokens": 820462702.0, + "step": 21504 + }, + { + "epoch": 2.73565704108892, + "ewc_loss": 0.08178402483463287, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044430510024540126, + "grad_norm": 9.777806282043457, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8758395910263062, + "num_tokens": 820498840.0, + "step": 21505 + }, + { + "epoch": 2.7357842513675106, + "ewc_loss": 0.0803009569644928, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004294743703212589, + "grad_norm": 9.318751335144043, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8597784042358398, + "num_tokens": 820534083.0, + "step": 21506 + }, + { + "epoch": 2.735911461646101, + "ewc_loss": 0.08207917213439941, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004472565487958491, + "grad_norm": 9.762259483337402, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8678750991821289, + "num_tokens": 820572479.0, + "step": 21507 + }, + { + "epoch": 2.7360386719246916, + "ewc_loss": 0.08010773360729218, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042754222522489727, + "grad_norm": 9.324515342712402, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8536374568939209, + "num_tokens": 820610283.0, + "step": 21508 + }, + { + "epoch": 2.736165882203282, + "ewc_loss": 0.0823936015367508, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004504008684307337, + "grad_norm": 9.734831809997559, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8828017115592957, + "num_tokens": 820647580.0, + "step": 21509 + }, + { + "epoch": 2.7362930924818727, + "ewc_loss": 0.08021872490644455, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004286520997993648, + "grad_norm": 9.339655876159668, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8711071014404297, + "num_tokens": 820686774.0, + "step": 21510 + }, + { + "epoch": 2.7364203027604628, + "ewc_loss": 0.08226802945137024, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000449145125458017, + "grad_norm": 9.744963645935059, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8747811317443848, + "num_tokens": 820724635.0, + "step": 21511 + }, + { + "epoch": 2.7365475130390537, + "ewc_loss": 0.08076927065849304, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043171615106984973, + "grad_norm": 9.350035667419434, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.87335205078125, + "num_tokens": 820761467.0, + "step": 21512 + }, + { + "epoch": 2.736674723317644, + "ewc_loss": 0.082233726978302, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004488021368160844, + "grad_norm": 9.693099021911621, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.852321445941925, + "num_tokens": 820801001.0, + "step": 21513 + }, + { + "epoch": 2.736801933596235, + "ewc_loss": 0.08055183291435242, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004319832078181207, + "grad_norm": 9.399158477783203, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8601071834564209, + "num_tokens": 820841028.0, + "step": 21514 + }, + { + "epoch": 2.736929143874825, + "ewc_loss": 0.08215078711509705, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044797276495955884, + "grad_norm": 9.683969497680664, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8737554550170898, + "num_tokens": 820881613.0, + "step": 21515 + }, + { + "epoch": 2.737056354153416, + "ewc_loss": 0.08063191175460815, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043278394150547683, + "grad_norm": 9.404358863830566, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8741136193275452, + "num_tokens": 820918205.0, + "step": 21516 + }, + { + "epoch": 2.737183564432006, + "ewc_loss": 0.08208446949720383, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004473095468711108, + "grad_norm": 9.785893440246582, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8587519526481628, + "num_tokens": 820950751.0, + "step": 21517 + }, + { + "epoch": 2.7373107747105965, + "ewc_loss": 0.08077916502952576, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004342564498074353, + "grad_norm": 9.424774169921875, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8668186664581299, + "num_tokens": 820987385.0, + "step": 21518 + }, + { + "epoch": 2.737437984989187, + "ewc_loss": 0.0822436660528183, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044646012247540057, + "grad_norm": 9.718743324279785, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8581939935684204, + "num_tokens": 821026273.0, + "step": 21519 + }, + { + "epoch": 2.7375651952677775, + "ewc_loss": 0.08082441985607147, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004322676104493439, + "grad_norm": 9.375410079956055, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8559321165084839, + "num_tokens": 821067978.0, + "step": 21520 + }, + { + "epoch": 2.737692405546368, + "ewc_loss": 0.0826103687286377, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004501270886976272, + "grad_norm": 9.810471534729004, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8704208135604858, + "num_tokens": 821111925.0, + "step": 21521 + }, + { + "epoch": 2.7378196158249586, + "ewc_loss": 0.080394446849823, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043040933087468147, + "grad_norm": 9.416192054748535, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8686665296554565, + "num_tokens": 821151019.0, + "step": 21522 + }, + { + "epoch": 2.737946826103549, + "ewc_loss": 0.08212797343730927, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004477445618249476, + "grad_norm": 9.789562225341797, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8717303276062012, + "num_tokens": 821185520.0, + "step": 21523 + }, + { + "epoch": 2.7380740363821396, + "ewc_loss": 0.08065925538539886, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004330574010964483, + "grad_norm": 9.480537414550781, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8667550683021545, + "num_tokens": 821224273.0, + "step": 21524 + }, + { + "epoch": 2.73820124666073, + "ewc_loss": 0.08210846781730652, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004451081622391939, + "grad_norm": 9.706076622009277, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8721213936805725, + "num_tokens": 821263541.0, + "step": 21525 + }, + { + "epoch": 2.7383284569393207, + "ewc_loss": 0.08078666031360626, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043433147948235273, + "grad_norm": 9.447633743286133, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8543426990509033, + "num_tokens": 821304053.0, + "step": 21526 + }, + { + "epoch": 2.7384556672179112, + "ewc_loss": 0.08192875981330872, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044575243373401463, + "grad_norm": 9.804661750793457, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8655006289482117, + "num_tokens": 821339710.0, + "step": 21527 + }, + { + "epoch": 2.7385828774965018, + "ewc_loss": 0.08064667880535126, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043049026862718165, + "grad_norm": 9.456785202026367, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8650554418563843, + "num_tokens": 821378377.0, + "step": 21528 + }, + { + "epoch": 2.7387100877750923, + "ewc_loss": 0.08202435821294785, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004467084363568574, + "grad_norm": 9.737712860107422, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8749104142189026, + "num_tokens": 821417434.0, + "step": 21529 + }, + { + "epoch": 2.738837298053683, + "ewc_loss": 0.08051969110965729, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043166178511455655, + "grad_norm": 9.450223922729492, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8623697757720947, + "num_tokens": 821464323.0, + "step": 21530 + }, + { + "epoch": 2.7389645083322733, + "ewc_loss": 0.08179840445518494, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044444884406402707, + "grad_norm": 9.669660568237305, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8635467290878296, + "num_tokens": 821509891.0, + "step": 21531 + }, + { + "epoch": 2.739091718610864, + "ewc_loss": 0.08077054470777512, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043417030246928334, + "grad_norm": 9.46839427947998, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8548727035522461, + "num_tokens": 821557309.0, + "step": 21532 + }, + { + "epoch": 2.7392189288894544, + "ewc_loss": 0.08172853291034698, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004437501775100827, + "grad_norm": 9.685104370117188, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8610481023788452, + "num_tokens": 821597751.0, + "step": 21533 + }, + { + "epoch": 2.739346139168045, + "ewc_loss": 0.08071470260620117, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043361191637814045, + "grad_norm": 9.493424415588379, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8774397373199463, + "num_tokens": 821637961.0, + "step": 21534 + }, + { + "epoch": 2.7394733494466355, + "ewc_loss": 0.08169996738433838, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004434644943103194, + "grad_norm": 9.614736557006836, + "learning_rate": 1e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8496430516242981, + "num_tokens": 821684728.0, + "step": 21535 + }, + { + "epoch": 2.7396005597252255, + "ewc_loss": 0.08098091185092926, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004362740146461874, + "grad_norm": 9.556824684143066, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8710945248603821, + "num_tokens": 821715359.0, + "step": 21536 + }, + { + "epoch": 2.7397277700038165, + "ewc_loss": 0.08130913972854614, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043955628643743694, + "grad_norm": 9.597947120666504, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8611177206039429, + "num_tokens": 821755540.0, + "step": 21537 + }, + { + "epoch": 2.7398549802824066, + "ewc_loss": 0.08127085864543915, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004391734255477786, + "grad_norm": 9.591904640197754, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8657039403915405, + "num_tokens": 821788506.0, + "step": 21538 + }, + { + "epoch": 2.7399821905609976, + "ewc_loss": 0.08105148375034332, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004369797243271023, + "grad_norm": 9.569648742675781, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8635978698730469, + "num_tokens": 821826555.0, + "step": 21539 + }, + { + "epoch": 2.7401094008395877, + "ewc_loss": 0.08126898109912872, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000439154653577134, + "grad_norm": 9.566421508789062, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8549243211746216, + "num_tokens": 821866160.0, + "step": 21540 + }, + { + "epoch": 2.7402366111181786, + "ewc_loss": 0.08102663606405258, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043673120671883225, + "grad_norm": 9.493891716003418, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8709540367126465, + "num_tokens": 821905595.0, + "step": 21541 + }, + { + "epoch": 2.7403638213967687, + "ewc_loss": 0.08150237798690796, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044148863526061177, + "grad_norm": 9.669443130493164, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8562498092651367, + "num_tokens": 821944769.0, + "step": 21542 + }, + { + "epoch": 2.7404910316753592, + "ewc_loss": 0.08089420199394226, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043540686601772904, + "grad_norm": 9.468850135803223, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.873188853263855, + "num_tokens": 821988889.0, + "step": 21543 + }, + { + "epoch": 2.7406182419539498, + "ewc_loss": 0.08183114975690842, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004447763494681567, + "grad_norm": 9.693930625915527, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8690385818481445, + "num_tokens": 822025343.0, + "step": 21544 + }, + { + "epoch": 2.7407454522325403, + "ewc_loss": 0.08083201199769974, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043478497536852956, + "grad_norm": 9.45484447479248, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8535884618759155, + "num_tokens": 822067651.0, + "step": 21545 + }, + { + "epoch": 2.740872662511131, + "ewc_loss": 0.08192680776119232, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044573290506377816, + "grad_norm": 9.70295238494873, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8570464849472046, + "num_tokens": 822102885.0, + "step": 21546 + }, + { + "epoch": 2.7409998727897213, + "ewc_loss": 0.08090448379516602, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043550971895456314, + "grad_norm": 9.537383079528809, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.857245922088623, + "num_tokens": 822142515.0, + "step": 21547 + }, + { + "epoch": 2.741127083068312, + "ewc_loss": 0.08171883970499039, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044365323265083134, + "grad_norm": 9.676290512084961, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8626203536987305, + "num_tokens": 822180745.0, + "step": 21548 + }, + { + "epoch": 2.7412542933469024, + "ewc_loss": 0.08081858605146408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004346506902948022, + "grad_norm": 9.470508575439453, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8753212690353394, + "num_tokens": 822215117.0, + "step": 21549 + }, + { + "epoch": 2.741381503625493, + "ewc_loss": 0.08171345293521881, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004435993905644864, + "grad_norm": 9.6221923828125, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8557312488555908, + "num_tokens": 822258883.0, + "step": 21550 + }, + { + "epoch": 2.7415087139040835, + "ewc_loss": 0.08097643405199051, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004362291656434536, + "grad_norm": 9.559865951538086, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8620411157608032, + "num_tokens": 822295254.0, + "step": 21551 + }, + { + "epoch": 2.741635924182674, + "ewc_loss": 0.08141836524009705, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004406485240906477, + "grad_norm": 9.582873344421387, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8676565885543823, + "num_tokens": 822335769.0, + "step": 21552 + }, + { + "epoch": 2.7417631344612645, + "ewc_loss": 0.08113684505224228, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004378332814667374, + "grad_norm": 9.540254592895508, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.874962329864502, + "num_tokens": 822376873.0, + "step": 21553 + }, + { + "epoch": 2.741890344739855, + "ewc_loss": 0.08138071000576019, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004402718914207071, + "grad_norm": 9.619669914245605, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8604477643966675, + "num_tokens": 822419868.0, + "step": 21554 + }, + { + "epoch": 2.7420175550184456, + "ewc_loss": 0.08111249655485153, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043758979882113636, + "grad_norm": 9.580220222473145, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.863645076751709, + "num_tokens": 822456757.0, + "step": 21555 + }, + { + "epoch": 2.742144765297036, + "ewc_loss": 0.08119504153728485, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004384152125567198, + "grad_norm": 9.566866874694824, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8792867064476013, + "num_tokens": 822492772.0, + "step": 21556 + }, + { + "epoch": 2.7422719755756266, + "ewc_loss": 0.08110557496547699, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004375206190161407, + "grad_norm": 9.568713188171387, + "learning_rate": 1e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8484241366386414, + "num_tokens": 822535400.0, + "step": 21557 + }, + { + "epoch": 2.742399185854217, + "ewc_loss": 0.08118177205324173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000438282557297498, + "grad_norm": 9.510682106018066, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8721903562545776, + "num_tokens": 822568313.0, + "step": 21558 + }, + { + "epoch": 2.7425263961328077, + "ewc_loss": 0.08132505416870117, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043971536797471344, + "grad_norm": 9.574606895446777, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8708063364028931, + "num_tokens": 822608487.0, + "step": 21559 + }, + { + "epoch": 2.742653606411398, + "ewc_loss": 0.08094848692417145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043594971066340804, + "grad_norm": 9.521417617797852, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8656694889068604, + "num_tokens": 822647938.0, + "step": 21560 + }, + { + "epoch": 2.7427808166899883, + "ewc_loss": 0.08146250993013382, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000441089941887185, + "grad_norm": 9.57888126373291, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.862143874168396, + "num_tokens": 822685974.0, + "step": 21561 + }, + { + "epoch": 2.7429080269685793, + "ewc_loss": 0.08118265867233276, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043829146306961775, + "grad_norm": 9.648009300231934, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8676559329032898, + "num_tokens": 822719166.0, + "step": 21562 + }, + { + "epoch": 2.7430352372471694, + "ewc_loss": 0.0809386596083641, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043585142702795565, + "grad_norm": 9.475250244140625, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8551214337348938, + "num_tokens": 822758364.0, + "step": 21563 + }, + { + "epoch": 2.7431624475257603, + "ewc_loss": 0.08139537274837494, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004404186038300395, + "grad_norm": 9.547823905944824, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8730133771896362, + "num_tokens": 822798002.0, + "step": 21564 + }, + { + "epoch": 2.7432896578043504, + "ewc_loss": 0.08105000108480453, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004369648522697389, + "grad_norm": 9.54720687866211, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.864861249923706, + "num_tokens": 822837958.0, + "step": 21565 + }, + { + "epoch": 2.743416868082941, + "ewc_loss": 0.08109724521636963, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043743732385337353, + "grad_norm": 9.510688781738281, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8654634952545166, + "num_tokens": 822873076.0, + "step": 21566 + }, + { + "epoch": 2.7435440783615315, + "ewc_loss": 0.08126834034919739, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004391481925267726, + "grad_norm": 9.531412124633789, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8649002313613892, + "num_tokens": 822904693.0, + "step": 21567 + }, + { + "epoch": 2.743671288640122, + "ewc_loss": 0.08109073340892792, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004373722185846418, + "grad_norm": 9.542593002319336, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.860288143157959, + "num_tokens": 822941192.0, + "step": 21568 + }, + { + "epoch": 2.7437984989187125, + "ewc_loss": 0.08117621392011642, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004382269689813256, + "grad_norm": 9.520989418029785, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.851730465888977, + "num_tokens": 822982166.0, + "step": 21569 + }, + { + "epoch": 2.743925709197303, + "ewc_loss": 0.08105691522359848, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004370340029709041, + "grad_norm": 9.564794540405273, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8585690259933472, + "num_tokens": 823021630.0, + "step": 21570 + }, + { + "epoch": 2.7440529194758936, + "ewc_loss": 0.08094432950019836, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004359081794973463, + "grad_norm": 9.529383659362793, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8642436861991882, + "num_tokens": 823062133.0, + "step": 21571 + }, + { + "epoch": 2.744180129754484, + "ewc_loss": 0.08113685250282288, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004378333396743983, + "grad_norm": 9.64085578918457, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8613072037696838, + "num_tokens": 823105163.0, + "step": 21572 + }, + { + "epoch": 2.7443073400330746, + "ewc_loss": 0.0808836817741394, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004353016847744584, + "grad_norm": 9.52600383758545, + "learning_rate": 1e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8449752330780029, + "num_tokens": 823146878.0, + "step": 21573 + }, + { + "epoch": 2.744434550311665, + "ewc_loss": 0.0813082605600357, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000439547497080639, + "grad_norm": 9.59998607635498, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8828511238098145, + "num_tokens": 823182216.0, + "step": 21574 + }, + { + "epoch": 2.7445617605902557, + "ewc_loss": 0.08109325170516968, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043495590216480196, + "grad_norm": 9.3941068649292, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8724269866943359, + "num_tokens": 823227838.0, + "step": 21575 + }, + { + "epoch": 2.7446889708688462, + "ewc_loss": 0.08186572790145874, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044512207387015224, + "grad_norm": 9.71886920928955, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8565884232521057, + "num_tokens": 823268609.0, + "step": 21576 + }, + { + "epoch": 2.7448161811474368, + "ewc_loss": 0.08058885484933853, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043235340854153037, + "grad_norm": 9.399130821228027, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8752048015594482, + "num_tokens": 823309426.0, + "step": 21577 + }, + { + "epoch": 2.7449433914260273, + "ewc_loss": 0.08207447826862335, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004472096625249833, + "grad_norm": 9.74134635925293, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8599518537521362, + "num_tokens": 823346049.0, + "step": 21578 + }, + { + "epoch": 2.745070601704618, + "ewc_loss": 0.08049537986516953, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000431418651714921, + "grad_norm": 9.451536178588867, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.858439564704895, + "num_tokens": 823378915.0, + "step": 21579 + }, + { + "epoch": 2.7451978119832083, + "ewc_loss": 0.08198368549346924, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044630171032622457, + "grad_norm": 9.685351371765137, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8556054830551147, + "num_tokens": 823422985.0, + "step": 21580 + }, + { + "epoch": 2.745325022261799, + "ewc_loss": 0.0805998295545578, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043246312998235226, + "grad_norm": 9.427515983581543, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8517293930053711, + "num_tokens": 823460703.0, + "step": 21581 + }, + { + "epoch": 2.7454522325403894, + "ewc_loss": 0.0819869413971901, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004463342484086752, + "grad_norm": 9.755948066711426, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8676881194114685, + "num_tokens": 823499651.0, + "step": 21582 + }, + { + "epoch": 2.74557944281898, + "ewc_loss": 0.08040159195661545, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304807516746223, + "grad_norm": 9.42095947265625, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8652513027191162, + "num_tokens": 823537044.0, + "step": 21583 + }, + { + "epoch": 2.7457066530975704, + "ewc_loss": 0.08210241049528122, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004474889428820461, + "grad_norm": 9.747076034545898, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8784953355789185, + "num_tokens": 823571832.0, + "step": 21584 + }, + { + "epoch": 2.745833863376161, + "ewc_loss": 0.08052746951580048, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004317395214457065, + "grad_norm": 9.469356536865234, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8713419437408447, + "num_tokens": 823609852.0, + "step": 21585 + }, + { + "epoch": 2.745961073654751, + "ewc_loss": 0.08200353384017944, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004465001984499395, + "grad_norm": 9.783044815063477, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8681000471115112, + "num_tokens": 823644419.0, + "step": 21586 + }, + { + "epoch": 2.746088283933342, + "ewc_loss": 0.08039544522762299, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004304193425923586, + "grad_norm": 9.38978385925293, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8696267008781433, + "num_tokens": 823682451.0, + "step": 21587 + }, + { + "epoch": 2.746215494211932, + "ewc_loss": 0.08222825825214386, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044874739251099527, + "grad_norm": 9.787229537963867, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8756625056266785, + "num_tokens": 823723051.0, + "step": 21588 + }, + { + "epoch": 2.746342704490523, + "ewc_loss": 0.08045244216918945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043098925380036235, + "grad_norm": 9.352814674377441, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8619357347488403, + "num_tokens": 823761353.0, + "step": 21589 + }, + { + "epoch": 2.746469914769113, + "ewc_loss": 0.08240540325641632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004505189135670662, + "grad_norm": 9.847827911376953, + "learning_rate": 1e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8506161570549011, + "num_tokens": 823800281.0, + "step": 21590 + }, + { + "epoch": 2.7465971250477037, + "ewc_loss": 0.08038418740034103, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043030671076849103, + "grad_norm": 9.35016918182373, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8790817260742188, + "num_tokens": 823843255.0, + "step": 21591 + }, + { + "epoch": 2.7467243353262942, + "ewc_loss": 0.0825742781162262, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004522076342254877, + "grad_norm": 9.879996299743652, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8624269962310791, + "num_tokens": 823880736.0, + "step": 21592 + }, + { + "epoch": 2.7468515456048848, + "ewc_loss": 0.08051058650016785, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043157071922905743, + "grad_norm": 9.38911247253418, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8627945184707642, + "num_tokens": 823923232.0, + "step": 21593 + }, + { + "epoch": 2.7469787558834753, + "ewc_loss": 0.0827091634273529, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045355645124800503, + "grad_norm": 9.82909870147705, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8695032596588135, + "num_tokens": 823957488.0, + "step": 21594 + }, + { + "epoch": 2.747105966162066, + "ewc_loss": 0.08052285015583038, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004316933627706021, + "grad_norm": 9.405556678771973, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8611531853675842, + "num_tokens": 823999568.0, + "step": 21595 + }, + { + "epoch": 2.7472331764406563, + "ewc_loss": 0.08252553641796112, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004517202323768288, + "grad_norm": 9.74763011932373, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8763719797134399, + "num_tokens": 824036947.0, + "step": 21596 + }, + { + "epoch": 2.747360386719247, + "ewc_loss": 0.08096466958522797, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043611155706457794, + "grad_norm": 9.535922050476074, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8775171041488647, + "num_tokens": 824067966.0, + "step": 21597 + }, + { + "epoch": 2.7474875969978374, + "ewc_loss": 0.08180937170982361, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044455856550484896, + "grad_norm": 9.658583641052246, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8624392747879028, + "num_tokens": 824112216.0, + "step": 21598 + }, + { + "epoch": 2.747614807276428, + "ewc_loss": 0.0811360776424408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004378255980554968, + "grad_norm": 9.522910118103027, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8755446672439575, + "num_tokens": 824148749.0, + "step": 21599 + }, + { + "epoch": 2.7477420175550185, + "ewc_loss": 0.08168528228998184, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004433176654856652, + "grad_norm": 9.666791915893555, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8637522459030151, + "num_tokens": 824186685.0, + "step": 21600 + }, + { + "epoch": 2.747869227833609, + "ewc_loss": 0.08107060939073563, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043717093649320304, + "grad_norm": 9.59428882598877, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.858989953994751, + "num_tokens": 824223716.0, + "step": 21601 + }, + { + "epoch": 2.7479964381121995, + "ewc_loss": 0.08121499419212341, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043861474841833115, + "grad_norm": 9.511131286621094, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8640772104263306, + "num_tokens": 824264627.0, + "step": 21602 + }, + { + "epoch": 2.74812364839079, + "ewc_loss": 0.08135704696178436, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004400353354867548, + "grad_norm": 9.631904602050781, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8662041425704956, + "num_tokens": 824301901.0, + "step": 21603 + }, + { + "epoch": 2.7482508586693806, + "ewc_loss": 0.08097108453512192, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004361756728030741, + "grad_norm": 9.500778198242188, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8698620200157166, + "num_tokens": 824344621.0, + "step": 21604 + }, + { + "epoch": 2.748378068947971, + "ewc_loss": 0.0816277340054512, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004427421954460442, + "grad_norm": 9.658622741699219, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.872287392616272, + "num_tokens": 824385850.0, + "step": 21605 + }, + { + "epoch": 2.7485052792265616, + "ewc_loss": 0.08084255456924438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043489038944244385, + "grad_norm": 9.457846641540527, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8918978571891785, + "num_tokens": 824425545.0, + "step": 21606 + }, + { + "epoch": 2.748632489505152, + "ewc_loss": 0.08184157311916351, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044488062849268317, + "grad_norm": 9.78075122833252, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8816285133361816, + "num_tokens": 824468611.0, + "step": 21607 + }, + { + "epoch": 2.7487596997837427, + "ewc_loss": 0.08069930225610733, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004310164658818394, + "grad_norm": 9.351017951965332, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8702995777130127, + "num_tokens": 824512161.0, + "step": 21608 + }, + { + "epoch": 2.7488869100623328, + "ewc_loss": 0.08262673020362854, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045029071043245494, + "grad_norm": 9.87991714477539, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8780258297920227, + "num_tokens": 824555502.0, + "step": 21609 + }, + { + "epoch": 2.7490141203409237, + "ewc_loss": 0.08018234372138977, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004282882728148252, + "grad_norm": 9.362259864807129, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8604788780212402, + "num_tokens": 824596277.0, + "step": 21610 + }, + { + "epoch": 2.749141330619514, + "ewc_loss": 0.08257973194122314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045226217480376363, + "grad_norm": 9.880125045776367, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8533053398132324, + "num_tokens": 824633704.0, + "step": 21611 + }, + { + "epoch": 2.749268540898105, + "ewc_loss": 0.08023688197135925, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004288336494937539, + "grad_norm": 9.374344825744629, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8695917129516602, + "num_tokens": 824673584.0, + "step": 21612 + }, + { + "epoch": 2.749395751176695, + "ewc_loss": 0.08283481001853943, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004548129509203136, + "grad_norm": 9.866870880126953, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8572837114334106, + "num_tokens": 824712508.0, + "step": 21613 + }, + { + "epoch": 2.749522961455286, + "ewc_loss": 0.08042344450950623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043069926323369145, + "grad_norm": 9.419200897216797, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8767282366752625, + "num_tokens": 824754353.0, + "step": 21614 + }, + { + "epoch": 2.749650171733876, + "ewc_loss": 0.08260941505432129, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045255894656293094, + "grad_norm": 9.765436172485352, + "learning_rate": 1e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8486316204071045, + "num_tokens": 824796333.0, + "step": 21615 + }, + { + "epoch": 2.7497773820124665, + "ewc_loss": 0.08076627552509308, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043412763625383377, + "grad_norm": 9.453115463256836, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8584604263305664, + "num_tokens": 824832048.0, + "step": 21616 + }, + { + "epoch": 2.749904592291057, + "ewc_loss": 0.0822426825761795, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004488917184062302, + "grad_norm": 9.7457914352417, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.859533965587616, + "num_tokens": 824871889.0, + "step": 21617 + }, + { + "epoch": 2.7500318025696475, + "ewc_loss": 0.08101628720760345, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043662774260155857, + "grad_norm": 9.501226425170898, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8751481771469116, + "num_tokens": 824912547.0, + "step": 21618 + }, + { + "epoch": 2.750159012848238, + "ewc_loss": 0.08216143399477005, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044807916856370866, + "grad_norm": 9.728161811828613, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8458287119865417, + "num_tokens": 824955626.0, + "step": 21619 + }, + { + "epoch": 2.7502862231268286, + "ewc_loss": 0.08100022375583649, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004364670894574374, + "grad_norm": 9.488213539123535, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8731549978256226, + "num_tokens": 824991813.0, + "step": 21620 + }, + { + "epoch": 2.750413433405419, + "ewc_loss": 0.0821322649717331, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004477874899748713, + "grad_norm": 9.683791160583496, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8736610412597656, + "num_tokens": 825024356.0, + "step": 21621 + }, + { + "epoch": 2.7505406436840096, + "ewc_loss": 0.08116992563009262, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004381641047075391, + "grad_norm": 9.52324390411377, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8612465262413025, + "num_tokens": 825060316.0, + "step": 21622 + }, + { + "epoch": 2.7506678539626, + "ewc_loss": 0.08187729120254517, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004452377324923873, + "grad_norm": 9.61095142364502, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8589215278625488, + "num_tokens": 825096044.0, + "step": 21623 + }, + { + "epoch": 2.7507950642411907, + "ewc_loss": 0.08162304759025574, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044269528007134795, + "grad_norm": 9.630553245544434, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8630850911140442, + "num_tokens": 825123999.0, + "step": 21624 + }, + { + "epoch": 2.750922274519781, + "ewc_loss": 0.08154952526092529, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044196011731401086, + "grad_norm": 9.554849624633789, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8545761108398438, + "num_tokens": 825170155.0, + "step": 21625 + }, + { + "epoch": 2.7510494847983717, + "ewc_loss": 0.08187897503376007, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004452545545063913, + "grad_norm": 9.617454528808594, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8636390566825867, + "num_tokens": 825213388.0, + "step": 21626 + }, + { + "epoch": 2.7511766950769623, + "ewc_loss": 0.08121797442436218, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043864455074071884, + "grad_norm": 9.516876220703125, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8713283538818359, + "num_tokens": 825247171.0, + "step": 21627 + }, + { + "epoch": 2.751303905355553, + "ewc_loss": 0.0820651575922966, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004471164138521999, + "grad_norm": 9.654196739196777, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8652616739273071, + "num_tokens": 825283271.0, + "step": 21628 + }, + { + "epoch": 2.7514311156341433, + "ewc_loss": 0.08121013641357422, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043856626143679023, + "grad_norm": 9.494483947753906, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8656432032585144, + "num_tokens": 825318976.0, + "step": 21629 + }, + { + "epoch": 2.751558325912734, + "ewc_loss": 0.08202604949474335, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004467253456823528, + "grad_norm": 9.597978591918945, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.86483234167099, + "num_tokens": 825358019.0, + "step": 21630 + }, + { + "epoch": 2.7516855361913244, + "ewc_loss": 0.08128991723060608, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004393639974296093, + "grad_norm": 9.503220558166504, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8833341598510742, + "num_tokens": 825394025.0, + "step": 21631 + }, + { + "epoch": 2.751812746469915, + "ewc_loss": 0.08167925477027893, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044325742055661976, + "grad_norm": 9.616963386535645, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8766940832138062, + "num_tokens": 825433021.0, + "step": 21632 + }, + { + "epoch": 2.7519399567485054, + "ewc_loss": 0.08141005039215088, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044056534534320235, + "grad_norm": 9.591011047363281, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8647851347923279, + "num_tokens": 825477026.0, + "step": 21633 + }, + { + "epoch": 2.7520671670270955, + "ewc_loss": 0.08158161491155624, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044228098704479635, + "grad_norm": 9.599349021911621, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8704074621200562, + "num_tokens": 825511054.0, + "step": 21634 + }, + { + "epoch": 2.7521943773056865, + "ewc_loss": 0.08146636188030243, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004411284171510488, + "grad_norm": 9.585583686828613, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8514156341552734, + "num_tokens": 825549654.0, + "step": 21635 + }, + { + "epoch": 2.7523215875842766, + "ewc_loss": 0.08151104301214218, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044157527736388147, + "grad_norm": 9.572407722473145, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8678762912750244, + "num_tokens": 825587269.0, + "step": 21636 + }, + { + "epoch": 2.7524487978628676, + "ewc_loss": 0.08149608969688416, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044142574188299477, + "grad_norm": 9.612833976745605, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8891611099243164, + "num_tokens": 825624864.0, + "step": 21637 + }, + { + "epoch": 2.7525760081414576, + "ewc_loss": 0.0813504010438919, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043996889144182205, + "grad_norm": 9.58299732208252, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8572700619697571, + "num_tokens": 825667025.0, + "step": 21638 + }, + { + "epoch": 2.7527032184200486, + "ewc_loss": 0.08158896863460541, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044235456152819097, + "grad_norm": 9.634819030761719, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8502827882766724, + "num_tokens": 825709505.0, + "step": 21639 + }, + { + "epoch": 2.7528304286986387, + "ewc_loss": 0.08116135746240616, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043807842303067446, + "grad_norm": 9.613669395446777, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8687230348587036, + "num_tokens": 825747376.0, + "step": 21640 + }, + { + "epoch": 2.7529576389772292, + "ewc_loss": 0.08148956298828125, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004413604619912803, + "grad_norm": 9.540605545043945, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8731837272644043, + "num_tokens": 825784817.0, + "step": 21641 + }, + { + "epoch": 2.7530848492558198, + "ewc_loss": 0.0813724547624588, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004401893529575318, + "grad_norm": 9.59134292602539, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8616399765014648, + "num_tokens": 825815631.0, + "step": 21642 + }, + { + "epoch": 2.7532120595344103, + "ewc_loss": 0.08124129474163055, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004388777888379991, + "grad_norm": 9.615762710571289, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8650795817375183, + "num_tokens": 825854202.0, + "step": 21643 + }, + { + "epoch": 2.753339269813001, + "ewc_loss": 0.08137769997119904, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044024179806001484, + "grad_norm": 9.632704734802246, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8716360926628113, + "num_tokens": 825890636.0, + "step": 21644 + }, + { + "epoch": 2.7534664800915913, + "ewc_loss": 0.08125456422567368, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004390105023048818, + "grad_norm": 9.555368423461914, + "learning_rate": 1e-06, + "loss": 0.536, + "mean_token_accuracy": 0.842420220375061, + "num_tokens": 825931054.0, + "step": 21645 + }, + { + "epoch": 2.753593690370182, + "ewc_loss": 0.08143971860408783, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044086205889470875, + "grad_norm": 9.558116912841797, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8608163595199585, + "num_tokens": 825972519.0, + "step": 21646 + }, + { + "epoch": 2.7537209006487724, + "ewc_loss": 0.08121462166309357, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043861105223186314, + "grad_norm": 9.576719284057617, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8678230047225952, + "num_tokens": 826010010.0, + "step": 21647 + }, + { + "epoch": 2.753848110927363, + "ewc_loss": 0.08135303854942322, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043999520130455494, + "grad_norm": 9.615267753601074, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8663052320480347, + "num_tokens": 826048348.0, + "step": 21648 + }, + { + "epoch": 2.7539753212059535, + "ewc_loss": 0.08115760236978531, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043804087908938527, + "grad_norm": 9.49069881439209, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8621373176574707, + "num_tokens": 826087807.0, + "step": 21649 + }, + { + "epoch": 2.754102531484544, + "ewc_loss": 0.08154009282588959, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044186573359183967, + "grad_norm": 9.635749816894531, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8609565496444702, + "num_tokens": 826124657.0, + "step": 21650 + }, + { + "epoch": 2.7542297417631345, + "ewc_loss": 0.08112762123346329, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043774105142802, + "grad_norm": 9.56219482421875, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8675259351730347, + "num_tokens": 826164190.0, + "step": 21651 + }, + { + "epoch": 2.754356952041725, + "ewc_loss": 0.08174851536750793, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004439500335138291, + "grad_norm": 9.63333797454834, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8669978380203247, + "num_tokens": 826202876.0, + "step": 21652 + }, + { + "epoch": 2.7544841623203156, + "ewc_loss": 0.08110293745994568, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004374942509457469, + "grad_norm": 9.492630958557129, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8633569478988647, + "num_tokens": 826242955.0, + "step": 21653 + }, + { + "epoch": 2.754611372598906, + "ewc_loss": 0.0818723514676094, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004451883723959327, + "grad_norm": 9.649563789367676, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.875402569770813, + "num_tokens": 826279925.0, + "step": 21654 + }, + { + "epoch": 2.7547385828774966, + "ewc_loss": 0.08113357424736023, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004378005978651345, + "grad_norm": 9.512067794799805, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8743016719818115, + "num_tokens": 826314076.0, + "step": 21655 + }, + { + "epoch": 2.754865793156087, + "ewc_loss": 0.08187593519687653, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004452241992112249, + "grad_norm": 9.593483924865723, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8621829748153687, + "num_tokens": 826354138.0, + "step": 21656 + }, + { + "epoch": 2.7549930034346777, + "ewc_loss": 0.08135701715946198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044003501534461975, + "grad_norm": 9.523937225341797, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8603256940841675, + "num_tokens": 826395393.0, + "step": 21657 + }, + { + "epoch": 2.755120213713268, + "ewc_loss": 0.0821605920791626, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044562938273884356, + "grad_norm": 11.50452995300293, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8545694351196289, + "num_tokens": 826437539.0, + "step": 21658 + }, + { + "epoch": 2.7552474239918583, + "ewc_loss": 0.08035814762115479, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004300463479012251, + "grad_norm": 9.239439010620117, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8576794266700745, + "num_tokens": 826475857.0, + "step": 21659 + }, + { + "epoch": 2.7553746342704493, + "ewc_loss": 0.08614747226238251, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00048793957103043795, + "grad_norm": 10.337890625, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.873562216758728, + "num_tokens": 826513956.0, + "step": 21660 + }, + { + "epoch": 2.7555018445490393, + "ewc_loss": 0.0799831748008728, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00042629658128134906, + "grad_norm": 9.196954727172852, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8624907732009888, + "num_tokens": 826552747.0, + "step": 21661 + }, + { + "epoch": 2.7556290548276303, + "ewc_loss": 0.0868065357208252, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004945301916450262, + "grad_norm": 10.385811805725098, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8715865612030029, + "num_tokens": 826590830.0, + "step": 21662 + }, + { + "epoch": 2.7557562651062204, + "ewc_loss": 0.08105044066905975, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004369692469481379, + "grad_norm": 9.411036491394043, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8680843710899353, + "num_tokens": 826633414.0, + "step": 21663 + }, + { + "epoch": 2.755883475384811, + "ewc_loss": 0.08566996455192566, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00048316444735974073, + "grad_norm": 10.28857421875, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8739006519317627, + "num_tokens": 826668344.0, + "step": 21664 + }, + { + "epoch": 2.7560106856634015, + "ewc_loss": 0.08180485665798187, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004445133963599801, + "grad_norm": 14.890778541564941, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8503767251968384, + "num_tokens": 826710587.0, + "step": 21665 + }, + { + "epoch": 2.756137895941992, + "ewc_loss": 0.0858723446726799, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00048518829862587154, + "grad_norm": 9.734817504882812, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8551232218742371, + "num_tokens": 826748854.0, + "step": 21666 + }, + { + "epoch": 2.7562651062205825, + "ewc_loss": 0.08721110224723816, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004985758569091558, + "grad_norm": 10.556280136108398, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8518511056900024, + "num_tokens": 826789591.0, + "step": 21667 + }, + { + "epoch": 2.756392316499173, + "ewc_loss": 0.08169172704219818, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004433821013662964, + "grad_norm": 9.41136646270752, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8607246279716492, + "num_tokens": 826831890.0, + "step": 21668 + }, + { + "epoch": 2.7565195267777636, + "ewc_loss": 0.08872053027153015, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005136701511219144, + "grad_norm": 10.606409072875977, + "learning_rate": 1e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8467502593994141, + "num_tokens": 826870265.0, + "step": 21669 + }, + { + "epoch": 2.756646737056354, + "ewc_loss": 0.0822460800409317, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004489256825763732, + "grad_norm": 9.595287322998047, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8642120361328125, + "num_tokens": 826910993.0, + "step": 21670 + }, + { + "epoch": 2.7567739473349446, + "ewc_loss": 0.08707596361637115, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004972245078533888, + "grad_norm": 10.464396476745605, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8632378578186035, + "num_tokens": 826947492.0, + "step": 21671 + }, + { + "epoch": 2.756901157613535, + "ewc_loss": 0.08252574503421783, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004517223278526217, + "grad_norm": 9.646142959594727, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8675844073295593, + "num_tokens": 826987215.0, + "step": 21672 + }, + { + "epoch": 2.7570283678921257, + "ewc_loss": 0.08591410517692566, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00048560593859292567, + "grad_norm": 10.266558647155762, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8613309860229492, + "num_tokens": 827024092.0, + "step": 21673 + }, + { + "epoch": 2.757155578170716, + "ewc_loss": 0.08203045278787613, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004467693797778338, + "grad_norm": 9.624576568603516, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8481917381286621, + "num_tokens": 827060541.0, + "step": 21674 + }, + { + "epoch": 2.7572827884493067, + "ewc_loss": 0.08522719144821167, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000478736765217036, + "grad_norm": 10.153755187988281, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8638049364089966, + "num_tokens": 827101205.0, + "step": 21675 + }, + { + "epoch": 2.7574099987278973, + "ewc_loss": 0.08196644484996796, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004461292992345989, + "grad_norm": 9.651430130004883, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8582342863082886, + "num_tokens": 827132441.0, + "step": 21676 + }, + { + "epoch": 2.757537209006488, + "ewc_loss": 0.08413566648960114, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004678215482272208, + "grad_norm": 15.163443565368652, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8649537563323975, + "num_tokens": 827169530.0, + "step": 21677 + }, + { + "epoch": 2.7576644192850783, + "ewc_loss": 0.09171247482299805, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0005435896455310285, + "grad_norm": 10.773605346679688, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8702639937400818, + "num_tokens": 827207352.0, + "step": 21678 + }, + { + "epoch": 2.757791629563669, + "ewc_loss": 0.08343270421028137, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046079192543402314, + "grad_norm": 9.897781372070312, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8582974076271057, + "num_tokens": 827242790.0, + "step": 21679 + }, + { + "epoch": 2.7579188398422594, + "ewc_loss": 0.08446970582008362, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004711619403678924, + "grad_norm": 10.145689964294434, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8663136959075928, + "num_tokens": 827281676.0, + "step": 21680 + }, + { + "epoch": 2.75804605012085, + "ewc_loss": 0.08287779241800308, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004552427853923291, + "grad_norm": 9.752760887145996, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8774197697639465, + "num_tokens": 827313676.0, + "step": 21681 + }, + { + "epoch": 2.7581732603994404, + "ewc_loss": 0.08411405980587006, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004676053940784186, + "grad_norm": 10.09795093536377, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8646950125694275, + "num_tokens": 827355017.0, + "step": 21682 + }, + { + "epoch": 2.758300470678031, + "ewc_loss": 0.0816223993897438, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000442688848124817, + "grad_norm": 9.566568374633789, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8819406032562256, + "num_tokens": 827391773.0, + "step": 21683 + }, + { + "epoch": 2.758427680956621, + "ewc_loss": 0.0840541273355484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046700608800165355, + "grad_norm": 10.07933521270752, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8810559511184692, + "num_tokens": 827433034.0, + "step": 21684 + }, + { + "epoch": 2.758554891235212, + "ewc_loss": 0.08147536963224411, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412185517139733, + "grad_norm": 9.606820106506348, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8763588666915894, + "num_tokens": 827475729.0, + "step": 21685 + }, + { + "epoch": 2.758682101513802, + "ewc_loss": 0.08321839570999146, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004586488357745111, + "grad_norm": 9.917821884155273, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8688807487487793, + "num_tokens": 827514127.0, + "step": 21686 + }, + { + "epoch": 2.758809311792393, + "ewc_loss": 0.08156037330627441, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044206861639395356, + "grad_norm": 9.626672744750977, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8768398761749268, + "num_tokens": 827546709.0, + "step": 21687 + }, + { + "epoch": 2.758936522070983, + "ewc_loss": 0.08268965780735016, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004533614555839449, + "grad_norm": 9.812041282653809, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8843033909797668, + "num_tokens": 827581450.0, + "step": 21688 + }, + { + "epoch": 2.7590637323495737, + "ewc_loss": 0.08174079656600952, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044387279194779694, + "grad_norm": 9.715921401977539, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8615221381187439, + "num_tokens": 827623161.0, + "step": 21689 + }, + { + "epoch": 2.7591909426281642, + "ewc_loss": 0.08212863653898239, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044775119749829173, + "grad_norm": 9.66472053527832, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8610888123512268, + "num_tokens": 827659460.0, + "step": 21690 + }, + { + "epoch": 2.7593181529067548, + "ewc_loss": 0.0819622352719307, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000446087185991928, + "grad_norm": 9.66354751586914, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8601195812225342, + "num_tokens": 827701110.0, + "step": 21691 + }, + { + "epoch": 2.7594453631853453, + "ewc_loss": 0.08202899992465973, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000446754798758775, + "grad_norm": 9.714763641357422, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8622382283210754, + "num_tokens": 827738992.0, + "step": 21692 + }, + { + "epoch": 2.759572573463936, + "ewc_loss": 0.08175025880336761, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004439673793967813, + "grad_norm": 9.62983226776123, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8502609729766846, + "num_tokens": 827775853.0, + "step": 21693 + }, + { + "epoch": 2.7596997837425263, + "ewc_loss": 0.08178847283124924, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044434957089833915, + "grad_norm": 9.605894088745117, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8674318194389343, + "num_tokens": 827819438.0, + "step": 21694 + }, + { + "epoch": 2.759826994021117, + "ewc_loss": 0.08186760544776917, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004451409331522882, + "grad_norm": 9.59029769897461, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8582878112792969, + "num_tokens": 827858833.0, + "step": 21695 + }, + { + "epoch": 2.7599542042997074, + "ewc_loss": 0.08207370340824127, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004472018626984209, + "grad_norm": 9.73701286315918, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.856579065322876, + "num_tokens": 827895069.0, + "step": 21696 + }, + { + "epoch": 2.760081414578298, + "ewc_loss": 0.08158177137374878, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000442282558651641, + "grad_norm": 9.545096397399902, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.880128800868988, + "num_tokens": 827932970.0, + "step": 21697 + }, + { + "epoch": 2.7602086248568884, + "ewc_loss": 0.08245846629142761, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045104947639629245, + "grad_norm": 9.721280097961426, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8768481612205505, + "num_tokens": 827981859.0, + "step": 21698 + }, + { + "epoch": 2.760335835135479, + "ewc_loss": 0.08143466711044312, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044081147643737495, + "grad_norm": 9.575472831726074, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8590636849403381, + "num_tokens": 828022018.0, + "step": 21699 + }, + { + "epoch": 2.7604630454140695, + "ewc_loss": 0.08236140757799149, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004500789218582213, + "grad_norm": 9.677583694458008, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8596124649047852, + "num_tokens": 828061546.0, + "step": 21700 + }, + { + "epoch": 2.76059025569266, + "ewc_loss": 0.08147435635328293, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412083944771439, + "grad_norm": 9.541364669799805, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8514789342880249, + "num_tokens": 828104464.0, + "step": 21701 + }, + { + "epoch": 2.7607174659712506, + "ewc_loss": 0.08238153159618378, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045028014574199915, + "grad_norm": 9.754157066345215, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8541034460067749, + "num_tokens": 828142527.0, + "step": 21702 + }, + { + "epoch": 2.760844676249841, + "ewc_loss": 0.08124499022960663, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043891475070267916, + "grad_norm": 9.455355644226074, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8665106892585754, + "num_tokens": 828189670.0, + "step": 21703 + }, + { + "epoch": 2.7609718865284316, + "ewc_loss": 0.08270974457263947, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004535623302217573, + "grad_norm": 9.821070671081543, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8724759817123413, + "num_tokens": 828231511.0, + "step": 21704 + }, + { + "epoch": 2.761099096807022, + "ewc_loss": 0.08107082545757294, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004371731192804873, + "grad_norm": 9.460480690002441, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8655165433883667, + "num_tokens": 828274387.0, + "step": 21705 + }, + { + "epoch": 2.7612263070856127, + "ewc_loss": 0.08299049735069275, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045636980212293565, + "grad_norm": 9.794925689697266, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8574913144111633, + "num_tokens": 828312263.0, + "step": 21706 + }, + { + "epoch": 2.7613535173642028, + "ewc_loss": 0.08117865025997162, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043825129978358746, + "grad_norm": 9.499218940734863, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8662651777267456, + "num_tokens": 828345550.0, + "step": 21707 + }, + { + "epoch": 2.7614807276427937, + "ewc_loss": 0.08284273743629456, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004548922006506473, + "grad_norm": 9.799636840820312, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.852475643157959, + "num_tokens": 828378633.0, + "step": 21708 + }, + { + "epoch": 2.761607937921384, + "ewc_loss": 0.08111262321472168, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000437591050285846, + "grad_norm": 9.457258224487305, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8571264743804932, + "num_tokens": 828414292.0, + "step": 21709 + }, + { + "epoch": 2.761735148199975, + "ewc_loss": 0.08306053280830383, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004570702149067074, + "grad_norm": 9.90318775177002, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8647453188896179, + "num_tokens": 828452682.0, + "step": 21710 + }, + { + "epoch": 2.761862358478565, + "ewc_loss": 0.0810440331697464, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043690516031347215, + "grad_norm": 9.43001937866211, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8817514777183533, + "num_tokens": 828494162.0, + "step": 21711 + }, + { + "epoch": 2.761989568757156, + "ewc_loss": 0.08321085572242737, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045857339864596725, + "grad_norm": 9.863408088684082, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8697584867477417, + "num_tokens": 828525997.0, + "step": 21712 + }, + { + "epoch": 2.762116779035746, + "ewc_loss": 0.08109090477228165, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004373739066068083, + "grad_norm": 9.497536659240723, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8626574277877808, + "num_tokens": 828562488.0, + "step": 21713 + }, + { + "epoch": 2.7622439893143365, + "ewc_loss": 0.08297205716371536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004561854002531618, + "grad_norm": 9.811767578125, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8823803067207336, + "num_tokens": 828591818.0, + "step": 21714 + }, + { + "epoch": 2.762371199592927, + "ewc_loss": 0.08130209147930145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004394857387524098, + "grad_norm": 9.537542343139648, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8533426523208618, + "num_tokens": 828628872.0, + "step": 21715 + }, + { + "epoch": 2.7624984098715175, + "ewc_loss": 0.08271870017051697, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045365188270807266, + "grad_norm": 9.833803176879883, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8733934760093689, + "num_tokens": 828670939.0, + "step": 21716 + }, + { + "epoch": 2.762625620150108, + "ewc_loss": 0.08135199546813965, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043998478213325143, + "grad_norm": 9.490714073181152, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8461130857467651, + "num_tokens": 828708446.0, + "step": 21717 + }, + { + "epoch": 2.7627528304286986, + "ewc_loss": 0.08260345458984375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004524994292296469, + "grad_norm": 9.756400108337402, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8702548146247864, + "num_tokens": 828749873.0, + "step": 21718 + }, + { + "epoch": 2.762880040707289, + "ewc_loss": 0.08147115260362625, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000441176351159811, + "grad_norm": 9.53182315826416, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8580701947212219, + "num_tokens": 828790066.0, + "step": 21719 + }, + { + "epoch": 2.7630072509858796, + "ewc_loss": 0.08237382769584656, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004502031079027802, + "grad_norm": 9.735468864440918, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8622679710388184, + "num_tokens": 828821911.0, + "step": 21720 + }, + { + "epoch": 2.76313446126447, + "ewc_loss": 0.08155924081802368, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000442057236796245, + "grad_norm": 9.579439163208008, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8581430912017822, + "num_tokens": 828856572.0, + "step": 21721 + }, + { + "epoch": 2.7632616715430607, + "ewc_loss": 0.08216820657253265, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004481468931771815, + "grad_norm": 9.712618827819824, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8764243125915527, + "num_tokens": 828898865.0, + "step": 21722 + }, + { + "epoch": 2.763388881821651, + "ewc_loss": 0.08152638375759125, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044172865455038846, + "grad_norm": 9.553276062011719, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8728474378585815, + "num_tokens": 828933768.0, + "step": 21723 + }, + { + "epoch": 2.7635160921002417, + "ewc_loss": 0.08230606466531754, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044952548341825604, + "grad_norm": 9.709582328796387, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.880498468875885, + "num_tokens": 828968869.0, + "step": 21724 + }, + { + "epoch": 2.7636433023788323, + "ewc_loss": 0.08154001832008362, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044186506420373917, + "grad_norm": 9.639697074890137, + "learning_rate": 1e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8494243621826172, + "num_tokens": 829005621.0, + "step": 21725 + }, + { + "epoch": 2.763770512657423, + "ewc_loss": 0.08192369341850281, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044570182217285037, + "grad_norm": 9.691498756408691, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8526036739349365, + "num_tokens": 829045046.0, + "step": 21726 + }, + { + "epoch": 2.7638977229360133, + "ewc_loss": 0.08167913556098938, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004432561690919101, + "grad_norm": 9.52507495880127, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8783981204032898, + "num_tokens": 829084588.0, + "step": 21727 + }, + { + "epoch": 2.764024933214604, + "ewc_loss": 0.08188614249229431, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044532629544846714, + "grad_norm": 9.65817928314209, + "learning_rate": 1e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8474804162979126, + "num_tokens": 829121010.0, + "step": 21728 + }, + { + "epoch": 2.7641521434931944, + "ewc_loss": 0.08135949820280075, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044005984091199934, + "grad_norm": 9.54256820678711, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8745853304862976, + "num_tokens": 829159618.0, + "step": 21729 + }, + { + "epoch": 2.764279353771785, + "ewc_loss": 0.08202892541885376, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044675410026684403, + "grad_norm": 9.595767974853516, + "learning_rate": 1e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8520381450653076, + "num_tokens": 829204826.0, + "step": 21730 + }, + { + "epoch": 2.7644065640503754, + "ewc_loss": 0.08159304410219193, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044239527778699994, + "grad_norm": 9.584029197692871, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8750416040420532, + "num_tokens": 829240741.0, + "step": 21731 + }, + { + "epoch": 2.7645337743289655, + "ewc_loss": 0.0818810909986496, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004452758003026247, + "grad_norm": 9.62183952331543, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8843631744384766, + "num_tokens": 829278065.0, + "step": 21732 + }, + { + "epoch": 2.7646609846075565, + "ewc_loss": 0.08168253302574158, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004432901623658836, + "grad_norm": 9.589006423950195, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8618268966674805, + "num_tokens": 829319438.0, + "step": 21733 + }, + { + "epoch": 2.7647881948861466, + "ewc_loss": 0.08175539970397949, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004440188058651984, + "grad_norm": 9.588590621948242, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8755718469619751, + "num_tokens": 829357299.0, + "step": 21734 + }, + { + "epoch": 2.7649154051647375, + "ewc_loss": 0.08174336701631546, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004438985197339207, + "grad_norm": 9.559805870056152, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8670097589492798, + "num_tokens": 829398600.0, + "step": 21735 + }, + { + "epoch": 2.7650426154433276, + "ewc_loss": 0.081776924431324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044423408689908683, + "grad_norm": 9.674999237060547, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8534830808639526, + "num_tokens": 829439214.0, + "step": 21736 + }, + { + "epoch": 2.7651698257219186, + "ewc_loss": 0.08152950555086136, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000441759912064299, + "grad_norm": 9.560887336730957, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8535606265068054, + "num_tokens": 829477634.0, + "step": 21737 + }, + { + "epoch": 2.7652970360005087, + "ewc_loss": 0.08190639317035675, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044552877079695463, + "grad_norm": 9.641353607177734, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8712692260742188, + "num_tokens": 829514527.0, + "step": 21738 + }, + { + "epoch": 2.765424246279099, + "ewc_loss": 0.08150845766067505, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044154946226626635, + "grad_norm": 9.636550903320312, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8660933971405029, + "num_tokens": 829558531.0, + "step": 21739 + }, + { + "epoch": 2.7655514565576897, + "ewc_loss": 0.0817381739616394, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004438465693965554, + "grad_norm": 9.666555404663086, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8685568571090698, + "num_tokens": 829600769.0, + "step": 21740 + }, + { + "epoch": 2.7656786668362803, + "ewc_loss": 0.08138716965913773, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004403365310281515, + "grad_norm": 9.602540016174316, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8692964315414429, + "num_tokens": 829635938.0, + "step": 21741 + }, + { + "epoch": 2.765805877114871, + "ewc_loss": 0.08183496445417404, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000444814475486055, + "grad_norm": 9.663999557495117, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8643856644630432, + "num_tokens": 829672956.0, + "step": 21742 + }, + { + "epoch": 2.7659330873934613, + "ewc_loss": 0.08139613270759583, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004404261999297887, + "grad_norm": 9.549445152282715, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.864605188369751, + "num_tokens": 829712658.0, + "step": 21743 + }, + { + "epoch": 2.766060297672052, + "ewc_loss": 0.08196262270212173, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004460910859052092, + "grad_norm": 9.725066184997559, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8665964007377625, + "num_tokens": 829755429.0, + "step": 21744 + }, + { + "epoch": 2.7661875079506424, + "ewc_loss": 0.08136610686779022, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004401259357109666, + "grad_norm": 9.531469345092773, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.863379955291748, + "num_tokens": 829796373.0, + "step": 21745 + }, + { + "epoch": 2.766314718229233, + "ewc_loss": 0.08210846781730652, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000447549537057057, + "grad_norm": 9.717568397521973, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8598071336746216, + "num_tokens": 829834049.0, + "step": 21746 + }, + { + "epoch": 2.7664419285078234, + "ewc_loss": 0.08133544772863388, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004398193268571049, + "grad_norm": 9.566338539123535, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8645861148834229, + "num_tokens": 829870853.0, + "step": 21747 + }, + { + "epoch": 2.766569138786414, + "ewc_loss": 0.0822700783610344, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044916561455465853, + "grad_norm": 9.798657417297363, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8590103387832642, + "num_tokens": 829907519.0, + "step": 21748 + }, + { + "epoch": 2.7666963490650045, + "ewc_loss": 0.08107047528028488, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043716959771700203, + "grad_norm": 9.532588005065918, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8519392013549805, + "num_tokens": 829947162.0, + "step": 21749 + }, + { + "epoch": 2.766823559343595, + "ewc_loss": 0.08248065412044525, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045127139310352504, + "grad_norm": 9.813323974609375, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8629305362701416, + "num_tokens": 829991981.0, + "step": 21750 + }, + { + "epoch": 2.7669507696221856, + "ewc_loss": 0.08107293397188187, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000437194190453738, + "grad_norm": 9.463923454284668, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8720375299453735, + "num_tokens": 830032409.0, + "step": 21751 + }, + { + "epoch": 2.767077979900776, + "ewc_loss": 0.08263744413852692, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045283930376172066, + "grad_norm": 9.778712272644043, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8673135042190552, + "num_tokens": 830072738.0, + "step": 21752 + }, + { + "epoch": 2.7672051901793666, + "ewc_loss": 0.08167797327041626, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004383617197163403, + "grad_norm": 9.516518592834473, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8603043556213379, + "num_tokens": 830110880.0, + "step": 21753 + }, + { + "epoch": 2.767332400457957, + "ewc_loss": 0.08267134428024292, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004531783051788807, + "grad_norm": 9.839188575744629, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8596598505973816, + "num_tokens": 830150148.0, + "step": 21754 + }, + { + "epoch": 2.7674596107365477, + "ewc_loss": 0.08114471286535263, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004379119782242924, + "grad_norm": 9.56442642211914, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8522176742553711, + "num_tokens": 830190717.0, + "step": 21755 + }, + { + "epoch": 2.767586821015138, + "ewc_loss": 0.08241746574640274, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045063949073664844, + "grad_norm": 9.748390197753906, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8668097257614136, + "num_tokens": 830226972.0, + "step": 21756 + }, + { + "epoch": 2.7677140312937283, + "ewc_loss": 0.08127345144748688, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000439199386164546, + "grad_norm": 9.538202285766602, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8746429681777954, + "num_tokens": 830262612.0, + "step": 21757 + }, + { + "epoch": 2.7678412415723193, + "ewc_loss": 0.08234582096338272, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000449923041742295, + "grad_norm": 9.785822868347168, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8708094358444214, + "num_tokens": 830306856.0, + "step": 21758 + }, + { + "epoch": 2.7679684518509093, + "ewc_loss": 0.08118791878223419, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004383440245874226, + "grad_norm": 9.549105644226074, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8632686138153076, + "num_tokens": 830340726.0, + "step": 21759 + }, + { + "epoch": 2.7680956621295003, + "ewc_loss": 0.08238375186920166, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004503023810684681, + "grad_norm": 9.75979995727539, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8642964363098145, + "num_tokens": 830380697.0, + "step": 21760 + }, + { + "epoch": 2.7682228724080904, + "ewc_loss": 0.0811413824558258, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043787871254608035, + "grad_norm": 9.528409957885742, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8421758413314819, + "num_tokens": 830414737.0, + "step": 21761 + }, + { + "epoch": 2.768350082686681, + "ewc_loss": 0.08248800039291382, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045134485117159784, + "grad_norm": 9.771293640136719, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8656693696975708, + "num_tokens": 830456518.0, + "step": 21762 + }, + { + "epoch": 2.7684772929652715, + "ewc_loss": 0.08145745098590851, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004385978973004967, + "grad_norm": 9.56775188446045, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8617678284645081, + "num_tokens": 830494192.0, + "step": 21763 + }, + { + "epoch": 2.768604503243862, + "ewc_loss": 0.08239465951919556, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004504114913288504, + "grad_norm": 9.756669044494629, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8568038940429688, + "num_tokens": 830533659.0, + "step": 21764 + }, + { + "epoch": 2.7687317135224525, + "ewc_loss": 0.08129443973302841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043940922478213906, + "grad_norm": 9.563647270202637, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.858531653881073, + "num_tokens": 830569605.0, + "step": 21765 + }, + { + "epoch": 2.768858923801043, + "ewc_loss": 0.08224332332611084, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044889809214510024, + "grad_norm": 9.753344535827637, + "learning_rate": 1e-06, + "loss": 0.544, + "mean_token_accuracy": 0.843712329864502, + "num_tokens": 830610814.0, + "step": 21766 + }, + { + "epoch": 2.7689861340796336, + "ewc_loss": 0.0814155787229538, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044062064262107015, + "grad_norm": 9.597978591918945, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8643864989280701, + "num_tokens": 830646811.0, + "step": 21767 + }, + { + "epoch": 2.769113344358224, + "ewc_loss": 0.08225002884864807, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044896514737047255, + "grad_norm": 9.731037139892578, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8698453903198242, + "num_tokens": 830680984.0, + "step": 21768 + }, + { + "epoch": 2.7692405546368146, + "ewc_loss": 0.08142843842506409, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044074925244785845, + "grad_norm": 9.599465370178223, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8647519946098328, + "num_tokens": 830713384.0, + "step": 21769 + }, + { + "epoch": 2.769367764915405, + "ewc_loss": 0.08209943771362305, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004474591987673193, + "grad_norm": 9.718883514404297, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8518044948577881, + "num_tokens": 830746922.0, + "step": 21770 + }, + { + "epoch": 2.7694949751939957, + "ewc_loss": 0.08142082393169403, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044067302951589227, + "grad_norm": 9.559263229370117, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8721792697906494, + "num_tokens": 830789459.0, + "step": 21771 + }, + { + "epoch": 2.769622185472586, + "ewc_loss": 0.08203995227813721, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004468643746804446, + "grad_norm": 9.721014976501465, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.862316370010376, + "num_tokens": 830826736.0, + "step": 21772 + }, + { + "epoch": 2.7697493957511767, + "ewc_loss": 0.08150476217269897, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004415125004015863, + "grad_norm": 9.60142707824707, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8668394088745117, + "num_tokens": 830868118.0, + "step": 21773 + }, + { + "epoch": 2.7698766060297673, + "ewc_loss": 0.08182689547538757, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044473379966802895, + "grad_norm": 9.65041732788086, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8622614741325378, + "num_tokens": 830911658.0, + "step": 21774 + }, + { + "epoch": 2.770003816308358, + "ewc_loss": 0.08147791773080826, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412440175656229, + "grad_norm": 9.568912506103516, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8649783730506897, + "num_tokens": 830948456.0, + "step": 21775 + }, + { + "epoch": 2.7701310265869483, + "ewc_loss": 0.08215290307998657, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044555243221111596, + "grad_norm": 9.723274230957031, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8740200996398926, + "num_tokens": 830988790.0, + "step": 21776 + }, + { + "epoch": 2.770258236865539, + "ewc_loss": 0.08126388490200043, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043910369277000427, + "grad_norm": 9.516046524047852, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8574200868606567, + "num_tokens": 831024551.0, + "step": 21777 + }, + { + "epoch": 2.7703854471441294, + "ewc_loss": 0.082175612449646, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044822096242569387, + "grad_norm": 9.70742416381836, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8615231513977051, + "num_tokens": 831060082.0, + "step": 21778 + }, + { + "epoch": 2.77051265742272, + "ewc_loss": 0.0815320834517479, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004417856689542532, + "grad_norm": 9.632948875427246, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8651654720306396, + "num_tokens": 831100067.0, + "step": 21779 + }, + { + "epoch": 2.77063986770131, + "ewc_loss": 0.08204193413257599, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004468842234928161, + "grad_norm": 9.75629997253418, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8747243881225586, + "num_tokens": 831131923.0, + "step": 21780 + }, + { + "epoch": 2.770767077979901, + "ewc_loss": 0.0813232809305191, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004396977019496262, + "grad_norm": 9.625248908996582, + "learning_rate": 1e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8483824133872986, + "num_tokens": 831167985.0, + "step": 21781 + }, + { + "epoch": 2.770894288258491, + "ewc_loss": 0.08194397389888763, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000445904559455812, + "grad_norm": 9.690716743469238, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8596192598342896, + "num_tokens": 831211737.0, + "step": 21782 + }, + { + "epoch": 2.771021498537082, + "ewc_loss": 0.08179518580436707, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004419753095135093, + "grad_norm": 9.684499740600586, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.853365421295166, + "num_tokens": 831248764.0, + "step": 21783 + }, + { + "epoch": 2.771148708815672, + "ewc_loss": 0.08178746700286865, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000441898126155138, + "grad_norm": 9.730348587036133, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8707329630851746, + "num_tokens": 831284140.0, + "step": 21784 + }, + { + "epoch": 2.771275919094263, + "ewc_loss": 0.08145052194595337, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044097009231336415, + "grad_norm": 9.674577713012695, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8677586317062378, + "num_tokens": 831318567.0, + "step": 21785 + }, + { + "epoch": 2.771403129372853, + "ewc_loss": 0.08144126832485199, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004408775712363422, + "grad_norm": 9.618544578552246, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8583806157112122, + "num_tokens": 831357128.0, + "step": 21786 + }, + { + "epoch": 2.7715303396514437, + "ewc_loss": 0.08143462985754013, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044081112719140947, + "grad_norm": 9.681351661682129, + "learning_rate": 1e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8423657417297363, + "num_tokens": 831391551.0, + "step": 21787 + }, + { + "epoch": 2.771657549930034, + "ewc_loss": 0.08123762905597687, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043884108890779316, + "grad_norm": 9.645145416259766, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8613638877868652, + "num_tokens": 831430332.0, + "step": 21788 + }, + { + "epoch": 2.7717847602086247, + "ewc_loss": 0.08148175477981567, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412823764141649, + "grad_norm": 9.571950912475586, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8694082498550415, + "num_tokens": 831471639.0, + "step": 21789 + }, + { + "epoch": 2.7719119704872153, + "ewc_loss": 0.08140422403812408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044050710857845843, + "grad_norm": 9.631455421447754, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8691245317459106, + "num_tokens": 831506470.0, + "step": 21790 + }, + { + "epoch": 2.772039180765806, + "ewc_loss": 0.0814591646194458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044105647248215973, + "grad_norm": 9.723501205444336, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8579325675964355, + "num_tokens": 831547212.0, + "step": 21791 + }, + { + "epoch": 2.7721663910443963, + "ewc_loss": 0.08123494684696198, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004388142842799425, + "grad_norm": 9.730212211608887, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8502887487411499, + "num_tokens": 831593181.0, + "step": 21792 + }, + { + "epoch": 2.772293601322987, + "ewc_loss": 0.08137800544500351, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044024488306604326, + "grad_norm": 9.803143501281738, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8802511692047119, + "num_tokens": 831624889.0, + "step": 21793 + }, + { + "epoch": 2.7724208116015774, + "ewc_loss": 0.08108846098184586, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004349080554675311, + "grad_norm": 9.567508697509766, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8710063099861145, + "num_tokens": 831662389.0, + "step": 21794 + }, + { + "epoch": 2.772548021880168, + "ewc_loss": 0.0815884992480278, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004423498176038265, + "grad_norm": 9.62283706665039, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8669666051864624, + "num_tokens": 831705282.0, + "step": 21795 + }, + { + "epoch": 2.7726752321587584, + "ewc_loss": 0.08123818039894104, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043884661863557994, + "grad_norm": 9.63545036315918, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.872246503829956, + "num_tokens": 831741604.0, + "step": 21796 + }, + { + "epoch": 2.772802442437349, + "ewc_loss": 0.08145717531442642, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004410365945659578, + "grad_norm": 9.638370513916016, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.853962242603302, + "num_tokens": 831784165.0, + "step": 21797 + }, + { + "epoch": 2.7729296527159395, + "ewc_loss": 0.0814414992928505, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004408798413351178, + "grad_norm": 9.653069496154785, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8684563636779785, + "num_tokens": 831821101.0, + "step": 21798 + }, + { + "epoch": 2.77305686299453, + "ewc_loss": 0.08162818849086761, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044030530261807144, + "grad_norm": 9.628050804138184, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8675098419189453, + "num_tokens": 831855766.0, + "step": 21799 + }, + { + "epoch": 2.7731840732731206, + "ewc_loss": 0.08149589598178864, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004414238501340151, + "grad_norm": 9.588058471679688, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8693163990974426, + "num_tokens": 831895692.0, + "step": 21800 + }, + { + "epoch": 2.773311283551711, + "ewc_loss": 0.08146286755800247, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044109352165833116, + "grad_norm": 9.641130447387695, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.882201611995697, + "num_tokens": 831931079.0, + "step": 21801 + }, + { + "epoch": 2.7734384938303016, + "ewc_loss": 0.08124113082885742, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000438876188127324, + "grad_norm": 9.599447250366211, + "learning_rate": 1e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8370862007141113, + "num_tokens": 831969742.0, + "step": 21802 + }, + { + "epoch": 2.773565704108892, + "ewc_loss": 0.08185514807701111, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004450163687579334, + "grad_norm": 9.62165641784668, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8541106581687927, + "num_tokens": 832011059.0, + "step": 21803 + }, + { + "epoch": 2.7736929143874827, + "ewc_loss": 0.08149977028369904, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000441462587332353, + "grad_norm": 9.590415954589844, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8750247955322266, + "num_tokens": 832042768.0, + "step": 21804 + }, + { + "epoch": 2.7738201246660728, + "ewc_loss": 0.08169560134410858, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004434208385646343, + "grad_norm": 9.654019355773926, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.873112142086029, + "num_tokens": 832077699.0, + "step": 21805 + }, + { + "epoch": 2.7739473349446637, + "ewc_loss": 0.0817803218960762, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004442680801730603, + "grad_norm": 9.6050386428833, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8730825185775757, + "num_tokens": 832113528.0, + "step": 21806 + }, + { + "epoch": 2.774074545223254, + "ewc_loss": 0.08176808059215546, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004441456403583288, + "grad_norm": 9.639847755432129, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8681791424751282, + "num_tokens": 832149108.0, + "step": 21807 + }, + { + "epoch": 2.774201755501845, + "ewc_loss": 0.08153977990150452, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004418626776896417, + "grad_norm": 9.630335807800293, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8819359540939331, + "num_tokens": 832189279.0, + "step": 21808 + }, + { + "epoch": 2.774328965780435, + "ewc_loss": 0.08165685832500458, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044303343747742474, + "grad_norm": 9.569941520690918, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8662818670272827, + "num_tokens": 832224423.0, + "step": 21809 + }, + { + "epoch": 2.774456176059026, + "ewc_loss": 0.08189424127340317, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044540726230479777, + "grad_norm": 9.759270668029785, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.858648419380188, + "num_tokens": 832260389.0, + "step": 21810 + }, + { + "epoch": 2.774583386337616, + "ewc_loss": 0.08127947151660919, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004392595437821001, + "grad_norm": 9.58399486541748, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.863215446472168, + "num_tokens": 832297591.0, + "step": 21811 + }, + { + "epoch": 2.7747105966162064, + "ewc_loss": 0.08202464878559113, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004467112885322422, + "grad_norm": 9.723152160644531, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8673248291015625, + "num_tokens": 832330195.0, + "step": 21812 + }, + { + "epoch": 2.774837806894797, + "ewc_loss": 0.08115541934967041, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004380190512165427, + "grad_norm": 9.577469825744629, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8707612752914429, + "num_tokens": 832361613.0, + "step": 21813 + }, + { + "epoch": 2.7749650171733875, + "ewc_loss": 0.08185788244009018, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004450436681509018, + "grad_norm": 9.677996635437012, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8708170056343079, + "num_tokens": 832402800.0, + "step": 21814 + }, + { + "epoch": 2.775092227451978, + "ewc_loss": 0.08128602802753448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043932508560828865, + "grad_norm": 9.575240135192871, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8666332960128784, + "num_tokens": 832445161.0, + "step": 21815 + }, + { + "epoch": 2.7752194377305686, + "ewc_loss": 0.08169230818748474, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004433879512362182, + "grad_norm": 9.669753074645996, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8495201468467712, + "num_tokens": 832487905.0, + "step": 21816 + }, + { + "epoch": 2.775346648009159, + "ewc_loss": 0.08131301403045654, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043959502363577485, + "grad_norm": 9.569422721862793, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8653625249862671, + "num_tokens": 832521107.0, + "step": 21817 + }, + { + "epoch": 2.7754738582877496, + "ewc_loss": 0.08173646032810211, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044382945634424686, + "grad_norm": 9.73132610321045, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8614808320999146, + "num_tokens": 832557213.0, + "step": 21818 + }, + { + "epoch": 2.77560106856634, + "ewc_loss": 0.08118022978305817, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043826719047501683, + "grad_norm": 9.617140769958496, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8516911268234253, + "num_tokens": 832600445.0, + "step": 21819 + }, + { + "epoch": 2.7757282788449307, + "ewc_loss": 0.08177080750465393, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044173153582960367, + "grad_norm": 9.661620140075684, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8648157715797424, + "num_tokens": 832635994.0, + "step": 21820 + }, + { + "epoch": 2.775855489123521, + "ewc_loss": 0.0814930647611618, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043895404087379575, + "grad_norm": 9.640579223632812, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8758292198181152, + "num_tokens": 832677151.0, + "step": 21821 + }, + { + "epoch": 2.7759826994021117, + "ewc_loss": 0.08154051005840302, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004394285788293928, + "grad_norm": 9.55972957611084, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8644452095031738, + "num_tokens": 832713995.0, + "step": 21822 + }, + { + "epoch": 2.7761099096807023, + "ewc_loss": 0.08184491097927094, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000442472577560693, + "grad_norm": 9.698817253112793, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8542464375495911, + "num_tokens": 832754201.0, + "step": 21823 + }, + { + "epoch": 2.776237119959293, + "ewc_loss": 0.08135668933391571, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043759035179391503, + "grad_norm": 9.571393013000488, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8606346845626831, + "num_tokens": 832796920.0, + "step": 21824 + }, + { + "epoch": 2.7763643302378833, + "ewc_loss": 0.08205501735210419, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044457358308136463, + "grad_norm": 9.689885139465332, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8612075448036194, + "num_tokens": 832835952.0, + "step": 21825 + }, + { + "epoch": 2.776491540516474, + "ewc_loss": 0.08135022222995758, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043752568308264017, + "grad_norm": 9.556426048278809, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8685757517814636, + "num_tokens": 832878306.0, + "step": 21826 + }, + { + "epoch": 2.7766187507950644, + "ewc_loss": 0.08214618265628815, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044548523146659136, + "grad_norm": 9.728602409362793, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8567541837692261, + "num_tokens": 832914315.0, + "step": 21827 + }, + { + "epoch": 2.776745961073655, + "ewc_loss": 0.08136805146932602, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043770394404418766, + "grad_norm": 9.514569282531738, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8619562387466431, + "num_tokens": 832956661.0, + "step": 21828 + }, + { + "epoch": 2.7768731713522454, + "ewc_loss": 0.08250097930431366, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004490332503337413, + "grad_norm": 9.785076141357422, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8708590269088745, + "num_tokens": 832999446.0, + "step": 21829 + }, + { + "epoch": 2.7770003816308355, + "ewc_loss": 0.08114150911569595, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043543853098526597, + "grad_norm": 9.541457176208496, + "learning_rate": 1e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.8373720645904541, + "num_tokens": 833039001.0, + "step": 21830 + }, + { + "epoch": 2.7771275919094265, + "ewc_loss": 0.0825817659497261, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044984108535572886, + "grad_norm": 9.814685821533203, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8627611398696899, + "num_tokens": 833074334.0, + "step": 21831 + }, + { + "epoch": 2.7772548021880166, + "ewc_loss": 0.08137324452400208, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004377559234853834, + "grad_norm": 9.5430326461792, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8526973128318787, + "num_tokens": 833113089.0, + "step": 21832 + }, + { + "epoch": 2.7773820124666075, + "ewc_loss": 0.08222416043281555, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044870650162920356, + "grad_norm": 9.772348403930664, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8620471954345703, + "num_tokens": 833149755.0, + "step": 21833 + }, + { + "epoch": 2.7775092227451976, + "ewc_loss": 0.08111537992954254, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043761866982094944, + "grad_norm": 9.5195894241333, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8551195859909058, + "num_tokens": 833187718.0, + "step": 21834 + }, + { + "epoch": 2.7776364330237886, + "ewc_loss": 0.08237439393997192, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004502087540458888, + "grad_norm": 9.765108108520508, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8700786828994751, + "num_tokens": 833221416.0, + "step": 21835 + }, + { + "epoch": 2.7777636433023787, + "ewc_loss": 0.08108281344175339, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043729296885430813, + "grad_norm": 9.52077865600586, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8796325922012329, + "num_tokens": 833261101.0, + "step": 21836 + }, + { + "epoch": 2.777890853580969, + "ewc_loss": 0.08216848969459534, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004481497744563967, + "grad_norm": 9.694114685058594, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8724586963653564, + "num_tokens": 833296804.0, + "step": 21837 + }, + { + "epoch": 2.7780180638595597, + "ewc_loss": 0.0815407931804657, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004394313145894557, + "grad_norm": 9.553190231323242, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8646510243415833, + "num_tokens": 833332005.0, + "step": 21838 + }, + { + "epoch": 2.7781452741381503, + "ewc_loss": 0.08224187046289444, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004464421363081783, + "grad_norm": 9.690910339355469, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8600357174873352, + "num_tokens": 833371991.0, + "step": 21839 + }, + { + "epoch": 2.778272484416741, + "ewc_loss": 0.08167952299118042, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044081866508349776, + "grad_norm": 9.533918380737305, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8771346807479858, + "num_tokens": 833407514.0, + "step": 21840 + }, + { + "epoch": 2.7783996946953313, + "ewc_loss": 0.08231569826602936, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044718041317537427, + "grad_norm": 9.711935997009277, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8626492619514465, + "num_tokens": 833444519.0, + "step": 21841 + }, + { + "epoch": 2.778526904973922, + "ewc_loss": 0.0817670077085495, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044169355533085763, + "grad_norm": 9.558952331542969, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8606576919555664, + "num_tokens": 833482252.0, + "step": 21842 + }, + { + "epoch": 2.7786541152525124, + "ewc_loss": 0.08233416080474854, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004473650478757918, + "grad_norm": 9.679716110229492, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8643796443939209, + "num_tokens": 833515513.0, + "step": 21843 + }, + { + "epoch": 2.778781325531103, + "ewc_loss": 0.08165336400270462, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044055707985535264, + "grad_norm": 9.581375122070312, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8716979026794434, + "num_tokens": 833551188.0, + "step": 21844 + }, + { + "epoch": 2.7789085358096934, + "ewc_loss": 0.08213549852371216, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004453783913049847, + "grad_norm": 9.66029167175293, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8734116554260254, + "num_tokens": 833589159.0, + "step": 21845 + }, + { + "epoch": 2.779035746088284, + "ewc_loss": 0.08176511526107788, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044167457963339984, + "grad_norm": 9.634902000427246, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8535956144332886, + "num_tokens": 833623581.0, + "step": 21846 + }, + { + "epoch": 2.7791629563668745, + "ewc_loss": 0.08207766711711884, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004448000981938094, + "grad_norm": 9.693681716918945, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8524368405342102, + "num_tokens": 833660466.0, + "step": 21847 + }, + { + "epoch": 2.779290166645465, + "ewc_loss": 0.08159538358449936, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004399772733449936, + "grad_norm": 9.58337688446045, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8536845445632935, + "num_tokens": 833697465.0, + "step": 21848 + }, + { + "epoch": 2.7794173769240555, + "ewc_loss": 0.082125723361969, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000445280660642311, + "grad_norm": 9.668001174926758, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8625535368919373, + "num_tokens": 833735178.0, + "step": 21849 + }, + { + "epoch": 2.779544587202646, + "ewc_loss": 0.0815921425819397, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043994482257403433, + "grad_norm": 9.582590103149414, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8798335790634155, + "num_tokens": 833770193.0, + "step": 21850 + }, + { + "epoch": 2.7796717974812366, + "ewc_loss": 0.08197320252656937, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004437554453033954, + "grad_norm": 9.619789123535156, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8652509450912476, + "num_tokens": 833807836.0, + "step": 21851 + }, + { + "epoch": 2.779799007759827, + "ewc_loss": 0.0817374587059021, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004413980059325695, + "grad_norm": 9.622601509094238, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8608489632606506, + "num_tokens": 833847605.0, + "step": 21852 + }, + { + "epoch": 2.7799262180384177, + "ewc_loss": 0.08182835578918457, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044230700586922467, + "grad_norm": 9.56580924987793, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8669102787971497, + "num_tokens": 833884239.0, + "step": 21853 + }, + { + "epoch": 2.780053428317008, + "ewc_loss": 0.08210209012031555, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044504436664283276, + "grad_norm": 9.66530990600586, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8638814687728882, + "num_tokens": 833921305.0, + "step": 21854 + }, + { + "epoch": 2.7801806385955983, + "ewc_loss": 0.08168365061283112, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004408599343150854, + "grad_norm": 9.593928337097168, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.855686366558075, + "num_tokens": 833959297.0, + "step": 21855 + }, + { + "epoch": 2.7803078488741892, + "ewc_loss": 0.08193992078304291, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044342264300212264, + "grad_norm": 9.693012237548828, + "learning_rate": 1e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8391165137290955, + "num_tokens": 833992135.0, + "step": 21856 + }, + { + "epoch": 2.7804350591527793, + "ewc_loss": 0.08190666139125824, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044064861140213907, + "grad_norm": 9.534781455993652, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8536080121994019, + "num_tokens": 834035933.0, + "step": 21857 + }, + { + "epoch": 2.7805622694313703, + "ewc_loss": 0.08225083351135254, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044653177610598505, + "grad_norm": 9.749136924743652, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8634935021400452, + "num_tokens": 834070187.0, + "step": 21858 + }, + { + "epoch": 2.7806894797099604, + "ewc_loss": 0.08126096427440643, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043663306860253215, + "grad_norm": 9.498374938964844, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8667452931404114, + "num_tokens": 834107370.0, + "step": 21859 + }, + { + "epoch": 2.780816689988551, + "ewc_loss": 0.08267360925674438, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004483181401155889, + "grad_norm": 10.040600776672363, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8557544946670532, + "num_tokens": 834145357.0, + "step": 21860 + }, + { + "epoch": 2.7809439002671414, + "ewc_loss": 0.08069420605897903, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004285241011530161, + "grad_norm": 9.361323356628418, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8624492883682251, + "num_tokens": 834187250.0, + "step": 21861 + }, + { + "epoch": 2.781071110545732, + "ewc_loss": 0.08346093446016312, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004561913665384054, + "grad_norm": 16.533199310302734, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8754096031188965, + "num_tokens": 834227598.0, + "step": 21862 + }, + { + "epoch": 2.7811983208243225, + "ewc_loss": 0.09430946409702301, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0005671180551871657, + "grad_norm": 10.929430961608887, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8784793019294739, + "num_tokens": 834267032.0, + "step": 21863 + }, + { + "epoch": 2.781325531102913, + "ewc_loss": 0.0835464745759964, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045948821934871376, + "grad_norm": 9.86229419708252, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8532604575157166, + "num_tokens": 834308479.0, + "step": 21864 + }, + { + "epoch": 2.7814527413815036, + "ewc_loss": 0.08414427936077118, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046302477130666375, + "grad_norm": 10.101612091064453, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8737347722053528, + "num_tokens": 834346185.0, + "step": 21865 + }, + { + "epoch": 2.781579951660094, + "ewc_loss": 0.08415855467319489, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046560895862057805, + "grad_norm": 9.844710350036621, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8728865385055542, + "num_tokens": 834384404.0, + "step": 21866 + }, + { + "epoch": 2.7817071619386846, + "ewc_loss": 0.083210289478302, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045612629037350416, + "grad_norm": 9.913236618041992, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8623607158660889, + "num_tokens": 834424833.0, + "step": 21867 + }, + { + "epoch": 2.781834372217275, + "ewc_loss": 0.08267263323068619, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000450749765150249, + "grad_norm": 9.758564949035645, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8653054237365723, + "num_tokens": 834464215.0, + "step": 21868 + }, + { + "epoch": 2.7819615824958657, + "ewc_loss": 0.08297233283519745, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045374673209153116, + "grad_norm": 9.870882987976074, + "learning_rate": 1e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8419432640075684, + "num_tokens": 834502452.0, + "step": 21869 + }, + { + "epoch": 2.782088792774456, + "ewc_loss": 0.08251574635505676, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044918095227330923, + "grad_norm": 9.790348052978516, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8658779859542847, + "num_tokens": 834539690.0, + "step": 21870 + }, + { + "epoch": 2.7822160030530467, + "ewc_loss": 0.08226338028907776, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004466572718229145, + "grad_norm": 9.731639862060547, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8533535599708557, + "num_tokens": 834583193.0, + "step": 21871 + }, + { + "epoch": 2.7823432133316373, + "ewc_loss": 0.08229053020477295, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004469287523534149, + "grad_norm": 9.727066040039062, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8596240282058716, + "num_tokens": 834620028.0, + "step": 21872 + }, + { + "epoch": 2.782470423610228, + "ewc_loss": 0.08187929540872574, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000445257785031572, + "grad_norm": 9.767417907714844, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8530546426773071, + "num_tokens": 834657351.0, + "step": 21873 + }, + { + "epoch": 2.7825976338888183, + "ewc_loss": 0.08191187679767609, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004431421693880111, + "grad_norm": 9.765876770019531, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8792260885238647, + "num_tokens": 834693687.0, + "step": 21874 + }, + { + "epoch": 2.782724844167409, + "ewc_loss": 0.08189839869737625, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004430074186529964, + "grad_norm": 9.628822326660156, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.862906277179718, + "num_tokens": 834736171.0, + "step": 21875 + }, + { + "epoch": 2.7828520544459994, + "ewc_loss": 0.08202613890171051, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000446726189693436, + "grad_norm": 9.756978034973145, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8775119781494141, + "num_tokens": 834775699.0, + "step": 21876 + }, + { + "epoch": 2.78297926472459, + "ewc_loss": 0.08172781765460968, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044130158494226635, + "grad_norm": 9.652606010437012, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8577607870101929, + "num_tokens": 834816053.0, + "step": 21877 + }, + { + "epoch": 2.78310647500318, + "ewc_loss": 0.08217212557792664, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004457447212189436, + "grad_norm": 9.727303504943848, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.859584391117096, + "num_tokens": 834857125.0, + "step": 21878 + }, + { + "epoch": 2.783233685281771, + "ewc_loss": 0.08176644891500473, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004416879382915795, + "grad_norm": 9.659863471984863, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8769254088401794, + "num_tokens": 834895676.0, + "step": 21879 + }, + { + "epoch": 2.783360895560361, + "ewc_loss": 0.0821746215224266, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000445769663201645, + "grad_norm": 9.712489128112793, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8692069053649902, + "num_tokens": 834927748.0, + "step": 21880 + }, + { + "epoch": 2.783488105838952, + "ewc_loss": 0.08138664066791534, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004403312341310084, + "grad_norm": 9.587040901184082, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8609758615493774, + "num_tokens": 834966722.0, + "step": 21881 + }, + { + "epoch": 2.783615316117542, + "ewc_loss": 0.08201566338539124, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044662150321528316, + "grad_norm": 9.757235527038574, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8652847409248352, + "num_tokens": 835003770.0, + "step": 21882 + }, + { + "epoch": 2.783742526396133, + "ewc_loss": 0.08154137432575226, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000439437193563208, + "grad_norm": 9.520023345947266, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.873650312423706, + "num_tokens": 835041504.0, + "step": 21883 + }, + { + "epoch": 2.783869736674723, + "ewc_loss": 0.08239766955375671, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004480001225601882, + "grad_norm": 9.710978507995605, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8661187887191772, + "num_tokens": 835075179.0, + "step": 21884 + }, + { + "epoch": 2.7839969469533137, + "ewc_loss": 0.08162081241607666, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044023158261552453, + "grad_norm": 9.600218772888184, + "learning_rate": 1e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8539389371871948, + "num_tokens": 835113080.0, + "step": 21885 + }, + { + "epoch": 2.784124157231904, + "ewc_loss": 0.08241832256317139, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004482066142372787, + "grad_norm": 9.672764778137207, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8644465208053589, + "num_tokens": 835150586.0, + "step": 21886 + }, + { + "epoch": 2.7842513675104947, + "ewc_loss": 0.08169569075107574, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004409803659655154, + "grad_norm": 9.627276420593262, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8600975275039673, + "num_tokens": 835184049.0, + "step": 21887 + }, + { + "epoch": 2.7843785777890853, + "ewc_loss": 0.08219744265079498, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004459978372324258, + "grad_norm": 9.712138175964355, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8594558835029602, + "num_tokens": 835222745.0, + "step": 21888 + }, + { + "epoch": 2.784505788067676, + "ewc_loss": 0.08191026002168655, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044312604586593807, + "grad_norm": 9.593506813049316, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8515838980674744, + "num_tokens": 835263017.0, + "step": 21889 + }, + { + "epoch": 2.7846329983462663, + "ewc_loss": 0.08217954635620117, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044581887777894735, + "grad_norm": 9.699413299560547, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8780959248542786, + "num_tokens": 835303916.0, + "step": 21890 + }, + { + "epoch": 2.784760208624857, + "ewc_loss": 0.08177477866411209, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044177123345434666, + "grad_norm": 9.61129379272461, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8787785768508911, + "num_tokens": 835343148.0, + "step": 21891 + }, + { + "epoch": 2.7848874189034474, + "ewc_loss": 0.08200246095657349, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044404808431863785, + "grad_norm": 9.628825187683105, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8614230155944824, + "num_tokens": 835382337.0, + "step": 21892 + }, + { + "epoch": 2.785014629182038, + "ewc_loss": 0.08177033066749573, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044172676280140877, + "grad_norm": 9.643924713134766, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8587578535079956, + "num_tokens": 835423645.0, + "step": 21893 + }, + { + "epoch": 2.7851418394606284, + "ewc_loss": 0.08177568018436432, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044178019743412733, + "grad_norm": 9.598472595214844, + "learning_rate": 1e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8348816633224487, + "num_tokens": 835464590.0, + "step": 21894 + }, + { + "epoch": 2.785269049739219, + "ewc_loss": 0.08180785179138184, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044210191117599607, + "grad_norm": 9.54055404663086, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8886236548423767, + "num_tokens": 835496376.0, + "step": 21895 + }, + { + "epoch": 2.7853962600178095, + "ewc_loss": 0.08229708671569824, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044455286115407944, + "grad_norm": 9.632669448852539, + "learning_rate": 1e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.849360466003418, + "num_tokens": 835541231.0, + "step": 21896 + }, + { + "epoch": 2.7855234702964, + "ewc_loss": 0.08196593821048737, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044124136911705136, + "grad_norm": 9.551687240600586, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8673526644706726, + "num_tokens": 835584038.0, + "step": 21897 + }, + { + "epoch": 2.7856506805749905, + "ewc_loss": 0.08218741416931152, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004458975454326719, + "grad_norm": 9.66128921508789, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8585046529769897, + "num_tokens": 835627189.0, + "step": 21898 + }, + { + "epoch": 2.785777890853581, + "ewc_loss": 0.08170603960752487, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044108383008278906, + "grad_norm": 9.594083786010742, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8646718263626099, + "num_tokens": 835659701.0, + "step": 21899 + }, + { + "epoch": 2.7859051011321716, + "ewc_loss": 0.08209338784217834, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004449572879821062, + "grad_norm": 9.68898868560791, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8706996440887451, + "num_tokens": 835694045.0, + "step": 21900 + }, + { + "epoch": 2.786032311410762, + "ewc_loss": 0.08169989287853241, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000441022333689034, + "grad_norm": 9.574623107910156, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8618404269218445, + "num_tokens": 835732118.0, + "step": 21901 + }, + { + "epoch": 2.7861595216893527, + "ewc_loss": 0.08229857683181763, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004470091953407973, + "grad_norm": 9.72193717956543, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8743546605110168, + "num_tokens": 835772700.0, + "step": 21902 + }, + { + "epoch": 2.7862867319679427, + "ewc_loss": 0.08157593756914139, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004397828015498817, + "grad_norm": 16.25815773010254, + "learning_rate": 1e-06, + "loss": 0.543, + "mean_token_accuracy": 0.847454845905304, + "num_tokens": 835811206.0, + "step": 21903 + }, + { + "epoch": 2.7864139422465337, + "ewc_loss": 0.0909922868013382, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0005339462659321725, + "grad_norm": 10.532264709472656, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8679514527320862, + "num_tokens": 835844381.0, + "step": 21904 + }, + { + "epoch": 2.786541152525124, + "ewc_loss": 0.08447294682264328, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004687528999056667, + "grad_norm": 10.08725643157959, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.856537938117981, + "num_tokens": 835879926.0, + "step": 21905 + }, + { + "epoch": 2.7866683628037148, + "ewc_loss": 0.08247695863246918, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044879302731715143, + "grad_norm": 9.718043327331543, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8528730273246765, + "num_tokens": 835924014.0, + "step": 21906 + }, + { + "epoch": 2.786795573082305, + "ewc_loss": 0.08574231714010239, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004814466228708625, + "grad_norm": 10.1654052734375, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8570235967636108, + "num_tokens": 835956054.0, + "step": 21907 + }, + { + "epoch": 2.786922783360896, + "ewc_loss": 0.08228808641433716, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004469042469281703, + "grad_norm": 9.685335159301758, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8698023557662964, + "num_tokens": 835988885.0, + "step": 21908 + }, + { + "epoch": 2.787049993639486, + "ewc_loss": 0.08432076871395111, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004672311479225755, + "grad_norm": 10.008752822875977, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8628729581832886, + "num_tokens": 836028885.0, + "step": 21909 + }, + { + "epoch": 2.7871772039180764, + "ewc_loss": 0.08231694251298904, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044719286961480975, + "grad_norm": 9.644783020019531, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8775842189788818, + "num_tokens": 836067340.0, + "step": 21910 + }, + { + "epoch": 2.787304414196667, + "ewc_loss": 0.08363546431064606, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046281947288662195, + "grad_norm": 9.96174430847168, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.862810492515564, + "num_tokens": 836104432.0, + "step": 21911 + }, + { + "epoch": 2.7874316244752575, + "ewc_loss": 0.08197735995054245, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044379703467711806, + "grad_norm": 9.54114818572998, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8597440123558044, + "num_tokens": 836141197.0, + "step": 21912 + }, + { + "epoch": 2.787558834753848, + "ewc_loss": 0.083720862865448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004636735247913748, + "grad_norm": 9.965394020080566, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.876563310623169, + "num_tokens": 836176949.0, + "step": 21913 + }, + { + "epoch": 2.7876860450324386, + "ewc_loss": 0.08153684437274933, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044183331192471087, + "grad_norm": 9.614338874816895, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8628026247024536, + "num_tokens": 836206484.0, + "step": 21914 + }, + { + "epoch": 2.787813255311029, + "ewc_loss": 0.08315865695476532, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045805139234289527, + "grad_norm": 9.884111404418945, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8698965907096863, + "num_tokens": 836243056.0, + "step": 21915 + }, + { + "epoch": 2.7879404655896196, + "ewc_loss": 0.08177362382411957, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044420105405151844, + "grad_norm": 9.595821380615234, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.858871340751648, + "num_tokens": 836281154.0, + "step": 21916 + }, + { + "epoch": 2.78806767586821, + "ewc_loss": 0.08277390897274017, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004542039823718369, + "grad_norm": 9.809989929199219, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.863858163356781, + "num_tokens": 836319895.0, + "step": 21917 + }, + { + "epoch": 2.7881948861468007, + "ewc_loss": 0.08153215050697327, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004417863383423537, + "grad_norm": 9.54701042175293, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8702702522277832, + "num_tokens": 836357227.0, + "step": 21918 + }, + { + "epoch": 2.788322096425391, + "ewc_loss": 0.08269326388835907, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045339742791838944, + "grad_norm": 9.814010620117188, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.855814516544342, + "num_tokens": 836398442.0, + "step": 21919 + }, + { + "epoch": 2.7884493067039817, + "ewc_loss": 0.08163401484489441, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044280500151216984, + "grad_norm": 9.550018310546875, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8724745512008667, + "num_tokens": 836436660.0, + "step": 21920 + }, + { + "epoch": 2.7885765169825723, + "ewc_loss": 0.08250002562999725, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004514650790952146, + "grad_norm": 9.71990966796875, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8585798144340515, + "num_tokens": 836479180.0, + "step": 21921 + }, + { + "epoch": 2.788703727261163, + "ewc_loss": 0.08183608949184418, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004423843347467482, + "grad_norm": 9.597393035888672, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8711743950843811, + "num_tokens": 836518874.0, + "step": 21922 + }, + { + "epoch": 2.7888309375397533, + "ewc_loss": 0.08235834538936615, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004500482464209199, + "grad_norm": 9.758400917053223, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8597095012664795, + "num_tokens": 836556910.0, + "step": 21923 + }, + { + "epoch": 2.788958147818344, + "ewc_loss": 0.08175402134656906, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044400503975339234, + "grad_norm": 9.581915855407715, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8760727047920227, + "num_tokens": 836598151.0, + "step": 21924 + }, + { + "epoch": 2.7890853580969344, + "ewc_loss": 0.08225791901350021, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004490440187510103, + "grad_norm": 9.73714542388916, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8557313084602356, + "num_tokens": 836637952.0, + "step": 21925 + }, + { + "epoch": 2.789212568375525, + "ewc_loss": 0.08173167705535889, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004413402348291129, + "grad_norm": 9.539307594299316, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8774112462997437, + "num_tokens": 836677138.0, + "step": 21926 + }, + { + "epoch": 2.7893397786541154, + "ewc_loss": 0.0826728567481041, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045075200614519417, + "grad_norm": 9.691349983215332, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8706037402153015, + "num_tokens": 836716880.0, + "step": 21927 + }, + { + "epoch": 2.7894669889327055, + "ewc_loss": 0.08159209787845612, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044238578993827105, + "grad_norm": 9.608020782470703, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8785284161567688, + "num_tokens": 836752572.0, + "step": 21928 + }, + { + "epoch": 2.7895941992112965, + "ewc_loss": 0.08218367397785187, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044830155093222857, + "grad_norm": 9.715446472167969, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8595992922782898, + "num_tokens": 836794321.0, + "step": 21929 + }, + { + "epoch": 2.7897214094898866, + "ewc_loss": 0.08182811737060547, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044474596506915987, + "grad_norm": 9.584417343139648, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8768495917320251, + "num_tokens": 836829183.0, + "step": 21930 + }, + { + "epoch": 2.7898486197684775, + "ewc_loss": 0.08221253752708435, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044859026093035936, + "grad_norm": 9.73116397857666, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8512941598892212, + "num_tokens": 836864644.0, + "step": 21931 + }, + { + "epoch": 2.7899758300470676, + "ewc_loss": 0.08164101094007492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044287493801675737, + "grad_norm": 9.614879608154297, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8761297464370728, + "num_tokens": 836895384.0, + "step": 21932 + }, + { + "epoch": 2.7901030403256586, + "ewc_loss": 0.08252248913049698, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044924832764081657, + "grad_norm": 9.7210693359375, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8669440150260925, + "num_tokens": 836943336.0, + "step": 21933 + }, + { + "epoch": 2.7902302506042487, + "ewc_loss": 0.08180217444896698, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044204521691426635, + "grad_norm": 9.511383056640625, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.880062460899353, + "num_tokens": 836980708.0, + "step": 21934 + }, + { + "epoch": 2.790357460882839, + "ewc_loss": 0.0823545753955841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004500105860643089, + "grad_norm": 9.73454475402832, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8684190511703491, + "num_tokens": 837015738.0, + "step": 21935 + }, + { + "epoch": 2.7904846711614297, + "ewc_loss": 0.08182942867279053, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004423177451826632, + "grad_norm": 9.55760669708252, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8710079789161682, + "num_tokens": 837053014.0, + "step": 21936 + }, + { + "epoch": 2.7906118814400203, + "ewc_loss": 0.0825781524181366, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045224634231999516, + "grad_norm": 9.748475074768066, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8600277900695801, + "num_tokens": 837083579.0, + "step": 21937 + }, + { + "epoch": 2.790739091718611, + "ewc_loss": 0.0817873477935791, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004418969328980893, + "grad_norm": 9.479507446289062, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8741338849067688, + "num_tokens": 837126201.0, + "step": 21938 + }, + { + "epoch": 2.7908663019972013, + "ewc_loss": 0.08277693390846252, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000454234192147851, + "grad_norm": 9.811253547668457, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8738497495651245, + "num_tokens": 837162400.0, + "step": 21939 + }, + { + "epoch": 2.790993512275792, + "ewc_loss": 0.08135194331407547, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043998428736813366, + "grad_norm": 9.545381546020508, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8686827421188354, + "num_tokens": 837205733.0, + "step": 21940 + }, + { + "epoch": 2.7911207225543824, + "ewc_loss": 0.08274118602275848, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045387665159069, + "grad_norm": 9.797361373901367, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8546148538589478, + "num_tokens": 837243838.0, + "step": 21941 + }, + { + "epoch": 2.791247932832973, + "ewc_loss": 0.08146334439516068, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044109829468652606, + "grad_norm": 9.563692092895508, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8593695163726807, + "num_tokens": 837280936.0, + "step": 21942 + }, + { + "epoch": 2.7913751431115634, + "ewc_loss": 0.0827537253499031, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045400208909995854, + "grad_norm": 9.788314819335938, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8646366596221924, + "num_tokens": 837321790.0, + "step": 21943 + }, + { + "epoch": 2.791502353390154, + "ewc_loss": 0.08174507319927216, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044147419976070523, + "grad_norm": 9.54593563079834, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8690242767333984, + "num_tokens": 837362023.0, + "step": 21944 + }, + { + "epoch": 2.7916295636687445, + "ewc_loss": 0.08265730738639832, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045303793740458786, + "grad_norm": 9.787117004394531, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8724064826965332, + "num_tokens": 837404876.0, + "step": 21945 + }, + { + "epoch": 2.791756773947335, + "ewc_loss": 0.08148642629384995, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044132908806204796, + "grad_norm": 9.5144624710083, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8575775623321533, + "num_tokens": 837442899.0, + "step": 21946 + }, + { + "epoch": 2.7918839842259255, + "ewc_loss": 0.08315601944923401, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004555836203508079, + "grad_norm": 9.87551498413086, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8726804852485657, + "num_tokens": 837477611.0, + "step": 21947 + }, + { + "epoch": 2.792011194504516, + "ewc_loss": 0.0813152939081192, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004396178119350225, + "grad_norm": 9.600436210632324, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8506330251693726, + "num_tokens": 837515928.0, + "step": 21948 + }, + { + "epoch": 2.7921384047831066, + "ewc_loss": 0.08281688392162323, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045463372953236103, + "grad_norm": 9.829693794250488, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8649570345878601, + "num_tokens": 837553042.0, + "step": 21949 + }, + { + "epoch": 2.792265615061697, + "ewc_loss": 0.08124135434627533, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004388783418107778, + "grad_norm": 9.465688705444336, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8718074560165405, + "num_tokens": 837589749.0, + "step": 21950 + }, + { + "epoch": 2.7923928253402877, + "ewc_loss": 0.08318813890218735, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000458346214145422, + "grad_norm": 9.885250091552734, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8827260732650757, + "num_tokens": 837631104.0, + "step": 21951 + }, + { + "epoch": 2.792520035618878, + "ewc_loss": 0.08113802969455719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043784515582956374, + "grad_norm": 9.455401420593262, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8743884563446045, + "num_tokens": 837668173.0, + "step": 21952 + }, + { + "epoch": 2.7926472458974683, + "ewc_loss": 0.08312415331602097, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045770639553666115, + "grad_norm": 9.870489120483398, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8714882135391235, + "num_tokens": 837704282.0, + "step": 21953 + }, + { + "epoch": 2.7927744561760592, + "ewc_loss": 0.08116090297698975, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004380738828331232, + "grad_norm": 9.4839506149292, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8549845814704895, + "num_tokens": 837750790.0, + "step": 21954 + }, + { + "epoch": 2.7929016664546493, + "ewc_loss": 0.083208829164505, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045855308417230844, + "grad_norm": 9.79403305053711, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8617929220199585, + "num_tokens": 837792041.0, + "step": 21955 + }, + { + "epoch": 2.7930288767332403, + "ewc_loss": 0.08148283511400223, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412932030390948, + "grad_norm": 9.591645240783691, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8717555999755859, + "num_tokens": 837818645.0, + "step": 21956 + }, + { + "epoch": 2.7931560870118304, + "ewc_loss": 0.08281237632036209, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045458858949132264, + "grad_norm": 9.755722999572754, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8739526271820068, + "num_tokens": 837859816.0, + "step": 21957 + }, + { + "epoch": 2.793283297290421, + "ewc_loss": 0.08175118267536163, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004439766926225275, + "grad_norm": 9.572235107421875, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8595995903015137, + "num_tokens": 837903970.0, + "step": 21958 + }, + { + "epoch": 2.7934105075690114, + "ewc_loss": 0.08275218307971954, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004539867222774774, + "grad_norm": 9.706884384155273, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8638410568237305, + "num_tokens": 837941158.0, + "step": 21959 + }, + { + "epoch": 2.793537717847602, + "ewc_loss": 0.08182644844055176, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004447293176781386, + "grad_norm": 9.635489463806152, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8703168630599976, + "num_tokens": 837978157.0, + "step": 21960 + }, + { + "epoch": 2.7936649281261925, + "ewc_loss": 0.0822843536734581, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044930839794687927, + "grad_norm": 9.70073127746582, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8761113286018372, + "num_tokens": 838017871.0, + "step": 21961 + }, + { + "epoch": 2.793792138404783, + "ewc_loss": 0.08184100687503815, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004448748950380832, + "grad_norm": 9.617952346801758, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.87659752368927, + "num_tokens": 838050521.0, + "step": 21962 + }, + { + "epoch": 2.7939193486833735, + "ewc_loss": 0.08210211992263794, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044748603249900043, + "grad_norm": 9.674338340759277, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8726462721824646, + "num_tokens": 838084450.0, + "step": 21963 + }, + { + "epoch": 2.794046558961964, + "ewc_loss": 0.08203615248203278, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000446826423285529, + "grad_norm": 9.6939058303833, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8585472702980042, + "num_tokens": 838121505.0, + "step": 21964 + }, + { + "epoch": 2.7941737692405546, + "ewc_loss": 0.08182793855667114, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004447442770469934, + "grad_norm": 9.642946243286133, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8627776503562927, + "num_tokens": 838166148.0, + "step": 21965 + }, + { + "epoch": 2.794300979519145, + "ewc_loss": 0.08209206163883209, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044738544966094196, + "grad_norm": 9.702731132507324, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8715119957923889, + "num_tokens": 838203578.0, + "step": 21966 + }, + { + "epoch": 2.7944281897977357, + "ewc_loss": 0.08186003565788269, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044506523408927023, + "grad_norm": 9.659246444702148, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8673070669174194, + "num_tokens": 838240702.0, + "step": 21967 + }, + { + "epoch": 2.794555400076326, + "ewc_loss": 0.08198952674865723, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004463600635062903, + "grad_norm": 9.736595153808594, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8718020915985107, + "num_tokens": 838275639.0, + "step": 21968 + }, + { + "epoch": 2.7946826103549167, + "ewc_loss": 0.08163665980100632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000442831456894055, + "grad_norm": 9.619318008422852, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8663854598999023, + "num_tokens": 838310045.0, + "step": 21969 + }, + { + "epoch": 2.7948098206335072, + "ewc_loss": 0.08205047994852066, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004469696432352066, + "grad_norm": 9.639325141906738, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8665776252746582, + "num_tokens": 838351571.0, + "step": 21970 + }, + { + "epoch": 2.7949370309120978, + "ewc_loss": 0.08170032501220703, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044346804497763515, + "grad_norm": 9.613675117492676, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8777268528938293, + "num_tokens": 838393804.0, + "step": 21971 + }, + { + "epoch": 2.7950642411906883, + "ewc_loss": 0.08189615607261658, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004454263544175774, + "grad_norm": 9.563154220581055, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.866273045539856, + "num_tokens": 838435516.0, + "step": 21972 + }, + { + "epoch": 2.795191451469279, + "ewc_loss": 0.08201112598180771, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044657610123977065, + "grad_norm": 9.71250057220459, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8723890781402588, + "num_tokens": 838465425.0, + "step": 21973 + }, + { + "epoch": 2.7953186617478694, + "ewc_loss": 0.08146876841783524, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004411525442264974, + "grad_norm": 9.612418174743652, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8623901605606079, + "num_tokens": 838512832.0, + "step": 21974 + }, + { + "epoch": 2.79544587202646, + "ewc_loss": 0.08216816931962967, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000448146543931216, + "grad_norm": 9.665820121765137, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8731670379638672, + "num_tokens": 838552806.0, + "step": 21975 + }, + { + "epoch": 2.79557308230505, + "ewc_loss": 0.08184478431940079, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044247129699215293, + "grad_norm": 9.656457901000977, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8626915216445923, + "num_tokens": 838588592.0, + "step": 21976 + }, + { + "epoch": 2.795700292583641, + "ewc_loss": 0.08181649446487427, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004446298407856375, + "grad_norm": 9.634973526000977, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8624849319458008, + "num_tokens": 838629402.0, + "step": 21977 + }, + { + "epoch": 2.795827502862231, + "ewc_loss": 0.08163486421108246, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044281347072683275, + "grad_norm": 9.544048309326172, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8570518493652344, + "num_tokens": 838676580.0, + "step": 21978 + }, + { + "epoch": 2.795954713140822, + "ewc_loss": 0.08183575421571732, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044482239172793925, + "grad_norm": 9.666141510009766, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8592323064804077, + "num_tokens": 838715773.0, + "step": 21979 + }, + { + "epoch": 2.796081923419412, + "ewc_loss": 0.08157959580421448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044226081809028983, + "grad_norm": 9.578323364257812, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8551594018936157, + "num_tokens": 838750270.0, + "step": 21980 + }, + { + "epoch": 2.796209133698003, + "ewc_loss": 0.08193940669298172, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044585889554582536, + "grad_norm": 9.550661087036133, + "learning_rate": 1e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8444530963897705, + "num_tokens": 838794823.0, + "step": 21981 + }, + { + "epoch": 2.796336343976593, + "ewc_loss": 0.08203741908073425, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004468389961402863, + "grad_norm": 9.679993629455566, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8559294939041138, + "num_tokens": 838828901.0, + "step": 21982 + }, + { + "epoch": 2.7964635542551837, + "ewc_loss": 0.0817178338766098, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044364313362166286, + "grad_norm": 9.546442985534668, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8576399087905884, + "num_tokens": 838872648.0, + "step": 21983 + }, + { + "epoch": 2.796590764533774, + "ewc_loss": 0.0823764055967331, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045022895210422575, + "grad_norm": 9.689496994018555, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8770126104354858, + "num_tokens": 838905150.0, + "step": 21984 + }, + { + "epoch": 2.7967179748123647, + "ewc_loss": 0.08157666027545929, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000442231452325359, + "grad_norm": 9.574589729309082, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8649429082870483, + "num_tokens": 838949027.0, + "step": 21985 + }, + { + "epoch": 2.7968451850909553, + "ewc_loss": 0.08273966610431671, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044897868065163493, + "grad_norm": 9.657599449157715, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8664380311965942, + "num_tokens": 838987320.0, + "step": 21986 + }, + { + "epoch": 2.796972395369546, + "ewc_loss": 0.08168625831604004, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004433274152688682, + "grad_norm": 9.502593994140625, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8611869812011719, + "num_tokens": 839027396.0, + "step": 21987 + }, + { + "epoch": 2.7970996056481363, + "ewc_loss": 0.08252473175525665, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004517121415119618, + "grad_norm": 9.69761848449707, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8709880709648132, + "num_tokens": 839069779.0, + "step": 21988 + }, + { + "epoch": 2.797226815926727, + "ewc_loss": 0.08163740485906601, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004428388783708215, + "grad_norm": 9.548355102539062, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8706951141357422, + "num_tokens": 839110919.0, + "step": 21989 + }, + { + "epoch": 2.7973540262053174, + "ewc_loss": 0.08240654319524765, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004505302640609443, + "grad_norm": 9.734527587890625, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8796761631965637, + "num_tokens": 839143689.0, + "step": 21990 + }, + { + "epoch": 2.797481236483908, + "ewc_loss": 0.08160050958395004, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044246992911212146, + "grad_norm": 9.52983570098877, + "learning_rate": 1e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.845158576965332, + "num_tokens": 839181274.0, + "step": 21991 + }, + { + "epoch": 2.7976084467624984, + "ewc_loss": 0.08274286985397339, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004538935609161854, + "grad_norm": 9.805959701538086, + "learning_rate": 1e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8477522730827332, + "num_tokens": 839217038.0, + "step": 21992 + }, + { + "epoch": 2.797735657041089, + "ewc_loss": 0.08133025467395782, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043976734741590917, + "grad_norm": 9.431587219238281, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8658323287963867, + "num_tokens": 839260104.0, + "step": 21993 + }, + { + "epoch": 2.7978628673196795, + "ewc_loss": 0.08317919820547104, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004582568071782589, + "grad_norm": 9.810606956481934, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8677386045455933, + "num_tokens": 839294328.0, + "step": 21994 + }, + { + "epoch": 2.79799007759827, + "ewc_loss": 0.08133150637149811, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043977994937449694, + "grad_norm": 9.473176002502441, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8642138242721558, + "num_tokens": 839326662.0, + "step": 21995 + }, + { + "epoch": 2.7981172878768605, + "ewc_loss": 0.08315664529800415, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004580313106998801, + "grad_norm": 9.85610580444336, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8662089109420776, + "num_tokens": 839355997.0, + "step": 21996 + }, + { + "epoch": 2.798244498155451, + "ewc_loss": 0.08148178458213806, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412826965562999, + "grad_norm": 9.508787155151367, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8709058165550232, + "num_tokens": 839396596.0, + "step": 21997 + }, + { + "epoch": 2.7983717084340416, + "ewc_loss": 0.08307022601366043, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045716710155829787, + "grad_norm": 9.829508781433105, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8675286769866943, + "num_tokens": 839432251.0, + "step": 21998 + }, + { + "epoch": 2.798498918712632, + "ewc_loss": 0.0814613550901413, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004410783585626632, + "grad_norm": 9.514628410339355, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8750395774841309, + "num_tokens": 839465987.0, + "step": 21999 + }, + { + "epoch": 2.7986261289912227, + "ewc_loss": 0.08310914784669876, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004551149031613022, + "grad_norm": 9.779982566833496, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.867109477519989, + "num_tokens": 839506147.0, + "step": 22000 + }, + { + "epoch": 2.7987533392698127, + "ewc_loss": 0.08192840218544006, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004408660752233118, + "grad_norm": 9.490496635437012, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8566076755523682, + "num_tokens": 839544620.0, + "step": 22001 + }, + { + "epoch": 2.7988805495484037, + "ewc_loss": 0.08310829848051071, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004551064339466393, + "grad_norm": 9.818296432495117, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8553707599639893, + "num_tokens": 839583217.0, + "step": 22002 + }, + { + "epoch": 2.799007759826994, + "ewc_loss": 0.08162978291511536, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004403212806209922, + "grad_norm": 9.493854522705078, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8851675987243652, + "num_tokens": 839621283.0, + "step": 22003 + }, + { + "epoch": 2.7991349701055848, + "ewc_loss": 0.08305027335882187, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045452616177499294, + "grad_norm": 9.821138381958008, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8603698015213013, + "num_tokens": 839658072.0, + "step": 22004 + }, + { + "epoch": 2.799262180384175, + "ewc_loss": 0.0817626416683197, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004416498413775116, + "grad_norm": 9.494452476501465, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8471394181251526, + "num_tokens": 839700926.0, + "step": 22005 + }, + { + "epoch": 2.799389390662766, + "ewc_loss": 0.08311901241540909, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045521356514655054, + "grad_norm": 9.841700553894043, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8734860420227051, + "num_tokens": 839736160.0, + "step": 22006 + }, + { + "epoch": 2.799516600941356, + "ewc_loss": 0.08165782690048218, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044060166692361236, + "grad_norm": 9.562726974487305, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8813427686691284, + "num_tokens": 839776834.0, + "step": 22007 + }, + { + "epoch": 2.7996438112199464, + "ewc_loss": 0.08282074332237244, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004522308590821922, + "grad_norm": 9.746274948120117, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8581138253211975, + "num_tokens": 839818156.0, + "step": 22008 + }, + { + "epoch": 2.799771021498537, + "ewc_loss": 0.08165770769119263, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004406005027703941, + "grad_norm": 9.542020797729492, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8597685098648071, + "num_tokens": 839857133.0, + "step": 22009 + }, + { + "epoch": 2.7998982317771275, + "ewc_loss": 0.08280444890260696, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004520679358392954, + "grad_norm": 9.778144836425781, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.848437488079071, + "num_tokens": 839893129.0, + "step": 22010 + }, + { + "epoch": 2.800025442055718, + "ewc_loss": 0.08171048760414124, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044112830073572695, + "grad_norm": 9.565353393554688, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8681937456130981, + "num_tokens": 839924622.0, + "step": 22011 + }, + { + "epoch": 2.8001526523343085, + "ewc_loss": 0.08242417871952057, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000448265258455649, + "grad_norm": 9.634596824645996, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8828516006469727, + "num_tokens": 839961276.0, + "step": 22012 + }, + { + "epoch": 2.800279862612899, + "ewc_loss": 0.08201851695775986, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044420859194360673, + "grad_norm": 9.57381534576416, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8544914722442627, + "num_tokens": 840002554.0, + "step": 22013 + }, + { + "epoch": 2.8004070728914896, + "ewc_loss": 0.08228059113025665, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004468293336685747, + "grad_norm": 9.649062156677246, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8644084930419922, + "num_tokens": 840041642.0, + "step": 22014 + }, + { + "epoch": 2.80053428317008, + "ewc_loss": 0.08193542063236237, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004433775902725756, + "grad_norm": 9.582769393920898, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8733406662940979, + "num_tokens": 840083532.0, + "step": 22015 + }, + { + "epoch": 2.8006614934486707, + "ewc_loss": 0.0824001282453537, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004480247152969241, + "grad_norm": 9.608177185058594, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8621662259101868, + "num_tokens": 840124221.0, + "step": 22016 + }, + { + "epoch": 2.800788703727261, + "ewc_loss": 0.08218753337860107, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044589879689738154, + "grad_norm": 9.54743480682373, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8697149157524109, + "num_tokens": 840162309.0, + "step": 22017 + }, + { + "epoch": 2.8009159140058517, + "ewc_loss": 0.08230514079332352, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004470748535823077, + "grad_norm": 9.704354286193848, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8655081987380981, + "num_tokens": 840194778.0, + "step": 22018 + }, + { + "epoch": 2.8010431242844422, + "ewc_loss": 0.08208655565977097, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044488898129202425, + "grad_norm": 9.610313415527344, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8811033964157104, + "num_tokens": 840225081.0, + "step": 22019 + }, + { + "epoch": 2.8011703345630328, + "ewc_loss": 0.08274203538894653, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004490024293772876, + "grad_norm": 9.661312103271484, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8640027046203613, + "num_tokens": 840267698.0, + "step": 22020 + }, + { + "epoch": 2.8012975448416233, + "ewc_loss": 0.08199727535247803, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044399622129276395, + "grad_norm": 9.577452659606934, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8577021360397339, + "num_tokens": 840303255.0, + "step": 22021 + }, + { + "epoch": 2.801424755120214, + "ewc_loss": 0.08264659345149994, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044804791104979813, + "grad_norm": 9.709283828735352, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8636552691459656, + "num_tokens": 840344616.0, + "step": 22022 + }, + { + "epoch": 2.8015519653988044, + "ewc_loss": 0.08174584805965424, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004414818831719458, + "grad_norm": 9.565256118774414, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8663633465766907, + "num_tokens": 840383293.0, + "step": 22023 + }, + { + "epoch": 2.801679175677395, + "ewc_loss": 0.0822979062795639, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004470025305636227, + "grad_norm": 9.630889892578125, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8543955683708191, + "num_tokens": 840424583.0, + "step": 22024 + }, + { + "epoch": 2.8018063859559854, + "ewc_loss": 0.0820869505405426, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004448929103091359, + "grad_norm": 9.571386337280273, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8569718599319458, + "num_tokens": 840458752.0, + "step": 22025 + }, + { + "epoch": 2.8019335962345755, + "ewc_loss": 0.08247721940279007, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004487956175580621, + "grad_norm": 9.644633293151855, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.870988130569458, + "num_tokens": 840498441.0, + "step": 22026 + }, + { + "epoch": 2.8020608065131665, + "ewc_loss": 0.08214546740055084, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044547810102812946, + "grad_norm": 9.570141792297363, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8753894567489624, + "num_tokens": 840537838.0, + "step": 22027 + }, + { + "epoch": 2.8021880167917566, + "ewc_loss": 0.08252402395009995, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004492636944632977, + "grad_norm": 9.708406448364258, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8536678552627563, + "num_tokens": 840571770.0, + "step": 22028 + }, + { + "epoch": 2.8023152270703475, + "ewc_loss": 0.08204515278339386, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044447495019994676, + "grad_norm": 9.613444328308105, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8588110208511353, + "num_tokens": 840607559.0, + "step": 22029 + }, + { + "epoch": 2.8024424373489376, + "ewc_loss": 0.08231796324253082, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044964448898099363, + "grad_norm": 9.65836238861084, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8503393530845642, + "num_tokens": 840650156.0, + "step": 22030 + }, + { + "epoch": 2.8025696476275286, + "ewc_loss": 0.08183357864618301, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004448006220627576, + "grad_norm": 9.564292907714844, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8561055660247803, + "num_tokens": 840686590.0, + "step": 22031 + }, + { + "epoch": 2.8026968579061187, + "ewc_loss": 0.08265069872140884, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045053043868392706, + "grad_norm": 9.754064559936523, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8556398153305054, + "num_tokens": 840724445.0, + "step": 22032 + }, + { + "epoch": 2.802824068184709, + "ewc_loss": 0.0815671980381012, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004421368066687137, + "grad_norm": 9.483182907104492, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.878949761390686, + "num_tokens": 840765047.0, + "step": 22033 + }, + { + "epoch": 2.8029512784632997, + "ewc_loss": 0.08311024308204651, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004551258170977235, + "grad_norm": 9.932332992553711, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8519484400749207, + "num_tokens": 840802375.0, + "step": 22034 + }, + { + "epoch": 2.8030784887418903, + "ewc_loss": 0.08132871240377426, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043731057667173445, + "grad_norm": 9.496594429016113, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8639439344406128, + "num_tokens": 840839994.0, + "step": 22035 + }, + { + "epoch": 2.803205699020481, + "ewc_loss": 0.08327990770339966, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004568224831018597, + "grad_norm": 9.77950382232666, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8621191382408142, + "num_tokens": 840877144.0, + "step": 22036 + }, + { + "epoch": 2.8033329092990713, + "ewc_loss": 0.0814984142780304, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004390076210256666, + "grad_norm": 9.548073768615723, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8552014827728271, + "num_tokens": 840918478.0, + "step": 22037 + }, + { + "epoch": 2.803460119577662, + "ewc_loss": 0.08268830180168152, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045090646017342806, + "grad_norm": 9.746143341064453, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8756226897239685, + "num_tokens": 840958484.0, + "step": 22038 + }, + { + "epoch": 2.8035873298562524, + "ewc_loss": 0.0820019543170929, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044160158722661436, + "grad_norm": 9.511443138122559, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8675013780593872, + "num_tokens": 841000035.0, + "step": 22039 + }, + { + "epoch": 2.803714540134843, + "ewc_loss": 0.08308956027030945, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045247763046063483, + "grad_norm": 9.775382995605469, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8590364456176758, + "num_tokens": 841035836.0, + "step": 22040 + }, + { + "epoch": 2.8038417504134334, + "ewc_loss": 0.08192049711942673, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044078700011596084, + "grad_norm": 9.544317245483398, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8658879995346069, + "num_tokens": 841069295.0, + "step": 22041 + }, + { + "epoch": 2.803968960692024, + "ewc_loss": 0.08308872580528259, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004524692485574633, + "grad_norm": 9.7019624710083, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8597210645675659, + "num_tokens": 841108341.0, + "step": 22042 + }, + { + "epoch": 2.8040961709706145, + "ewc_loss": 0.08191797137260437, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004432031128089875, + "grad_norm": 9.644998550415039, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8748854398727417, + "num_tokens": 841142945.0, + "step": 22043 + }, + { + "epoch": 2.804223381249205, + "ewc_loss": 0.08264337480068207, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004480157804209739, + "grad_norm": 9.7111177444458, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8774783611297607, + "num_tokens": 841174137.0, + "step": 22044 + }, + { + "epoch": 2.8043505915277955, + "ewc_loss": 0.08215221762657166, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004431041597854346, + "grad_norm": 9.529433250427246, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8751997351646423, + "num_tokens": 841215914.0, + "step": 22045 + }, + { + "epoch": 2.804477801806386, + "ewc_loss": 0.0827644020318985, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044922606321051717, + "grad_norm": 9.756741523742676, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8659790754318237, + "num_tokens": 841256097.0, + "step": 22046 + }, + { + "epoch": 2.8046050120849766, + "ewc_loss": 0.08192834258079529, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044086543493904173, + "grad_norm": 9.5159273147583, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8701969981193542, + "num_tokens": 841288997.0, + "step": 22047 + }, + { + "epoch": 2.804732222363567, + "ewc_loss": 0.08298036456108093, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045138568384572864, + "grad_norm": 9.722743034362793, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8597220778465271, + "num_tokens": 841328053.0, + "step": 22048 + }, + { + "epoch": 2.8048594326421576, + "ewc_loss": 0.0821133553981781, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044271553633734584, + "grad_norm": 9.614043235778809, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8838655948638916, + "num_tokens": 841366848.0, + "step": 22049 + }, + { + "epoch": 2.804986642920748, + "ewc_loss": 0.08242502808570862, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044827372767031193, + "grad_norm": 9.680792808532715, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8734285831451416, + "num_tokens": 841398568.0, + "step": 22050 + }, + { + "epoch": 2.8051138531993383, + "ewc_loss": 0.08206315338611603, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004446549282874912, + "grad_norm": 9.668302536010742, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8619796633720398, + "num_tokens": 841435722.0, + "step": 22051 + }, + { + "epoch": 2.8052410634779292, + "ewc_loss": 0.0822082906961441, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004461063945200294, + "grad_norm": 9.661009788513184, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8603886961936951, + "num_tokens": 841473635.0, + "step": 22052 + }, + { + "epoch": 2.8053682737565193, + "ewc_loss": 0.08188170194625854, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044284050818532705, + "grad_norm": 9.556666374206543, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8669124841690063, + "num_tokens": 841512383.0, + "step": 22053 + }, + { + "epoch": 2.8054954840351103, + "ewc_loss": 0.08211963623762131, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044521980453282595, + "grad_norm": 9.686722755432129, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8587392568588257, + "num_tokens": 841543174.0, + "step": 22054 + }, + { + "epoch": 2.8056226943137004, + "ewc_loss": 0.08171752095222473, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004411986446939409, + "grad_norm": 9.56511116027832, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.869530439376831, + "num_tokens": 841582185.0, + "step": 22055 + }, + { + "epoch": 2.805749904592291, + "ewc_loss": 0.08218815177679062, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044590496690943837, + "grad_norm": 9.63766098022461, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8594341278076172, + "num_tokens": 841623179.0, + "step": 22056 + }, + { + "epoch": 2.8058771148708814, + "ewc_loss": 0.08188647031784058, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004428881511557847, + "grad_norm": 9.598437309265137, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8734863996505737, + "num_tokens": 841661062.0, + "step": 22057 + }, + { + "epoch": 2.806004325149472, + "ewc_loss": 0.08197127282619476, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004437361494638026, + "grad_norm": 9.61530876159668, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8625384569168091, + "num_tokens": 841704259.0, + "step": 22058 + }, + { + "epoch": 2.8061315354280625, + "ewc_loss": 0.08185450732707977, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044256856199353933, + "grad_norm": 9.486577033996582, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.862282395362854, + "num_tokens": 841748396.0, + "step": 22059 + }, + { + "epoch": 2.806258745706653, + "ewc_loss": 0.08239702880382538, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004479937779251486, + "grad_norm": 9.666566848754883, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8835744857788086, + "num_tokens": 841787809.0, + "step": 22060 + }, + { + "epoch": 2.8063859559852435, + "ewc_loss": 0.08171229064464569, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044114634511061013, + "grad_norm": 9.528945922851562, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8650074005126953, + "num_tokens": 841824046.0, + "step": 22061 + }, + { + "epoch": 2.806513166263834, + "ewc_loss": 0.08235777914524078, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004476012836676091, + "grad_norm": 9.66818618774414, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8744901418685913, + "num_tokens": 841856949.0, + "step": 22062 + }, + { + "epoch": 2.8066403765424246, + "ewc_loss": 0.08177614212036133, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044178482494316995, + "grad_norm": 9.596609115600586, + "learning_rate": 1e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8506820797920227, + "num_tokens": 841898644.0, + "step": 22063 + }, + { + "epoch": 2.806767586821015, + "ewc_loss": 0.08226528763771057, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004466763057280332, + "grad_norm": 9.580562591552734, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8682999610900879, + "num_tokens": 841934795.0, + "step": 22064 + }, + { + "epoch": 2.8068947970996057, + "ewc_loss": 0.08205956220626831, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044461904326453805, + "grad_norm": 9.607686042785645, + "learning_rate": 1e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8497074842453003, + "num_tokens": 841976878.0, + "step": 22065 + }, + { + "epoch": 2.807022007378196, + "ewc_loss": 0.08232787251472473, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004448607505764812, + "grad_norm": 9.66965389251709, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8656366467475891, + "num_tokens": 842010998.0, + "step": 22066 + }, + { + "epoch": 2.8071492176567867, + "ewc_loss": 0.08201300352811813, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004441534692887217, + "grad_norm": 9.506982803344727, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.890881359577179, + "num_tokens": 842051410.0, + "step": 22067 + }, + { + "epoch": 2.8072764279353772, + "ewc_loss": 0.08248667418956757, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044889020500704646, + "grad_norm": 9.68822956085205, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8689336180686951, + "num_tokens": 842087954.0, + "step": 22068 + }, + { + "epoch": 2.8074036382139678, + "ewc_loss": 0.08158658444881439, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043988926336169243, + "grad_norm": 9.451220512390137, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8622614145278931, + "num_tokens": 842128326.0, + "step": 22069 + }, + { + "epoch": 2.8075308484925583, + "ewc_loss": 0.0829363465309143, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004533868923317641, + "grad_norm": 9.703108787536621, + "learning_rate": 1e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8437446355819702, + "num_tokens": 842171143.0, + "step": 22070 + }, + { + "epoch": 2.807658058771149, + "ewc_loss": 0.0815812423825264, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004398358578328043, + "grad_norm": 9.459333419799805, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8516676425933838, + "num_tokens": 842212167.0, + "step": 22071 + }, + { + "epoch": 2.8077852690497394, + "ewc_loss": 0.08294571936130524, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045348057756200433, + "grad_norm": 9.676624298095703, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8561986088752747, + "num_tokens": 842252576.0, + "step": 22072 + }, + { + "epoch": 2.80791247932833, + "ewc_loss": 0.08176624774932861, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004416859592311084, + "grad_norm": 9.520009994506836, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8612394332885742, + "num_tokens": 842287812.0, + "step": 22073 + }, + { + "epoch": 2.80803968960692, + "ewc_loss": 0.08266451954841614, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004506686527747661, + "grad_norm": 9.649599075317383, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8606030941009521, + "num_tokens": 842328289.0, + "step": 22074 + }, + { + "epoch": 2.808166899885511, + "ewc_loss": 0.08201880753040314, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044421147322282195, + "grad_norm": 9.575179100036621, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8666471242904663, + "num_tokens": 842359182.0, + "step": 22075 + }, + { + "epoch": 2.808294110164101, + "ewc_loss": 0.08258846402168274, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004499081114772707, + "grad_norm": 9.689642906188965, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8560807108879089, + "num_tokens": 842399820.0, + "step": 22076 + }, + { + "epoch": 2.808421320442692, + "ewc_loss": 0.08198133856058121, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004438368196133524, + "grad_norm": 9.568215370178223, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8629167079925537, + "num_tokens": 842437852.0, + "step": 22077 + }, + { + "epoch": 2.808548530721282, + "ewc_loss": 0.08261376619338989, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045016108197160065, + "grad_norm": 9.73028564453125, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8509044647216797, + "num_tokens": 842476729.0, + "step": 22078 + }, + { + "epoch": 2.808675740999873, + "ewc_loss": 0.0817863792181015, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044188721221871674, + "grad_norm": 9.541598320007324, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8520267605781555, + "num_tokens": 842514590.0, + "step": 22079 + }, + { + "epoch": 2.808802951278463, + "ewc_loss": 0.08257341384887695, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044975761556997895, + "grad_norm": 9.688356399536133, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.863874077796936, + "num_tokens": 842556502.0, + "step": 22080 + }, + { + "epoch": 2.8089301615570537, + "ewc_loss": 0.08191116154193878, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044313506805337965, + "grad_norm": 9.579622268676758, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8531050682067871, + "num_tokens": 842594299.0, + "step": 22081 + }, + { + "epoch": 2.809057371835644, + "ewc_loss": 0.08234566450119019, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004474801244214177, + "grad_norm": 9.626797676086426, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8622575998306274, + "num_tokens": 842633189.0, + "step": 22082 + }, + { + "epoch": 2.8091845821142347, + "ewc_loss": 0.0821094736456871, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044511817395687103, + "grad_norm": 9.547962188720703, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8679530024528503, + "num_tokens": 842673626.0, + "step": 22083 + }, + { + "epoch": 2.8093117923928252, + "ewc_loss": 0.08245948702096939, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044861831702291965, + "grad_norm": 9.683980941772461, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8585411310195923, + "num_tokens": 842712574.0, + "step": 22084 + }, + { + "epoch": 2.8094390026714158, + "ewc_loss": 0.0817471370100975, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004414948052726686, + "grad_norm": 9.517375946044922, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8616490364074707, + "num_tokens": 842749534.0, + "step": 22085 + }, + { + "epoch": 2.8095662129500063, + "ewc_loss": 0.08254355192184448, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044945901026949286, + "grad_norm": 9.707974433898926, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.85419762134552, + "num_tokens": 842784989.0, + "step": 22086 + }, + { + "epoch": 2.809693423228597, + "ewc_loss": 0.08179688453674316, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004419922479428351, + "grad_norm": 9.528568267822266, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.865416944026947, + "num_tokens": 842823858.0, + "step": 22087 + }, + { + "epoch": 2.8098206335071874, + "ewc_loss": 0.082834891974926, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004523723619058728, + "grad_norm": 9.720788955688477, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8645506501197815, + "num_tokens": 842858849.0, + "step": 22088 + }, + { + "epoch": 2.809947843785778, + "ewc_loss": 0.08160784840583801, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044010195415467024, + "grad_norm": 9.495589256286621, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8844632506370544, + "num_tokens": 842895809.0, + "step": 22089 + }, + { + "epoch": 2.8100750540643684, + "ewc_loss": 0.08290910720825195, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045311453868635, + "grad_norm": 9.738926887512207, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8919262290000916, + "num_tokens": 842933650.0, + "step": 22090 + }, + { + "epoch": 2.810202264342959, + "ewc_loss": 0.08170051127672195, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044102853280492127, + "grad_norm": 9.591448783874512, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8699740171432495, + "num_tokens": 842967200.0, + "step": 22091 + }, + { + "epoch": 2.8103294746215495, + "ewc_loss": 0.08259862661361694, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004500097129493952, + "grad_norm": 9.728289604187012, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8548707365989685, + "num_tokens": 843002498.0, + "step": 22092 + }, + { + "epoch": 2.81045668490014, + "ewc_loss": 0.08160001784563065, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004424650105647743, + "grad_norm": 9.536959648132324, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8705191016197205, + "num_tokens": 843042627.0, + "step": 22093 + }, + { + "epoch": 2.8105838951787305, + "ewc_loss": 0.08242352306842804, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044825865188613534, + "grad_norm": 9.731327056884766, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8607959151268005, + "num_tokens": 843076862.0, + "step": 22094 + }, + { + "epoch": 2.810711105457321, + "ewc_loss": 0.08155404031276703, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004420052282512188, + "grad_norm": 9.554526329040527, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.864015519618988, + "num_tokens": 843115000.0, + "step": 22095 + }, + { + "epoch": 2.8108383157359116, + "ewc_loss": 0.08211252093315125, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044759007869288325, + "grad_norm": 9.683426856994629, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8720324635505676, + "num_tokens": 843152408.0, + "step": 22096 + }, + { + "epoch": 2.810965526014502, + "ewc_loss": 0.08170241117477417, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004434889415279031, + "grad_norm": 9.646781921386719, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8658034801483154, + "num_tokens": 843189269.0, + "step": 22097 + }, + { + "epoch": 2.8110927362930926, + "ewc_loss": 0.08211825788021088, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044764738413505256, + "grad_norm": 9.571218490600586, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8594620227813721, + "num_tokens": 843227148.0, + "step": 22098 + }, + { + "epoch": 2.8112199465716827, + "ewc_loss": 0.08230019360780716, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044702537707053125, + "grad_norm": 9.715601921081543, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.854128897190094, + "num_tokens": 843259736.0, + "step": 22099 + }, + { + "epoch": 2.8113471568502737, + "ewc_loss": 0.0816977471113205, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004410008550621569, + "grad_norm": 9.501724243164062, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8603099584579468, + "num_tokens": 843302598.0, + "step": 22100 + }, + { + "epoch": 2.811474367128864, + "ewc_loss": 0.08253008127212524, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045176560524851084, + "grad_norm": 9.725333213806152, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8803227543830872, + "num_tokens": 843337615.0, + "step": 22101 + }, + { + "epoch": 2.8116015774074548, + "ewc_loss": 0.08132313191890717, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043969618855044246, + "grad_norm": 9.55355453491211, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8804635405540466, + "num_tokens": 843378845.0, + "step": 22102 + }, + { + "epoch": 2.811728787686045, + "ewc_loss": 0.08233091235160828, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004497739137150347, + "grad_norm": 9.71314525604248, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8528698682785034, + "num_tokens": 843414128.0, + "step": 22103 + }, + { + "epoch": 2.811855997964636, + "ewc_loss": 0.08136508613824844, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004401157202664763, + "grad_norm": 9.484549522399902, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8811339139938354, + "num_tokens": 843454255.0, + "step": 22104 + }, + { + "epoch": 2.811983208243226, + "ewc_loss": 0.08247186243534088, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004511834413278848, + "grad_norm": 9.730087280273438, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8659237623214722, + "num_tokens": 843498332.0, + "step": 22105 + }, + { + "epoch": 2.8121104185218164, + "ewc_loss": 0.08167988061904907, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004408222739584744, + "grad_norm": 9.526108741760254, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8556334972381592, + "num_tokens": 843539196.0, + "step": 22106 + }, + { + "epoch": 2.812237628800407, + "ewc_loss": 0.08294607698917389, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004534842446446419, + "grad_norm": 9.781457901000977, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8685582876205444, + "num_tokens": 843573767.0, + "step": 22107 + }, + { + "epoch": 2.8123648390789975, + "ewc_loss": 0.08157062530517578, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044217114918865263, + "grad_norm": 9.604408264160156, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8529468774795532, + "num_tokens": 843611156.0, + "step": 22108 + }, + { + "epoch": 2.812492049357588, + "ewc_loss": 0.08229139447212219, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004493788001127541, + "grad_norm": 9.738238334655762, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8702878355979919, + "num_tokens": 843644026.0, + "step": 22109 + }, + { + "epoch": 2.8126192596361785, + "ewc_loss": 0.0817122831940651, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004435876908246428, + "grad_norm": 9.60842227935791, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8687926530838013, + "num_tokens": 843683477.0, + "step": 22110 + }, + { + "epoch": 2.812746469914769, + "ewc_loss": 0.08228212594985962, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004492861044127494, + "grad_norm": 9.72065258026123, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.851058840751648, + "num_tokens": 843722815.0, + "step": 22111 + }, + { + "epoch": 2.8128736801933596, + "ewc_loss": 0.08163788914680481, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044284373871050775, + "grad_norm": 9.560959815979004, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8670613765716553, + "num_tokens": 843763843.0, + "step": 22112 + }, + { + "epoch": 2.81300089047195, + "ewc_loss": 0.08226901292800903, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004491550207603723, + "grad_norm": 9.748364448547363, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8576240539550781, + "num_tokens": 843805327.0, + "step": 22113 + }, + { + "epoch": 2.8131281007505406, + "ewc_loss": 0.08148328959941864, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412977141328156, + "grad_norm": 9.554627418518066, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8652114868164062, + "num_tokens": 843850132.0, + "step": 22114 + }, + { + "epoch": 2.813255311029131, + "ewc_loss": 0.08242601156234741, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045072496868669987, + "grad_norm": 9.735325813293457, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8653532862663269, + "num_tokens": 843887425.0, + "step": 22115 + }, + { + "epoch": 2.8133825213077217, + "ewc_loss": 0.08151021599769592, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004415670409798622, + "grad_norm": 9.634732246398926, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8727242350578308, + "num_tokens": 843920728.0, + "step": 22116 + }, + { + "epoch": 2.8135097315863122, + "ewc_loss": 0.08209045976400375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044736944255419075, + "grad_norm": 9.72171688079834, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8622643947601318, + "num_tokens": 843959733.0, + "step": 22117 + }, + { + "epoch": 2.8136369418649028, + "ewc_loss": 0.08171702176332474, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004436350427567959, + "grad_norm": 9.575213432312012, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8662620782852173, + "num_tokens": 844005616.0, + "step": 22118 + }, + { + "epoch": 2.8137641521434933, + "ewc_loss": 0.08223310858011246, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004487959377001971, + "grad_norm": 9.774327278137207, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8788327574729919, + "num_tokens": 844034216.0, + "step": 22119 + }, + { + "epoch": 2.813891362422084, + "ewc_loss": 0.08142484724521637, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044071333832107484, + "grad_norm": 9.610213279724121, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8624328374862671, + "num_tokens": 844071173.0, + "step": 22120 + }, + { + "epoch": 2.8140185727006743, + "ewc_loss": 0.0824727863073349, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045119275455363095, + "grad_norm": 9.810120582580566, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8620588779449463, + "num_tokens": 844113665.0, + "step": 22121 + }, + { + "epoch": 2.814145782979265, + "ewc_loss": 0.08140795677900314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044054441968910396, + "grad_norm": 9.58267593383789, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8640376329421997, + "num_tokens": 844154063.0, + "step": 22122 + }, + { + "epoch": 2.8142729932578554, + "ewc_loss": 0.08253346383571625, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004493580781854689, + "grad_norm": 9.76899528503418, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8850644826889038, + "num_tokens": 844185634.0, + "step": 22123 + }, + { + "epoch": 2.8144002035364455, + "ewc_loss": 0.08136090636253357, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044007395626977086, + "grad_norm": 9.567511558532715, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.854293942451477, + "num_tokens": 844221755.0, + "step": 22124 + }, + { + "epoch": 2.8145274138150365, + "ewc_loss": 0.0824064165353775, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004505289834924042, + "grad_norm": 9.761018753051758, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8702763319015503, + "num_tokens": 844256230.0, + "step": 22125 + }, + { + "epoch": 2.8146546240936265, + "ewc_loss": 0.08147232979536057, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000441188138211146, + "grad_norm": 9.704069137573242, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8572776317596436, + "num_tokens": 844292823.0, + "step": 22126 + }, + { + "epoch": 2.8147818343722175, + "ewc_loss": 0.08202891796827316, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004467540420591831, + "grad_norm": 9.69666862487793, + "learning_rate": 1e-06, + "loss": 0.545, + "mean_token_accuracy": 0.842373251914978, + "num_tokens": 844331843.0, + "step": 22127 + }, + { + "epoch": 2.8149090446508076, + "ewc_loss": 0.08177994936704636, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044426435488276184, + "grad_norm": 9.646196365356445, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8747469186782837, + "num_tokens": 844375823.0, + "step": 22128 + }, + { + "epoch": 2.815036254929398, + "ewc_loss": 0.08256706595420837, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004472527070902288, + "grad_norm": 9.694201469421387, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8652013540267944, + "num_tokens": 844415187.0, + "step": 22129 + }, + { + "epoch": 2.8151634652079887, + "ewc_loss": 0.08229106664657593, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044449264532886446, + "grad_norm": 9.581347465515137, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8684178590774536, + "num_tokens": 844454986.0, + "step": 22130 + }, + { + "epoch": 2.815290675486579, + "ewc_loss": 0.08252682536840439, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004492916923481971, + "grad_norm": 9.75349235534668, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8712543845176697, + "num_tokens": 844495514.0, + "step": 22131 + }, + { + "epoch": 2.8154178857651697, + "ewc_loss": 0.08191152662038803, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044313870603218675, + "grad_norm": 9.597293853759766, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.863068163394928, + "num_tokens": 844532406.0, + "step": 22132 + }, + { + "epoch": 2.8155450960437602, + "ewc_loss": 0.08261780440807343, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004502014780882746, + "grad_norm": 9.719529151916504, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8641470670700073, + "num_tokens": 844569561.0, + "step": 22133 + }, + { + "epoch": 2.8156723063223508, + "ewc_loss": 0.08204230666160583, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044444651575759053, + "grad_norm": 9.57451057434082, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8698374032974243, + "num_tokens": 844613504.0, + "step": 22134 + }, + { + "epoch": 2.8157995166009413, + "ewc_loss": 0.0828663557767868, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000452687032520771, + "grad_norm": 9.765387535095215, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8632757663726807, + "num_tokens": 844649255.0, + "step": 22135 + }, + { + "epoch": 2.815926726879532, + "ewc_loss": 0.08184976130723953, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044252106454223394, + "grad_norm": 9.582679748535156, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8664050102233887, + "num_tokens": 844687202.0, + "step": 22136 + }, + { + "epoch": 2.8160539371581224, + "ewc_loss": 0.08283768594264984, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004524003015831113, + "grad_norm": 9.788627624511719, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8654505610466003, + "num_tokens": 844723096.0, + "step": 22137 + }, + { + "epoch": 2.816181147436713, + "ewc_loss": 0.08187642693519592, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004427876847330481, + "grad_norm": 9.571417808532715, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8654539585113525, + "num_tokens": 844759754.0, + "step": 22138 + }, + { + "epoch": 2.8163083577153034, + "ewc_loss": 0.08284201472997665, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004500021750573069, + "grad_norm": 9.660235404968262, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8677005767822266, + "num_tokens": 844798372.0, + "step": 22139 + }, + { + "epoch": 2.816435567993894, + "ewc_loss": 0.08208269625902176, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004448504187166691, + "grad_norm": 9.665840148925781, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8598504662513733, + "num_tokens": 844832738.0, + "step": 22140 + }, + { + "epoch": 2.8165627782724845, + "ewc_loss": 0.08257722109556198, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044735425035469234, + "grad_norm": 9.618083000183105, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8662258386611938, + "num_tokens": 844871684.0, + "step": 22141 + }, + { + "epoch": 2.816689988551075, + "ewc_loss": 0.08231367915868759, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004471602151170373, + "grad_norm": 9.637309074401855, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8575394749641418, + "num_tokens": 844912140.0, + "step": 22142 + }, + { + "epoch": 2.8168171988296655, + "ewc_loss": 0.08217137306928635, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004457371833268553, + "grad_norm": 9.59857177734375, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8531761169433594, + "num_tokens": 844948256.0, + "step": 22143 + }, + { + "epoch": 2.816944409108256, + "ewc_loss": 0.08246661722660065, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044868956319987774, + "grad_norm": 9.668426513671875, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8798878192901611, + "num_tokens": 844983821.0, + "step": 22144 + }, + { + "epoch": 2.8170716193868466, + "ewc_loss": 0.08216723054647446, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044325433555059135, + "grad_norm": 9.554248809814453, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8677710890769958, + "num_tokens": 845020622.0, + "step": 22145 + }, + { + "epoch": 2.817198829665437, + "ewc_loss": 0.08256936073303223, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004497170739341527, + "grad_norm": 9.664876937866211, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8559633493423462, + "num_tokens": 845056748.0, + "step": 22146 + }, + { + "epoch": 2.8173260399440276, + "ewc_loss": 0.08203545212745667, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004443779762368649, + "grad_norm": 9.64622688293457, + "learning_rate": 1e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8450156450271606, + "num_tokens": 845096294.0, + "step": 22147 + }, + { + "epoch": 2.817453250222618, + "ewc_loss": 0.08241288363933563, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044815221917815506, + "grad_norm": 9.647529602050781, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8806973695755005, + "num_tokens": 845135132.0, + "step": 22148 + }, + { + "epoch": 2.8175804605012083, + "ewc_loss": 0.08224846422672272, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044650802738033235, + "grad_norm": 9.61727237701416, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8736110329627991, + "num_tokens": 845172164.0, + "step": 22149 + }, + { + "epoch": 2.8177076707797992, + "ewc_loss": 0.08258116245269775, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044739365694113076, + "grad_norm": 9.616630554199219, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8658859729766846, + "num_tokens": 845210270.0, + "step": 22150 + }, + { + "epoch": 2.8178348810583893, + "ewc_loss": 0.0826413556933403, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004479955823626369, + "grad_norm": 9.624724388122559, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8679893016815186, + "num_tokens": 845251863.0, + "step": 22151 + }, + { + "epoch": 2.8179620913369803, + "ewc_loss": 0.08232450485229492, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004472684522625059, + "grad_norm": 9.602620124816895, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8700405359268188, + "num_tokens": 845290262.0, + "step": 22152 + }, + { + "epoch": 2.8180893016155704, + "ewc_loss": 0.08222579956054688, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004462814540602267, + "grad_norm": 9.569574356079102, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8733358383178711, + "num_tokens": 845326894.0, + "step": 22153 + }, + { + "epoch": 2.818216511894161, + "ewc_loss": 0.08238402754068375, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044786371290683746, + "grad_norm": 9.638864517211914, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8744521737098694, + "num_tokens": 845367545.0, + "step": 22154 + }, + { + "epoch": 2.8183437221727514, + "ewc_loss": 0.0822373554110527, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004463969962671399, + "grad_norm": 9.751397132873535, + "learning_rate": 1e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8504292368888855, + "num_tokens": 845400920.0, + "step": 22155 + }, + { + "epoch": 2.818470932451342, + "ewc_loss": 0.08198608458042145, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044388428796082735, + "grad_norm": 9.584598541259766, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8623640537261963, + "num_tokens": 845436279.0, + "step": 22156 + }, + { + "epoch": 2.8185981427299325, + "ewc_loss": 0.08252488821744919, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004492723383009434, + "grad_norm": 9.789399147033691, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8691630363464355, + "num_tokens": 845476308.0, + "step": 22157 + }, + { + "epoch": 2.818725353008523, + "ewc_loss": 0.08171813189983368, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004412047564983368, + "grad_norm": 9.486407279968262, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8622865080833435, + "num_tokens": 845516624.0, + "step": 22158 + }, + { + "epoch": 2.8188525632871135, + "ewc_loss": 0.08278894424438477, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045191284152679145, + "grad_norm": 9.73648738861084, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8831555843353271, + "num_tokens": 845552385.0, + "step": 22159 + }, + { + "epoch": 2.818979773565704, + "ewc_loss": 0.08147607743740082, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043878424912691116, + "grad_norm": 9.48088264465332, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.873640239238739, + "num_tokens": 845592600.0, + "step": 22160 + }, + { + "epoch": 2.8191069838442946, + "ewc_loss": 0.08305072784423828, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004545307019725442, + "grad_norm": 9.773566246032715, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8742527961730957, + "num_tokens": 845630028.0, + "step": 22161 + }, + { + "epoch": 2.819234194122885, + "ewc_loss": 0.08180996775627136, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004396817530505359, + "grad_norm": 9.474250793457031, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8612755537033081, + "num_tokens": 845665874.0, + "step": 22162 + }, + { + "epoch": 2.8193614044014756, + "ewc_loss": 0.08319319784641266, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045595536357723176, + "grad_norm": 9.839903831481934, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8524420261383057, + "num_tokens": 845702738.0, + "step": 22163 + }, + { + "epoch": 2.819488614680066, + "ewc_loss": 0.0814259722828865, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004382831684779376, + "grad_norm": 9.418498992919922, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.869357168674469, + "num_tokens": 845744785.0, + "step": 22164 + }, + { + "epoch": 2.8196158249586567, + "ewc_loss": 0.08349685370922089, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004589920281432569, + "grad_norm": 9.890359878540039, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8659987449645996, + "num_tokens": 845779196.0, + "step": 22165 + }, + { + "epoch": 2.8197430352372472, + "ewc_loss": 0.08140309154987335, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043805435416288674, + "grad_norm": 9.510149955749512, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8538827896118164, + "num_tokens": 845819016.0, + "step": 22166 + }, + { + "epoch": 2.8198702455158378, + "ewc_loss": 0.0834532231092453, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045855564530938864, + "grad_norm": 10.049942016601562, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8672971725463867, + "num_tokens": 845855014.0, + "step": 22167 + }, + { + "epoch": 2.8199974557944283, + "ewc_loss": 0.08123403787612915, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043636379996314645, + "grad_norm": 9.401266098022461, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.874182939529419, + "num_tokens": 845895491.0, + "step": 22168 + }, + { + "epoch": 2.820124666073019, + "ewc_loss": 0.08398184180259705, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046384186134673655, + "grad_norm": 10.035640716552734, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8559660911560059, + "num_tokens": 845931560.0, + "step": 22169 + }, + { + "epoch": 2.8202518763516093, + "ewc_loss": 0.08114482462406158, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043547165114432573, + "grad_norm": 9.419445991516113, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8666787147521973, + "num_tokens": 845966701.0, + "step": 22170 + }, + { + "epoch": 2.8203790866302, + "ewc_loss": 0.08407825231552124, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046480598393827677, + "grad_norm": 10.030524253845215, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8833470344543457, + "num_tokens": 846002327.0, + "step": 22171 + }, + { + "epoch": 2.82050629690879, + "ewc_loss": 0.08139697462320328, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004355517739895731, + "grad_norm": 9.42424488067627, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8664464950561523, + "num_tokens": 846042157.0, + "step": 22172 + }, + { + "epoch": 2.820633507187381, + "ewc_loss": 0.08396627008914948, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046368612674996257, + "grad_norm": 9.9685640335083, + "learning_rate": 1e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8513384461402893, + "num_tokens": 846082851.0, + "step": 22173 + }, + { + "epoch": 2.820760717465971, + "ewc_loss": 0.08161197602748871, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004401432233862579, + "grad_norm": 9.556377410888672, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8646008968353271, + "num_tokens": 846116846.0, + "step": 22174 + }, + { + "epoch": 2.820887927744562, + "ewc_loss": 0.08363872021436691, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004579692322295159, + "grad_norm": 9.884815216064453, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8688884377479553, + "num_tokens": 846154373.0, + "step": 22175 + }, + { + "epoch": 2.821015138023152, + "ewc_loss": 0.08178423345088959, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004418657917995006, + "grad_norm": 9.534042358398438, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8836066722869873, + "num_tokens": 846195329.0, + "step": 22176 + }, + { + "epoch": 2.821142348301743, + "ewc_loss": 0.08315333724021912, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004555568448267877, + "grad_norm": 9.841656684875488, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8757920265197754, + "num_tokens": 846233078.0, + "step": 22177 + }, + { + "epoch": 2.821269558580333, + "ewc_loss": 0.08175864815711975, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000441609910922125, + "grad_norm": 9.541582107543945, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8643807172775269, + "num_tokens": 846269030.0, + "step": 22178 + }, + { + "epoch": 2.8213967688589237, + "ewc_loss": 0.08304253220558167, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045444874558597803, + "grad_norm": 9.80843734741211, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8615337610244751, + "num_tokens": 846307269.0, + "step": 22179 + }, + { + "epoch": 2.821523979137514, + "ewc_loss": 0.0821174681186676, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044275668915361166, + "grad_norm": 9.657718658447266, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8554909825325012, + "num_tokens": 846339116.0, + "step": 22180 + }, + { + "epoch": 2.8216511894161047, + "ewc_loss": 0.08251366764307022, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004491601139307022, + "grad_norm": 9.641966819763184, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8589941263198853, + "num_tokens": 846384968.0, + "step": 22181 + }, + { + "epoch": 2.8217783996946952, + "ewc_loss": 0.08251065015792847, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044668855844065547, + "grad_norm": 9.644810676574707, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8495628237724304, + "num_tokens": 846426235.0, + "step": 22182 + }, + { + "epoch": 2.8219056099732858, + "ewc_loss": 0.08238938450813293, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004479172930587083, + "grad_norm": 9.675464630126953, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.852283239364624, + "num_tokens": 846465715.0, + "step": 22183 + }, + { + "epoch": 2.8220328202518763, + "ewc_loss": 0.0822967141866684, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000446990568889305, + "grad_norm": 9.684159278869629, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8513783812522888, + "num_tokens": 846499972.0, + "step": 22184 + }, + { + "epoch": 2.822160030530467, + "ewc_loss": 0.08231756091117859, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004471990978345275, + "grad_norm": 9.691230773925781, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8646711111068726, + "num_tokens": 846535906.0, + "step": 22185 + }, + { + "epoch": 2.8222872408090574, + "ewc_loss": 0.08214983344078064, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004455217276699841, + "grad_norm": 9.671988487243652, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.858170747756958, + "num_tokens": 846572723.0, + "step": 22186 + }, + { + "epoch": 2.822414451087648, + "ewc_loss": 0.08246751129627228, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004462570941541344, + "grad_norm": 9.61945915222168, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.868044376373291, + "num_tokens": 846613501.0, + "step": 22187 + }, + { + "epoch": 2.8225416613662384, + "ewc_loss": 0.08230666816234589, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004470901330932975, + "grad_norm": 9.619036674499512, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8865067958831787, + "num_tokens": 846650001.0, + "step": 22188 + }, + { + "epoch": 2.822668871644829, + "ewc_loss": 0.0821368396282196, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044539180817082524, + "grad_norm": 9.580058097839355, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8524843454360962, + "num_tokens": 846690636.0, + "step": 22189 + }, + { + "epoch": 2.8227960819234195, + "ewc_loss": 0.08232821524143219, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044730561785399914, + "grad_norm": 9.709547996520996, + "learning_rate": 1e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8434261083602905, + "num_tokens": 846728513.0, + "step": 22190 + }, + { + "epoch": 2.82292329220201, + "ewc_loss": 0.08211186528205872, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004451421264093369, + "grad_norm": 9.565523147583008, + "learning_rate": 1e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8506624698638916, + "num_tokens": 846771989.0, + "step": 22191 + }, + { + "epoch": 2.8230505024806005, + "ewc_loss": 0.08285477757453918, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004501297662500292, + "grad_norm": 9.633339881896973, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8823988437652588, + "num_tokens": 846808393.0, + "step": 22192 + }, + { + "epoch": 2.823177712759191, + "ewc_loss": 0.0821814015507698, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044583744602277875, + "grad_norm": 9.634336471557617, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8557027578353882, + "num_tokens": 846846928.0, + "step": 22193 + }, + { + "epoch": 2.8233049230377816, + "ewc_loss": 0.08237957954406738, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044781918404623866, + "grad_norm": 9.630850791931152, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.854500412940979, + "num_tokens": 846884637.0, + "step": 22194 + }, + { + "epoch": 2.823432133316372, + "ewc_loss": 0.08240217715501785, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044804520439356565, + "grad_norm": 9.600083351135254, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8682965636253357, + "num_tokens": 846925749.0, + "step": 22195 + }, + { + "epoch": 2.8235593435949626, + "ewc_loss": 0.08239662647247314, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004479897324927151, + "grad_norm": 9.66203498840332, + "learning_rate": 1e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8493355512619019, + "num_tokens": 846967191.0, + "step": 22196 + }, + { + "epoch": 2.8236865538735527, + "ewc_loss": 0.08245286345481873, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044611067278310657, + "grad_norm": 9.530258178710938, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8708586692810059, + "num_tokens": 847003280.0, + "step": 22197 + }, + { + "epoch": 2.8238137641521437, + "ewc_loss": 0.08272640407085419, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004512874293141067, + "grad_norm": 9.69134521484375, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8511782884597778, + "num_tokens": 847042179.0, + "step": 22198 + }, + { + "epoch": 2.8239409744307338, + "ewc_loss": 0.0820932388305664, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044495577458292246, + "grad_norm": 9.531086921691895, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8610957860946655, + "num_tokens": 847083008.0, + "step": 22199 + }, + { + "epoch": 2.8240681847093247, + "ewc_loss": 0.08311650902032852, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004527471319306642, + "grad_norm": 9.68533706665039, + "learning_rate": 1e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8433689475059509, + "num_tokens": 847124009.0, + "step": 22200 + }, + { + "epoch": 2.824195394987915, + "ewc_loss": 0.08238225430250168, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044540458475239575, + "grad_norm": 14.851065635681152, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8585358262062073, + "num_tokens": 847163526.0, + "step": 22201 + }, + { + "epoch": 2.824322605266506, + "ewc_loss": 0.08774654567241669, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0005014889175072312, + "grad_norm": 10.055940628051758, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8638395071029663, + "num_tokens": 847201374.0, + "step": 22202 + }, + { + "epoch": 2.824449815545096, + "ewc_loss": 0.08710435032844543, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004950669244863093, + "grad_norm": 10.332037925720215, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.849311351776123, + "num_tokens": 847239469.0, + "step": 22203 + }, + { + "epoch": 2.8245770258236864, + "ewc_loss": 0.08210050314664841, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004450284759514034, + "grad_norm": 9.582487106323242, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8625108599662781, + "num_tokens": 847275964.0, + "step": 22204 + }, + { + "epoch": 2.824704236102277, + "ewc_loss": 0.08764712512493134, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0005004946724511683, + "grad_norm": 10.41479206085205, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8661240339279175, + "num_tokens": 847316815.0, + "step": 22205 + }, + { + "epoch": 2.8248314463808675, + "ewc_loss": 0.08238451182842255, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004478685150388628, + "grad_norm": 9.654776573181152, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8685036897659302, + "num_tokens": 847350949.0, + "step": 22206 + }, + { + "epoch": 2.824958656659458, + "ewc_loss": 0.08591538667678833, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004831772530451417, + "grad_norm": 10.20128059387207, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8690534830093384, + "num_tokens": 847388945.0, + "step": 22207 + }, + { + "epoch": 2.8250858669380485, + "ewc_loss": 0.08255179226398468, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044954134500585496, + "grad_norm": 9.637659072875977, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8762938976287842, + "num_tokens": 847435114.0, + "step": 22208 + }, + { + "epoch": 2.825213077216639, + "ewc_loss": 0.08503463119268417, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004743697354570031, + "grad_norm": 10.132755279541016, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8551398515701294, + "num_tokens": 847470815.0, + "step": 22209 + }, + { + "epoch": 2.8253402874952296, + "ewc_loss": 0.08265434205532074, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004481254727579653, + "grad_norm": 9.69995403289795, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8615657687187195, + "num_tokens": 847509379.0, + "step": 22210 + }, + { + "epoch": 2.82546749777382, + "ewc_loss": 0.08387826383113861, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004652474890463054, + "grad_norm": 9.979958534240723, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8687974810600281, + "num_tokens": 847547427.0, + "step": 22211 + }, + { + "epoch": 2.8255947080524106, + "ewc_loss": 0.08249404281377792, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044896386680193245, + "grad_norm": 9.778882026672363, + "learning_rate": 1e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8466667532920837, + "num_tokens": 847584740.0, + "step": 22212 + }, + { + "epoch": 2.825721918331001, + "ewc_loss": 0.08301068842411041, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045413029147312045, + "grad_norm": 9.910387992858887, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8532863855361938, + "num_tokens": 847626201.0, + "step": 22213 + }, + { + "epoch": 2.8258491286095917, + "ewc_loss": 0.08238967508077621, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000447920203441754, + "grad_norm": 9.830293655395508, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8785793781280518, + "num_tokens": 847661311.0, + "step": 22214 + }, + { + "epoch": 2.8259763388881822, + "ewc_loss": 0.08269497752189636, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004509732243604958, + "grad_norm": 9.868000984191895, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8637604713439941, + "num_tokens": 847695234.0, + "step": 22215 + }, + { + "epoch": 2.8261035491667728, + "ewc_loss": 0.08204258978366852, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004444492806214839, + "grad_norm": 9.718859672546387, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8821585774421692, + "num_tokens": 847738848.0, + "step": 22216 + }, + { + "epoch": 2.8262307594453633, + "ewc_loss": 0.08240063488483429, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044802980846725404, + "grad_norm": 9.778875350952148, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8685523271560669, + "num_tokens": 847775952.0, + "step": 22217 + }, + { + "epoch": 2.826357969723954, + "ewc_loss": 0.08253636211156845, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044694566167891026, + "grad_norm": 15.271282196044922, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8805273771286011, + "num_tokens": 847809381.0, + "step": 22218 + }, + { + "epoch": 2.8264851800025443, + "ewc_loss": 0.08810395002365112, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000505062926094979, + "grad_norm": 10.216375350952148, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8617940545082092, + "num_tokens": 847849835.0, + "step": 22219 + }, + { + "epoch": 2.826612390281135, + "ewc_loss": 0.08656717836856842, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000489695172291249, + "grad_norm": 10.585176467895508, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8621256351470947, + "num_tokens": 847882726.0, + "step": 22220 + }, + { + "epoch": 2.8267396005597254, + "ewc_loss": 0.08228831738233566, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004469066043384373, + "grad_norm": 9.863821983337402, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8654884696006775, + "num_tokens": 847920663.0, + "step": 22221 + }, + { + "epoch": 2.8268668108383155, + "ewc_loss": 0.08699505031108856, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004939739010296762, + "grad_norm": 10.450883865356445, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8711081743240356, + "num_tokens": 847962204.0, + "step": 22222 + }, + { + "epoch": 2.8269940211169065, + "ewc_loss": 0.08237679302692413, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004477913898881525, + "grad_norm": 10.017036437988281, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8566429615020752, + "num_tokens": 848003722.0, + "step": 22223 + }, + { + "epoch": 2.8271212313954965, + "ewc_loss": 0.08443263173103333, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046834975364618003, + "grad_norm": 10.068512916564941, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8736966848373413, + "num_tokens": 848045335.0, + "step": 22224 + }, + { + "epoch": 2.8272484416740875, + "ewc_loss": 0.08246497064828873, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044867314863950014, + "grad_norm": 9.788304328918457, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8575522899627686, + "num_tokens": 848088817.0, + "step": 22225 + }, + { + "epoch": 2.8273756519526776, + "ewc_loss": 0.08372080326080322, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004636728554032743, + "grad_norm": 10.226923942565918, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.854703962802887, + "num_tokens": 848120993.0, + "step": 22226 + }, + { + "epoch": 2.827502862231268, + "ewc_loss": 0.08146978914737701, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004411627014633268, + "grad_norm": 9.615294456481934, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8556506037712097, + "num_tokens": 848162542.0, + "step": 22227 + }, + { + "epoch": 2.8276300725098586, + "ewc_loss": 0.08404023200273514, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004668671463150531, + "grad_norm": 10.126012802124023, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8763802647590637, + "num_tokens": 848198194.0, + "step": 22228 + }, + { + "epoch": 2.827757282788449, + "ewc_loss": 0.08151598274707794, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004416246374603361, + "grad_norm": 9.785409927368164, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8604587316513062, + "num_tokens": 848240575.0, + "step": 22229 + }, + { + "epoch": 2.8278844930670397, + "ewc_loss": 0.08336739987134933, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046013883547857404, + "grad_norm": 10.053711891174316, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8651067614555359, + "num_tokens": 848289026.0, + "step": 22230 + }, + { + "epoch": 2.8280117033456302, + "ewc_loss": 0.08152194321155548, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004416843003127724, + "grad_norm": 9.597929000854492, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8666587471961975, + "num_tokens": 848326438.0, + "step": 22231 + }, + { + "epoch": 2.8281389136242208, + "ewc_loss": 0.08366658538579941, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004631306801456958, + "grad_norm": 10.154650688171387, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8714327216148376, + "num_tokens": 848368106.0, + "step": 22232 + }, + { + "epoch": 2.8282661239028113, + "ewc_loss": 0.08125664293766022, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043903125333599746, + "grad_norm": 9.492384910583496, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8623301982879639, + "num_tokens": 848409775.0, + "step": 22233 + }, + { + "epoch": 2.828393334181402, + "ewc_loss": 0.08434704691171646, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004699353303294629, + "grad_norm": 10.193432807922363, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8652653098106384, + "num_tokens": 848442149.0, + "step": 22234 + }, + { + "epoch": 2.8285205444599923, + "ewc_loss": 0.08132144808769226, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004396792792249471, + "grad_norm": 9.657132148742676, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8590102195739746, + "num_tokens": 848475468.0, + "step": 22235 + }, + { + "epoch": 2.828647754738583, + "ewc_loss": 0.08370686322450638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000463533477159217, + "grad_norm": 10.013505935668945, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8653925657272339, + "num_tokens": 848509565.0, + "step": 22236 + }, + { + "epoch": 2.8287749650171734, + "ewc_loss": 0.08171668648719788, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044363175402395427, + "grad_norm": 9.697711944580078, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8664546012878418, + "num_tokens": 848546357.0, + "step": 22237 + }, + { + "epoch": 2.828902175295764, + "ewc_loss": 0.08301137387752533, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004565785638988018, + "grad_norm": 9.922751426696777, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8533211946487427, + "num_tokens": 848587661.0, + "step": 22238 + }, + { + "epoch": 2.8290293855743545, + "ewc_loss": 0.0818881094455719, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000445345911430195, + "grad_norm": 9.68334674835205, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8690503835678101, + "num_tokens": 848628250.0, + "step": 22239 + }, + { + "epoch": 2.829156595852945, + "ewc_loss": 0.08269825577735901, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004534473700914532, + "grad_norm": 9.878110885620117, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8650736808776855, + "num_tokens": 848670452.0, + "step": 22240 + }, + { + "epoch": 2.8292838061315355, + "ewc_loss": 0.08204739540815353, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044693879317492247, + "grad_norm": 9.78591537475586, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8664220571517944, + "num_tokens": 848703592.0, + "step": 22241 + }, + { + "epoch": 2.829411016410126, + "ewc_loss": 0.08221960067749023, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044866089592687786, + "grad_norm": 9.728120803833008, + "learning_rate": 1e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8481664657592773, + "num_tokens": 848739291.0, + "step": 22242 + }, + { + "epoch": 2.8295382266887166, + "ewc_loss": 0.08243485540151596, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004508133861236274, + "grad_norm": 10.095726013183594, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8578615188598633, + "num_tokens": 848776039.0, + "step": 22243 + }, + { + "epoch": 2.829665436967307, + "ewc_loss": 0.08123292028903961, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043879399891011417, + "grad_norm": 9.544170379638672, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8679519295692444, + "num_tokens": 848814799.0, + "step": 22244 + }, + { + "epoch": 2.8297926472458976, + "ewc_loss": 0.08328355848789215, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045930041233077645, + "grad_norm": 9.928933143615723, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8655024766921997, + "num_tokens": 848860123.0, + "step": 22245 + }, + { + "epoch": 2.829919857524488, + "ewc_loss": 0.08139348030090332, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043795828241854906, + "grad_norm": 9.538217544555664, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.881414532661438, + "num_tokens": 848900447.0, + "step": 22246 + }, + { + "epoch": 2.8300470678030782, + "ewc_loss": 0.08372563123703003, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000461279705632478, + "grad_norm": 9.997981071472168, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8576756119728088, + "num_tokens": 848945385.0, + "step": 22247 + }, + { + "epoch": 2.830174278081669, + "ewc_loss": 0.08116281032562256, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004380929167382419, + "grad_norm": 9.581843376159668, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8536297082901001, + "num_tokens": 848978783.0, + "step": 22248 + }, + { + "epoch": 2.8303014883602593, + "ewc_loss": 0.08349472284317017, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004614120698533952, + "grad_norm": 9.940252304077148, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8526889681816101, + "num_tokens": 849017324.0, + "step": 22249 + }, + { + "epoch": 2.8304286986388503, + "ewc_loss": 0.08164970576763153, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004405204963404685, + "grad_norm": 9.590723037719727, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8862336874008179, + "num_tokens": 849045928.0, + "step": 22250 + }, + { + "epoch": 2.8305559089174404, + "ewc_loss": 0.08328348398208618, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045929968473501503, + "grad_norm": 9.950374603271484, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8644245862960815, + "num_tokens": 849080855.0, + "step": 22251 + }, + { + "epoch": 2.830683119196031, + "ewc_loss": 0.0814238116145134, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004407029482536018, + "grad_norm": 9.54045295715332, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8680329322814941, + "num_tokens": 849114835.0, + "step": 22252 + }, + { + "epoch": 2.8308103294746214, + "ewc_loss": 0.0832538902759552, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004590037278831005, + "grad_norm": 9.9448881149292, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.852934718132019, + "num_tokens": 849152632.0, + "step": 22253 + }, + { + "epoch": 2.830937539753212, + "ewc_loss": 0.08141109347343445, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004405758227221668, + "grad_norm": 9.520214080810547, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8716009855270386, + "num_tokens": 849189911.0, + "step": 22254 + }, + { + "epoch": 2.8310647500318025, + "ewc_loss": 0.08343373239040375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046080219908617437, + "grad_norm": 9.884957313537598, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8614178895950317, + "num_tokens": 849227847.0, + "step": 22255 + }, + { + "epoch": 2.831191960310393, + "ewc_loss": 0.08173497766256332, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044137320946902037, + "grad_norm": 9.600238800048828, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8702595233917236, + "num_tokens": 849265778.0, + "step": 22256 + }, + { + "epoch": 2.8313191705889835, + "ewc_loss": 0.08331288397312164, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004571522295009345, + "grad_norm": 9.860472679138184, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8630548715591431, + "num_tokens": 849302488.0, + "step": 22257 + }, + { + "epoch": 2.831446380867574, + "ewc_loss": 0.08211870491504669, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044521046220324934, + "grad_norm": 9.5091552734375, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8799766898155212, + "num_tokens": 849337483.0, + "step": 22258 + }, + { + "epoch": 2.8315735911461646, + "ewc_loss": 0.08355452120304108, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004595686332322657, + "grad_norm": 9.869629859924316, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8700518608093262, + "num_tokens": 849378491.0, + "step": 22259 + }, + { + "epoch": 2.831700801424755, + "ewc_loss": 0.08212064206600189, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044522990356199443, + "grad_norm": 9.589017868041992, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.871233344078064, + "num_tokens": 849418897.0, + "step": 22260 + }, + { + "epoch": 2.8318280117033456, + "ewc_loss": 0.08359788358211517, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004600023094099015, + "grad_norm": 9.902311325073242, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8681985139846802, + "num_tokens": 849457057.0, + "step": 22261 + }, + { + "epoch": 2.831955221981936, + "ewc_loss": 0.08199581503868103, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044398155296221375, + "grad_norm": 9.624696731567383, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8681345582008362, + "num_tokens": 849488825.0, + "step": 22262 + }, + { + "epoch": 2.8320824322605267, + "ewc_loss": 0.08346374332904816, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045866085565648973, + "grad_norm": 9.891057014465332, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8727277517318726, + "num_tokens": 849528336.0, + "step": 22263 + }, + { + "epoch": 2.8322096425391172, + "ewc_loss": 0.08199545741081238, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044397800229489803, + "grad_norm": 9.547571182250977, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.878409743309021, + "num_tokens": 849565373.0, + "step": 22264 + }, + { + "epoch": 2.8323368528177078, + "ewc_loss": 0.0835023745894432, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004590471799019724, + "grad_norm": 9.923638343811035, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8771073222160339, + "num_tokens": 849600909.0, + "step": 22265 + }, + { + "epoch": 2.8324640630962983, + "ewc_loss": 0.08149286359548569, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004413934948388487, + "grad_norm": 9.5137357711792, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8550081253051758, + "num_tokens": 849639499.0, + "step": 22266 + }, + { + "epoch": 2.832591273374889, + "ewc_loss": 0.08343678712844849, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046083275810815394, + "grad_norm": 9.904239654541016, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8756279945373535, + "num_tokens": 849677232.0, + "step": 22267 + }, + { + "epoch": 2.8327184836534793, + "ewc_loss": 0.08161661773920059, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004426310188136995, + "grad_norm": 9.508094787597656, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.871636152267456, + "num_tokens": 849717187.0, + "step": 22268 + }, + { + "epoch": 2.83284569393207, + "ewc_loss": 0.08366501331329346, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046311496407724917, + "grad_norm": 9.997231483459473, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8740907907485962, + "num_tokens": 849756545.0, + "step": 22269 + }, + { + "epoch": 2.83297290421066, + "ewc_loss": 0.0813637301325798, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044010215788148344, + "grad_norm": 9.514514923095703, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8670738935470581, + "num_tokens": 849788670.0, + "step": 22270 + }, + { + "epoch": 2.833100114489251, + "ewc_loss": 0.08394783735275269, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046594321611337364, + "grad_norm": 10.106048583984375, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8652049899101257, + "num_tokens": 849825715.0, + "step": 22271 + }, + { + "epoch": 2.833227324767841, + "ewc_loss": 0.08116716146469116, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004381364560686052, + "grad_norm": 9.501852035522461, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8715459704399109, + "num_tokens": 849856435.0, + "step": 22272 + }, + { + "epoch": 2.833354535046432, + "ewc_loss": 0.08418560028076172, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004658794787246734, + "grad_norm": 10.039239883422852, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8798960447311401, + "num_tokens": 849895634.0, + "step": 22273 + }, + { + "epoch": 2.833481745325022, + "ewc_loss": 0.08164598792791367, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004404833307489753, + "grad_norm": 9.47276782989502, + "learning_rate": 1e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.843718409538269, + "num_tokens": 849938646.0, + "step": 22274 + }, + { + "epoch": 2.833608955603613, + "ewc_loss": 0.0843806192278862, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004653882060665637, + "grad_norm": 9.951888084411621, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8625332713127136, + "num_tokens": 849978642.0, + "step": 22275 + }, + { + "epoch": 2.833736165882203, + "ewc_loss": 0.08189056813716888, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004429291293490678, + "grad_norm": 9.514074325561523, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8637418746948242, + "num_tokens": 850017820.0, + "step": 22276 + }, + { + "epoch": 2.8338633761607936, + "ewc_loss": 0.08400353789329529, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004640587721951306, + "grad_norm": 9.931053161621094, + "learning_rate": 1e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8470144271850586, + "num_tokens": 850056619.0, + "step": 22277 + }, + { + "epoch": 2.833990586439384, + "ewc_loss": 0.08204765617847443, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044450003770180047, + "grad_norm": 9.608942031860352, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8616528511047363, + "num_tokens": 850092456.0, + "step": 22278 + }, + { + "epoch": 2.8341177967179747, + "ewc_loss": 0.08370739221572876, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046109731192700565, + "grad_norm": 9.852829933166504, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8611388206481934, + "num_tokens": 850131598.0, + "step": 22279 + }, + { + "epoch": 2.8342450069965652, + "ewc_loss": 0.08234737813472748, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044749723747372627, + "grad_norm": 9.686172485351562, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8557834625244141, + "num_tokens": 850170884.0, + "step": 22280 + }, + { + "epoch": 2.8343722172751558, + "ewc_loss": 0.08319613337516785, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045354338362812996, + "grad_norm": 9.735403060913086, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8625712394714355, + "num_tokens": 850206989.0, + "step": 22281 + }, + { + "epoch": 2.8344994275537463, + "ewc_loss": 0.08236362040042877, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045010101166553795, + "grad_norm": 9.658670425415039, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8606777787208557, + "num_tokens": 850248602.0, + "step": 22282 + }, + { + "epoch": 2.834626637832337, + "ewc_loss": 0.0826408788561821, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004528736462816596, + "grad_norm": 9.718533515930176, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8560380339622498, + "num_tokens": 850289790.0, + "step": 22283 + }, + { + "epoch": 2.8347538481109273, + "ewc_loss": 0.08243235945701599, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045078841503709555, + "grad_norm": 9.695446014404297, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8732643723487854, + "num_tokens": 850326688.0, + "step": 22284 + }, + { + "epoch": 2.834881058389518, + "ewc_loss": 0.08278577774763107, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000451881205663085, + "grad_norm": 9.679332733154297, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8765754699707031, + "num_tokens": 850366460.0, + "step": 22285 + }, + { + "epoch": 2.8350082686681084, + "ewc_loss": 0.08273248374462128, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004513482272159308, + "grad_norm": 9.668048858642578, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8711382150650024, + "num_tokens": 850400046.0, + "step": 22286 + }, + { + "epoch": 2.835135478946699, + "ewc_loss": 0.08252124488353729, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004516772460192442, + "grad_norm": 9.71080493927002, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8547348976135254, + "num_tokens": 850434374.0, + "step": 22287 + }, + { + "epoch": 2.8352626892252895, + "ewc_loss": 0.08241213858127594, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004505862598307431, + "grad_norm": 9.702632904052734, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8673471212387085, + "num_tokens": 850469056.0, + "step": 22288 + }, + { + "epoch": 2.83538989950388, + "ewc_loss": 0.08220396935939789, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044850449194200337, + "grad_norm": 9.675034523010254, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8625519275665283, + "num_tokens": 850505805.0, + "step": 22289 + }, + { + "epoch": 2.8355171097824705, + "ewc_loss": 0.0825175940990448, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045164080802351236, + "grad_norm": 9.710187911987305, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8532116413116455, + "num_tokens": 850550056.0, + "step": 22290 + }, + { + "epoch": 2.835644320061061, + "ewc_loss": 0.0822843462228775, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000449308252427727, + "grad_norm": 9.649188041687012, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8734127283096313, + "num_tokens": 850585075.0, + "step": 22291 + }, + { + "epoch": 2.8357715303396516, + "ewc_loss": 0.08252652734518051, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004517301276791841, + "grad_norm": 9.677023887634277, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8757516741752625, + "num_tokens": 850625758.0, + "step": 22292 + }, + { + "epoch": 2.835898740618242, + "ewc_loss": 0.08230595290660858, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004495243774726987, + "grad_norm": 9.635091781616211, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8783921599388123, + "num_tokens": 850664608.0, + "step": 22293 + }, + { + "epoch": 2.8360259508968326, + "ewc_loss": 0.08260315656661987, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004524964315351099, + "grad_norm": 9.730820655822754, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8742924332618713, + "num_tokens": 850705897.0, + "step": 22294 + }, + { + "epoch": 2.8361531611754227, + "ewc_loss": 0.08201158046722412, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004465806996449828, + "grad_norm": 9.625750541687012, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8596453070640564, + "num_tokens": 850746195.0, + "step": 22295 + }, + { + "epoch": 2.8362803714540137, + "ewc_loss": 0.0826614499092102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004530793521553278, + "grad_norm": 9.752779006958008, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8722283840179443, + "num_tokens": 850785183.0, + "step": 22296 + }, + { + "epoch": 2.8364075817326038, + "ewc_loss": 0.08210479468107224, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004475128080230206, + "grad_norm": 9.63872241973877, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.870055615901947, + "num_tokens": 850819558.0, + "step": 22297 + }, + { + "epoch": 2.8365347920111947, + "ewc_loss": 0.08274012058973312, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045386605779640377, + "grad_norm": 9.792922973632812, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8555793762207031, + "num_tokens": 850857385.0, + "step": 22298 + }, + { + "epoch": 2.836662002289785, + "ewc_loss": 0.08192063122987747, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000445671146735549, + "grad_norm": 9.614686965942383, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8592252135276794, + "num_tokens": 850895297.0, + "step": 22299 + }, + { + "epoch": 2.836789212568376, + "ewc_loss": 0.08267372846603394, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004532021703198552, + "grad_norm": 9.748153686523438, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8649454116821289, + "num_tokens": 850930982.0, + "step": 22300 + }, + { + "epoch": 2.836916422846966, + "ewc_loss": 0.08204793930053711, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044694426469504833, + "grad_norm": 9.668550491333008, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8613035678863525, + "num_tokens": 850969032.0, + "step": 22301 + }, + { + "epoch": 2.8370436331255564, + "ewc_loss": 0.08257906138896942, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004522554809227586, + "grad_norm": 9.76757526397705, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8685275912284851, + "num_tokens": 851008325.0, + "step": 22302 + }, + { + "epoch": 2.837170843404147, + "ewc_loss": 0.0820164680480957, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044662956497631967, + "grad_norm": 9.620668411254883, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8674407005310059, + "num_tokens": 851044289.0, + "step": 22303 + }, + { + "epoch": 2.8372980536827375, + "ewc_loss": 0.08270241320133209, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004534890176728368, + "grad_norm": 9.819153785705566, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8711401224136353, + "num_tokens": 851079887.0, + "step": 22304 + }, + { + "epoch": 2.837425263961328, + "ewc_loss": 0.08177673071622849, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004442321660462767, + "grad_norm": 9.645236015319824, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8685978651046753, + "num_tokens": 851124535.0, + "step": 22305 + }, + { + "epoch": 2.8375524742399185, + "ewc_loss": 0.08270376920700073, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045350255095399916, + "grad_norm": 9.742633819580078, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8752872943878174, + "num_tokens": 851162982.0, + "step": 22306 + }, + { + "epoch": 2.837679684518509, + "ewc_loss": 0.0821089819073677, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004475546593312174, + "grad_norm": 9.70450210571289, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8736605644226074, + "num_tokens": 851201864.0, + "step": 22307 + }, + { + "epoch": 2.8378068947970996, + "ewc_loss": 0.08239366114139557, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045040150871500373, + "grad_norm": 9.727540969848633, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8686143159866333, + "num_tokens": 851241659.0, + "step": 22308 + }, + { + "epoch": 2.83793410507569, + "ewc_loss": 0.08225206285715103, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004489854909479618, + "grad_norm": 9.68562126159668, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8568054437637329, + "num_tokens": 851278861.0, + "step": 22309 + }, + { + "epoch": 2.8380613153542806, + "ewc_loss": 0.08245104551315308, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004509752616286278, + "grad_norm": 9.701789855957031, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8665273189544678, + "num_tokens": 851314442.0, + "step": 22310 + }, + { + "epoch": 2.838188525632871, + "ewc_loss": 0.08227692544460297, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044923406676389277, + "grad_norm": 9.670822143554688, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.860152006149292, + "num_tokens": 851356322.0, + "step": 22311 + }, + { + "epoch": 2.8383157359114617, + "ewc_loss": 0.08258388936519623, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045230373507365584, + "grad_norm": 9.731396675109863, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8535434603691101, + "num_tokens": 851395374.0, + "step": 22312 + }, + { + "epoch": 2.838442946190052, + "ewc_loss": 0.08217902481555939, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004482551303226501, + "grad_norm": 9.653266906738281, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8640367984771729, + "num_tokens": 851430957.0, + "step": 22313 + }, + { + "epoch": 2.8385701564686427, + "ewc_loss": 0.08266038447618484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045306870015338063, + "grad_norm": 9.72738265991211, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8600596189498901, + "num_tokens": 851474357.0, + "step": 22314 + }, + { + "epoch": 2.8386973667472333, + "ewc_loss": 0.08220971375703812, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004485619720071554, + "grad_norm": 9.705394744873047, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8784874081611633, + "num_tokens": 851510346.0, + "step": 22315 + }, + { + "epoch": 2.838824577025824, + "ewc_loss": 0.0825115442276001, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045158033026382327, + "grad_norm": 9.75147533416748, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8686107993125916, + "num_tokens": 851546798.0, + "step": 22316 + }, + { + "epoch": 2.8389517873044143, + "ewc_loss": 0.0821966677904129, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004484315577428788, + "grad_norm": 9.678815841674805, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8688357472419739, + "num_tokens": 851587819.0, + "step": 22317 + }, + { + "epoch": 2.839078997583005, + "ewc_loss": 0.08259335160255432, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045239838073030114, + "grad_norm": 9.70671272277832, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8696146011352539, + "num_tokens": 851628764.0, + "step": 22318 + }, + { + "epoch": 2.8392062078615954, + "ewc_loss": 0.08240070939064026, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004504719690885395, + "grad_norm": 9.710600852966309, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8524861931800842, + "num_tokens": 851672206.0, + "step": 22319 + }, + { + "epoch": 2.8393334181401855, + "ewc_loss": 0.08247727155685425, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045123754534870386, + "grad_norm": 9.740782737731934, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.859991192817688, + "num_tokens": 851709969.0, + "step": 22320 + }, + { + "epoch": 2.8394606284187764, + "ewc_loss": 0.08229720592498779, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004494368622545153, + "grad_norm": 9.682869911193848, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8576771020889282, + "num_tokens": 851749015.0, + "step": 22321 + }, + { + "epoch": 2.8395878386973665, + "ewc_loss": 0.08250763267278671, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004515411565080285, + "grad_norm": 9.663214683532715, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8700075149536133, + "num_tokens": 851787210.0, + "step": 22322 + }, + { + "epoch": 2.8397150489759575, + "ewc_loss": 0.08246386796236038, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004511035222094506, + "grad_norm": 9.788808822631836, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8598713874816895, + "num_tokens": 851824677.0, + "step": 22323 + }, + { + "epoch": 2.8398422592545476, + "ewc_loss": 0.08198758959770203, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004463407094590366, + "grad_norm": 9.584324836730957, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8707579374313354, + "num_tokens": 851863614.0, + "step": 22324 + }, + { + "epoch": 2.839969469533138, + "ewc_loss": 0.08297616988420486, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004562265530694276, + "grad_norm": 9.757250785827637, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.859760046005249, + "num_tokens": 851908153.0, + "step": 22325 + }, + { + "epoch": 2.8400966798117286, + "ewc_loss": 0.0819825530052185, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044629035983234644, + "grad_norm": 9.614309310913086, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8616011738777161, + "num_tokens": 851946473.0, + "step": 22326 + }, + { + "epoch": 2.840223890090319, + "ewc_loss": 0.08288924396038055, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045535725075751543, + "grad_norm": 9.727961540222168, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8550335168838501, + "num_tokens": 851991600.0, + "step": 22327 + }, + { + "epoch": 2.8403511003689097, + "ewc_loss": 0.08212888985872269, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044775372953154147, + "grad_norm": 9.594759941101074, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8662758469581604, + "num_tokens": 852029468.0, + "step": 22328 + }, + { + "epoch": 2.8404783106475002, + "ewc_loss": 0.08288435637950897, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004553084436338395, + "grad_norm": 9.780545234680176, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8701457977294922, + "num_tokens": 852061400.0, + "step": 22329 + }, + { + "epoch": 2.8406055209260908, + "ewc_loss": 0.08206284791231155, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004470933345146477, + "grad_norm": 9.61109447479248, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8652694225311279, + "num_tokens": 852100596.0, + "step": 22330 + }, + { + "epoch": 2.8407327312046813, + "ewc_loss": 0.08293312788009644, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000455796136520803, + "grad_norm": 9.769373893737793, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8677983283996582, + "num_tokens": 852140320.0, + "step": 22331 + }, + { + "epoch": 2.840859941483272, + "ewc_loss": 0.08217397332191467, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044820457696914673, + "grad_norm": 9.596710205078125, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8637452125549316, + "num_tokens": 852175969.0, + "step": 22332 + }, + { + "epoch": 2.8409871517618623, + "ewc_loss": 0.08292551338672638, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045572000090032816, + "grad_norm": 9.780233383178711, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8502232432365417, + "num_tokens": 852210816.0, + "step": 22333 + }, + { + "epoch": 2.841114362040453, + "ewc_loss": 0.08209829032421112, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004474477609619498, + "grad_norm": 9.600909233093262, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8631845712661743, + "num_tokens": 852246684.0, + "step": 22334 + }, + { + "epoch": 2.8412415723190434, + "ewc_loss": 0.08351150155067444, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004566970164887607, + "grad_norm": 10.30642032623291, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.869232177734375, + "num_tokens": 852285283.0, + "step": 22335 + }, + { + "epoch": 2.841368782597634, + "ewc_loss": 0.08105568587779999, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043702172115445137, + "grad_norm": 9.389253616333008, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8722370266914368, + "num_tokens": 852323310.0, + "step": 22336 + }, + { + "epoch": 2.8414959928762245, + "ewc_loss": 0.08468323200941086, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004732971719931811, + "grad_norm": 10.06815242767334, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8686636090278625, + "num_tokens": 852360850.0, + "step": 22337 + }, + { + "epoch": 2.841623203154815, + "ewc_loss": 0.08100200444459915, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004364849010016769, + "grad_norm": 9.323151588439941, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.859207034111023, + "num_tokens": 852399320.0, + "step": 22338 + }, + { + "epoch": 2.8417504134334055, + "ewc_loss": 0.08523663878440857, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004788312071468681, + "grad_norm": 10.200566291809082, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.862335205078125, + "num_tokens": 852430980.0, + "step": 22339 + }, + { + "epoch": 2.841877623711996, + "ewc_loss": 0.08122450113296509, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043870985973626375, + "grad_norm": 9.356377601623535, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8762279748916626, + "num_tokens": 852465688.0, + "step": 22340 + }, + { + "epoch": 2.8420048339905866, + "ewc_loss": 0.08537346869707108, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00048019952373579144, + "grad_norm": 10.145169258117676, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8632906079292297, + "num_tokens": 852504372.0, + "step": 22341 + }, + { + "epoch": 2.842132044269177, + "ewc_loss": 0.08166773617267609, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004431421693880111, + "grad_norm": 9.511602401733398, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8696854710578918, + "num_tokens": 852542232.0, + "step": 22342 + }, + { + "epoch": 2.8422592545477676, + "ewc_loss": 0.08493529260158539, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004758177965413779, + "grad_norm": 10.122380256652832, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8561248779296875, + "num_tokens": 852580005.0, + "step": 22343 + }, + { + "epoch": 2.842386464826358, + "ewc_loss": 0.0819278433918953, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004457432951312512, + "grad_norm": 9.483431816101074, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8770041465759277, + "num_tokens": 852619912.0, + "step": 22344 + }, + { + "epoch": 2.8425136751049482, + "ewc_loss": 0.08489769697189331, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004754417750518769, + "grad_norm": 10.120950698852539, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8674349188804626, + "num_tokens": 852654349.0, + "step": 22345 + }, + { + "epoch": 2.842640885383539, + "ewc_loss": 0.08208976686000824, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004473624867387116, + "grad_norm": 9.554112434387207, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8576619029045105, + "num_tokens": 852691300.0, + "step": 22346 + }, + { + "epoch": 2.8427680956621293, + "ewc_loss": 0.08437202125787735, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00047018504119478166, + "grad_norm": 10.058707237243652, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8641379475593567, + "num_tokens": 852731228.0, + "step": 22347 + }, + { + "epoch": 2.8428953059407203, + "ewc_loss": 0.08209741115570068, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000447438913397491, + "grad_norm": 9.61221694946289, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8640657663345337, + "num_tokens": 852768333.0, + "step": 22348 + }, + { + "epoch": 2.8430225162193103, + "ewc_loss": 0.08388496190309525, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046531445696018636, + "grad_norm": 9.951691627502441, + "learning_rate": 1e-06, + "loss": 0.58, + "mean_token_accuracy": 0.832595944404602, + "num_tokens": 852810962.0, + "step": 22349 + }, + { + "epoch": 2.843149726497901, + "ewc_loss": 0.0820193812251091, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004466586688067764, + "grad_norm": 9.606026649475098, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8803185224533081, + "num_tokens": 852844326.0, + "step": 22350 + }, + { + "epoch": 2.8432769367764914, + "ewc_loss": 0.08345527201890945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004610175674315542, + "grad_norm": 9.89563274383545, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8584288954734802, + "num_tokens": 852877737.0, + "step": 22351 + }, + { + "epoch": 2.843404147055082, + "ewc_loss": 0.08253517746925354, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004469337873160839, + "grad_norm": 9.588743209838867, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.872435986995697, + "num_tokens": 852922085.0, + "step": 22352 + }, + { + "epoch": 2.8435313573336725, + "ewc_loss": 0.08323504030704498, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045881528058089316, + "grad_norm": 9.819807052612305, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8829376697540283, + "num_tokens": 852960742.0, + "step": 22353 + }, + { + "epoch": 2.843658567612263, + "ewc_loss": 0.08269798755645752, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004485618555918336, + "grad_norm": 9.645880699157715, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8699433207511902, + "num_tokens": 852993502.0, + "step": 22354 + }, + { + "epoch": 2.8437857778908535, + "ewc_loss": 0.08292994648218155, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045576432603411376, + "grad_norm": 9.800586700439453, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8714169859886169, + "num_tokens": 853036844.0, + "step": 22355 + }, + { + "epoch": 2.843912988169444, + "ewc_loss": 0.08226145803928375, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044907937990501523, + "grad_norm": 9.669085502624512, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8640625476837158, + "num_tokens": 853070851.0, + "step": 22356 + }, + { + "epoch": 2.8440401984480346, + "ewc_loss": 0.08275952935218811, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004540601803455502, + "grad_norm": 9.783004760742188, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.864443838596344, + "num_tokens": 853106116.0, + "step": 22357 + }, + { + "epoch": 2.844167408726625, + "ewc_loss": 0.08217965066432953, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044826132943853736, + "grad_norm": 9.63662338256836, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8636332750320435, + "num_tokens": 853152776.0, + "step": 22358 + }, + { + "epoch": 2.8442946190052156, + "ewc_loss": 0.0827556848526001, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045402173418551683, + "grad_norm": 9.792915344238281, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8632587194442749, + "num_tokens": 853194176.0, + "step": 22359 + }, + { + "epoch": 2.844421829283806, + "ewc_loss": 0.08205856382846832, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044705052278004587, + "grad_norm": 9.587026596069336, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8622345924377441, + "num_tokens": 853236834.0, + "step": 22360 + }, + { + "epoch": 2.8445490395623967, + "ewc_loss": 0.0829477310180664, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004559421504382044, + "grad_norm": 9.849580764770508, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.856937825679779, + "num_tokens": 853274145.0, + "step": 22361 + }, + { + "epoch": 2.844676249840987, + "ewc_loss": 0.08194329589605331, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000445897807367146, + "grad_norm": 9.651290893554688, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8670021295547485, + "num_tokens": 853312569.0, + "step": 22362 + }, + { + "epoch": 2.8448034601195777, + "ewc_loss": 0.08292210847139359, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004556859203148633, + "grad_norm": 9.869006156921387, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.870195209980011, + "num_tokens": 853351742.0, + "step": 22363 + }, + { + "epoch": 2.8449306703981683, + "ewc_loss": 0.08172308653593063, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004436957242432982, + "grad_norm": 9.635226249694824, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8671973943710327, + "num_tokens": 853389607.0, + "step": 22364 + }, + { + "epoch": 2.845057880676759, + "ewc_loss": 0.08296218514442444, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000456086709164083, + "grad_norm": 9.836795806884766, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8721385598182678, + "num_tokens": 853427218.0, + "step": 22365 + }, + { + "epoch": 2.8451850909553493, + "ewc_loss": 0.08173084259033203, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004437732568476349, + "grad_norm": 9.61096477508545, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8695101737976074, + "num_tokens": 853464773.0, + "step": 22366 + }, + { + "epoch": 2.84531230123394, + "ewc_loss": 0.08288395404815674, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000455304398201406, + "grad_norm": 9.854887962341309, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8654757142066956, + "num_tokens": 853507250.0, + "step": 22367 + }, + { + "epoch": 2.84543951151253, + "ewc_loss": 0.08178119361400604, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004442767531145364, + "grad_norm": 9.587665557861328, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8682932257652283, + "num_tokens": 853551947.0, + "step": 22368 + }, + { + "epoch": 2.845566721791121, + "ewc_loss": 0.0828779935836792, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045524482266046107, + "grad_norm": 9.84189510345459, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8569833040237427, + "num_tokens": 853590557.0, + "step": 22369 + }, + { + "epoch": 2.845693932069711, + "ewc_loss": 0.08185312151908875, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044499608338810503, + "grad_norm": 9.626309394836426, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8600937724113464, + "num_tokens": 853630870.0, + "step": 22370 + }, + { + "epoch": 2.845821142348302, + "ewc_loss": 0.08311961591243744, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004552195896394551, + "grad_norm": 9.897961616516113, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8620599508285522, + "num_tokens": 853662360.0, + "step": 22371 + }, + { + "epoch": 2.845948352626892, + "ewc_loss": 0.08157281577587128, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004421929770614952, + "grad_norm": 9.459712982177734, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8599200248718262, + "num_tokens": 853703661.0, + "step": 22372 + }, + { + "epoch": 2.846075562905483, + "ewc_loss": 0.08357205986976624, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000462185445940122, + "grad_norm": 9.977343559265137, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8535847067832947, + "num_tokens": 853742398.0, + "step": 22373 + }, + { + "epoch": 2.846202773184073, + "ewc_loss": 0.08145754039287567, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004410402034409344, + "grad_norm": 9.570714950561523, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.872502326965332, + "num_tokens": 853781479.0, + "step": 22374 + }, + { + "epoch": 2.8463299834626636, + "ewc_loss": 0.08360601961612701, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004625250003300607, + "grad_norm": 9.90417766571045, + "learning_rate": 1e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8400494456291199, + "num_tokens": 853822330.0, + "step": 22375 + }, + { + "epoch": 2.846457193741254, + "ewc_loss": 0.08174596726894379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044392450945451856, + "grad_norm": 9.544819831848145, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8507744073867798, + "num_tokens": 853862747.0, + "step": 22376 + }, + { + "epoch": 2.8465844040198447, + "ewc_loss": 0.0834951400756836, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046141623170115054, + "grad_norm": 9.90803050994873, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8886575698852539, + "num_tokens": 853897124.0, + "step": 22377 + }, + { + "epoch": 2.8467116142984352, + "ewc_loss": 0.08181954175233841, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004446602542884648, + "grad_norm": 9.609703063964844, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8510128855705261, + "num_tokens": 853930516.0, + "step": 22378 + }, + { + "epoch": 2.8468388245770258, + "ewc_loss": 0.08339908719062805, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004604557470884174, + "grad_norm": 9.865523338317871, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8707738518714905, + "num_tokens": 853967016.0, + "step": 22379 + }, + { + "epoch": 2.8469660348556163, + "ewc_loss": 0.08208653330802917, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044733015238307416, + "grad_norm": 9.6334867477417, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8492145538330078, + "num_tokens": 854008804.0, + "step": 22380 + }, + { + "epoch": 2.847093245134207, + "ewc_loss": 0.08320988714694977, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045856370707042515, + "grad_norm": 9.874398231506348, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8523131608963013, + "num_tokens": 854048393.0, + "step": 22381 + }, + { + "epoch": 2.8472204554127973, + "ewc_loss": 0.08191342651844025, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044559911475516856, + "grad_norm": 9.60317611694336, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8710044622421265, + "num_tokens": 854089480.0, + "step": 22382 + }, + { + "epoch": 2.847347665691388, + "ewc_loss": 0.083149753510952, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004579623928293586, + "grad_norm": 9.868391036987305, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8757752776145935, + "num_tokens": 854125677.0, + "step": 22383 + }, + { + "epoch": 2.8474748759699784, + "ewc_loss": 0.08191728591918945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004456377064343542, + "grad_norm": 9.581066131591797, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8609797358512878, + "num_tokens": 854160485.0, + "step": 22384 + }, + { + "epoch": 2.847602086248569, + "ewc_loss": 0.08334824442863464, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045994724496267736, + "grad_norm": 9.8796968460083, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.857083797454834, + "num_tokens": 854200635.0, + "step": 22385 + }, + { + "epoch": 2.8477292965271594, + "ewc_loss": 0.08195605129003525, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004460253694560379, + "grad_norm": 9.601717948913574, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8696122169494629, + "num_tokens": 854240046.0, + "step": 22386 + }, + { + "epoch": 2.84785650680575, + "ewc_loss": 0.08330892771482468, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045955413952469826, + "grad_norm": 9.85654067993164, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8761554956436157, + "num_tokens": 854275880.0, + "step": 22387 + }, + { + "epoch": 2.8479837170843405, + "ewc_loss": 0.08203516900539398, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044681658619083464, + "grad_norm": 9.588976860046387, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8669098019599915, + "num_tokens": 854311204.0, + "step": 22388 + }, + { + "epoch": 2.848110927362931, + "ewc_loss": 0.08323942124843597, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045885908184573054, + "grad_norm": 9.898920059204102, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8623461723327637, + "num_tokens": 854346076.0, + "step": 22389 + }, + { + "epoch": 2.8482381376415216, + "ewc_loss": 0.08198030292987823, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044626789167523384, + "grad_norm": 9.572371482849121, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8717619180679321, + "num_tokens": 854383075.0, + "step": 22390 + }, + { + "epoch": 2.848365347920112, + "ewc_loss": 0.08329982310533524, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045946307363919914, + "grad_norm": 9.818379402160645, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8569469451904297, + "num_tokens": 854416011.0, + "step": 22391 + }, + { + "epoch": 2.8484925581987026, + "ewc_loss": 0.08205296099185944, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004469944105949253, + "grad_norm": 9.647976875305176, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8672640919685364, + "num_tokens": 854449520.0, + "step": 22392 + }, + { + "epoch": 2.8486197684772927, + "ewc_loss": 0.08345232158899307, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045610524830408394, + "grad_norm": 9.805594444274902, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8625558018684387, + "num_tokens": 854484355.0, + "step": 22393 + }, + { + "epoch": 2.8487469787558837, + "ewc_loss": 0.08224622905254364, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044892713776789606, + "grad_norm": 9.65640926361084, + "learning_rate": 1e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8499003052711487, + "num_tokens": 854526238.0, + "step": 22394 + }, + { + "epoch": 2.8488741890344738, + "ewc_loss": 0.08277095854282379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045417441288009286, + "grad_norm": 9.774563789367676, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8666209578514099, + "num_tokens": 854562619.0, + "step": 22395 + }, + { + "epoch": 2.8490013993130647, + "ewc_loss": 0.08231820166110992, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004496468463912606, + "grad_norm": 9.64220142364502, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8702868819236755, + "num_tokens": 854604974.0, + "step": 22396 + }, + { + "epoch": 2.849128609591655, + "ewc_loss": 0.08268596231937408, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004533244646154344, + "grad_norm": 9.763477325439453, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8729368448257446, + "num_tokens": 854644066.0, + "step": 22397 + }, + { + "epoch": 2.849255819870246, + "ewc_loss": 0.08216586709022522, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004481235519051552, + "grad_norm": 9.664255142211914, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8680838346481323, + "num_tokens": 854682723.0, + "step": 22398 + }, + { + "epoch": 2.849383030148836, + "ewc_loss": 0.0824086144566536, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004505509859882295, + "grad_norm": 9.740121841430664, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8749228715896606, + "num_tokens": 854720292.0, + "step": 22399 + }, + { + "epoch": 2.8495102404274264, + "ewc_loss": 0.0822753831744194, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044921867083758116, + "grad_norm": 9.681337356567383, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8614118099212646, + "num_tokens": 854756917.0, + "step": 22400 + }, + { + "epoch": 2.849637450706017, + "ewc_loss": 0.08249685168266296, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004514333268161863, + "grad_norm": 9.747611045837402, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8655283451080322, + "num_tokens": 854798771.0, + "step": 22401 + }, + { + "epoch": 2.8497646609846075, + "ewc_loss": 0.08217865228652954, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044825137592852116, + "grad_norm": 9.61318588256836, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8631420135498047, + "num_tokens": 854838193.0, + "step": 22402 + }, + { + "epoch": 2.849891871263198, + "ewc_loss": 0.08255666494369507, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045203143963590264, + "grad_norm": 9.807452201843262, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8591110706329346, + "num_tokens": 854871650.0, + "step": 22403 + }, + { + "epoch": 2.8500190815417885, + "ewc_loss": 0.08192199468612671, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044568482553586364, + "grad_norm": 9.673685073852539, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8640177249908447, + "num_tokens": 854910757.0, + "step": 22404 + }, + { + "epoch": 2.850146291820379, + "ewc_loss": 0.08246732503175735, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004511380975600332, + "grad_norm": 9.742661476135254, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8519988059997559, + "num_tokens": 854945018.0, + "step": 22405 + }, + { + "epoch": 2.8502735020989696, + "ewc_loss": 0.08198350667953491, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004462998767849058, + "grad_norm": 9.628484725952148, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8578265905380249, + "num_tokens": 854988369.0, + "step": 22406 + }, + { + "epoch": 2.85040071237756, + "ewc_loss": 0.08243304491043091, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045079534174874425, + "grad_norm": 9.726911544799805, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8660784363746643, + "num_tokens": 855023935.0, + "step": 22407 + }, + { + "epoch": 2.8505279226561506, + "ewc_loss": 0.08196785300970078, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044614338548853993, + "grad_norm": 9.65683364868164, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8658719062805176, + "num_tokens": 855062002.0, + "step": 22408 + }, + { + "epoch": 2.850655132934741, + "ewc_loss": 0.08237158507108688, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004501806979533285, + "grad_norm": 9.71203327178955, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8669481873512268, + "num_tokens": 855101257.0, + "step": 22409 + }, + { + "epoch": 2.8507823432133317, + "ewc_loss": 0.08211980760097504, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004476629546843469, + "grad_norm": 9.701949119567871, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8666033744812012, + "num_tokens": 855135834.0, + "step": 22410 + }, + { + "epoch": 2.850909553491922, + "ewc_loss": 0.08236442506313324, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004501091316342354, + "grad_norm": 9.725372314453125, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8659161925315857, + "num_tokens": 855174156.0, + "step": 22411 + }, + { + "epoch": 2.8510367637705127, + "ewc_loss": 0.08223479986190796, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044881278881803155, + "grad_norm": 9.685873031616211, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.875003457069397, + "num_tokens": 855207698.0, + "step": 22412 + }, + { + "epoch": 2.8511639740491033, + "ewc_loss": 0.08210383355617523, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004475032037589699, + "grad_norm": 9.6728515625, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8653355240821838, + "num_tokens": 855240636.0, + "step": 22413 + }, + { + "epoch": 2.851291184327694, + "ewc_loss": 0.08243420720100403, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004508068668656051, + "grad_norm": 9.722665786743164, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8676216006278992, + "num_tokens": 855279923.0, + "step": 22414 + }, + { + "epoch": 2.8514183946062843, + "ewc_loss": 0.08211751282215118, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044763993355445564, + "grad_norm": 9.625412940979004, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8532111644744873, + "num_tokens": 855315460.0, + "step": 22415 + }, + { + "epoch": 2.851545604884875, + "ewc_loss": 0.08241969347000122, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045066175516694784, + "grad_norm": 9.689179420471191, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.87258380651474, + "num_tokens": 855351548.0, + "step": 22416 + }, + { + "epoch": 2.8516728151634654, + "ewc_loss": 0.08198714256286621, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044633628567680717, + "grad_norm": 9.675305366516113, + "learning_rate": 1e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8457597494125366, + "num_tokens": 855390715.0, + "step": 22417 + }, + { + "epoch": 2.8518000254420555, + "ewc_loss": 0.08232090622186661, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004496739129535854, + "grad_norm": 9.668657302856445, + "learning_rate": 1e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8509141206741333, + "num_tokens": 855431997.0, + "step": 22418 + }, + { + "epoch": 2.8519272357206464, + "ewc_loss": 0.08227188885211945, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004491837462410331, + "grad_norm": 9.675352096557617, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8599288463592529, + "num_tokens": 855470242.0, + "step": 22419 + }, + { + "epoch": 2.8520544459992365, + "ewc_loss": 0.08228523284196854, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044931715819984674, + "grad_norm": 9.668013572692871, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8521881103515625, + "num_tokens": 855508901.0, + "step": 22420 + }, + { + "epoch": 2.8521816562778275, + "ewc_loss": 0.08222988247871399, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004487637197598815, + "grad_norm": 9.707786560058594, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8616510033607483, + "num_tokens": 855545290.0, + "step": 22421 + }, + { + "epoch": 2.8523088665564176, + "ewc_loss": 0.08227526396512985, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044921747758053243, + "grad_norm": 9.623821258544922, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8602331876754761, + "num_tokens": 855579567.0, + "step": 22422 + }, + { + "epoch": 2.852436076835008, + "ewc_loss": 0.08238593488931656, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045032420894131064, + "grad_norm": 9.70252513885498, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8583526015281677, + "num_tokens": 855621117.0, + "step": 22423 + }, + { + "epoch": 2.8525632871135986, + "ewc_loss": 0.0825524777173996, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044710683869197965, + "grad_norm": 9.601747512817383, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8690245747566223, + "num_tokens": 855660494.0, + "step": 22424 + }, + { + "epoch": 2.852690497392189, + "ewc_loss": 0.08266393840312958, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045310426503419876, + "grad_norm": 9.739895820617676, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8632658123970032, + "num_tokens": 855694043.0, + "step": 22425 + }, + { + "epoch": 2.8528177076707797, + "ewc_loss": 0.08189143240451813, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004453791189007461, + "grad_norm": 9.54602336883545, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8504530191421509, + "num_tokens": 855734615.0, + "step": 22426 + }, + { + "epoch": 2.85294491794937, + "ewc_loss": 0.08285278081893921, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000454992608865723, + "grad_norm": 9.7647123336792, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8758478164672852, + "num_tokens": 855771141.0, + "step": 22427 + }, + { + "epoch": 2.8530721282279607, + "ewc_loss": 0.08243923634290695, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004459743795450777, + "grad_norm": 9.574280738830566, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8624494075775146, + "num_tokens": 855806877.0, + "step": 22428 + }, + { + "epoch": 2.8531993385065513, + "ewc_loss": 0.08302624523639679, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045672725536860526, + "grad_norm": 9.762308120727539, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8784034252166748, + "num_tokens": 855840076.0, + "step": 22429 + }, + { + "epoch": 2.853326548785142, + "ewc_loss": 0.08197012543678284, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044616605737246573, + "grad_norm": 9.544632911682129, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8757859468460083, + "num_tokens": 855877972.0, + "step": 22430 + }, + { + "epoch": 2.8534537590637323, + "ewc_loss": 0.08301563560962677, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004566212301142514, + "grad_norm": 9.7759370803833, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8593137860298157, + "num_tokens": 855917424.0, + "step": 22431 + }, + { + "epoch": 2.853580969342323, + "ewc_loss": 0.08252428472042084, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004468248807825148, + "grad_norm": 9.590513229370117, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8549535274505615, + "num_tokens": 855954853.0, + "step": 22432 + }, + { + "epoch": 2.8537081796209134, + "ewc_loss": 0.08354566991329193, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004570387245621532, + "grad_norm": 9.807859420776367, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8677316904067993, + "num_tokens": 855993300.0, + "step": 22433 + }, + { + "epoch": 2.853835389899504, + "ewc_loss": 0.08240862935781479, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044566832366399467, + "grad_norm": 9.53487777709961, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8677769899368286, + "num_tokens": 856035307.0, + "step": 22434 + }, + { + "epoch": 2.8539626001780944, + "ewc_loss": 0.08369330316781998, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004585150454659015, + "grad_norm": 9.823732376098633, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8637016415596008, + "num_tokens": 856071801.0, + "step": 22435 + }, + { + "epoch": 2.854089810456685, + "ewc_loss": 0.08179758489131927, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004444406949914992, + "grad_norm": 9.505528450012207, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8541181087493896, + "num_tokens": 856112035.0, + "step": 22436 + }, + { + "epoch": 2.8542170207352755, + "ewc_loss": 0.08333240449428558, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045978892012499273, + "grad_norm": 9.923480987548828, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8700974583625793, + "num_tokens": 856148275.0, + "step": 22437 + }, + { + "epoch": 2.854344231013866, + "ewc_loss": 0.08163999021053314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004428646934684366, + "grad_norm": 9.754439353942871, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8668544292449951, + "num_tokens": 856180330.0, + "step": 22438 + }, + { + "epoch": 2.8544714412924566, + "ewc_loss": 0.08281609416007996, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045462578418664634, + "grad_norm": 15.158933639526367, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8507131338119507, + "num_tokens": 856219354.0, + "step": 22439 + }, + { + "epoch": 2.854598651571047, + "ewc_loss": 0.08935265243053436, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005151085206307471, + "grad_norm": 10.318187713623047, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8748597502708435, + "num_tokens": 856257583.0, + "step": 22440 + }, + { + "epoch": 2.8547258618496376, + "ewc_loss": 0.08617355674505234, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004882003995589912, + "grad_norm": 10.471973419189453, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8473657369613647, + "num_tokens": 856298056.0, + "step": 22441 + }, + { + "epoch": 2.854853072128228, + "ewc_loss": 0.08307328075170517, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045231482363305986, + "grad_norm": 9.909323692321777, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8608092069625854, + "num_tokens": 856334109.0, + "step": 22442 + }, + { + "epoch": 2.8549802824068182, + "ewc_loss": 0.08596135675907135, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004860784101765603, + "grad_norm": 10.23346996307373, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8668961524963379, + "num_tokens": 856376655.0, + "step": 22443 + }, + { + "epoch": 2.855107492685409, + "ewc_loss": 0.08250856399536133, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045155046973377466, + "grad_norm": 9.7774076461792, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8818491697311401, + "num_tokens": 856409926.0, + "step": 22444 + }, + { + "epoch": 2.8552347029639993, + "ewc_loss": 0.08448486775159836, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00047131351311691105, + "grad_norm": 10.024535179138184, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.854487955570221, + "num_tokens": 856451471.0, + "step": 22445 + }, + { + "epoch": 2.8553619132425903, + "ewc_loss": 0.0824769139289856, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004512339364737272, + "grad_norm": 9.790518760681152, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8731341361999512, + "num_tokens": 856483237.0, + "step": 22446 + }, + { + "epoch": 2.8554891235211803, + "ewc_loss": 0.08389675617218018, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046543238568119705, + "grad_norm": 10.00768756866455, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.856225311756134, + "num_tokens": 856521278.0, + "step": 22447 + }, + { + "epoch": 2.855616333799771, + "ewc_loss": 0.08224019408226013, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044886674731969833, + "grad_norm": 9.640225410461426, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8613991141319275, + "num_tokens": 856565048.0, + "step": 22448 + }, + { + "epoch": 2.8557435440783614, + "ewc_loss": 0.08369319140911102, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046339674736373127, + "grad_norm": 9.97968578338623, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8688020706176758, + "num_tokens": 856603756.0, + "step": 22449 + }, + { + "epoch": 2.855870754356952, + "ewc_loss": 0.08212310075759888, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044769590022042394, + "grad_norm": 9.667322158813477, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8708032965660095, + "num_tokens": 856640677.0, + "step": 22450 + }, + { + "epoch": 2.8559979646355425, + "ewc_loss": 0.08342278003692627, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004606925940606743, + "grad_norm": 9.966485977172852, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8645855784416199, + "num_tokens": 856679982.0, + "step": 22451 + }, + { + "epoch": 2.856125174914133, + "ewc_loss": 0.08195577561855316, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044602257548831403, + "grad_norm": 9.674434661865234, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8522456884384155, + "num_tokens": 856720621.0, + "step": 22452 + }, + { + "epoch": 2.8562523851927235, + "ewc_loss": 0.08324335515499115, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045889837201684713, + "grad_norm": 9.856831550598145, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8736813068389893, + "num_tokens": 856757172.0, + "step": 22453 + }, + { + "epoch": 2.856379595471314, + "ewc_loss": 0.08212687075138092, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004452921566553414, + "grad_norm": 9.605203628540039, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8615230917930603, + "num_tokens": 856796876.0, + "step": 22454 + }, + { + "epoch": 2.8565068057499046, + "ewc_loss": 0.0831349566578865, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045781442895531654, + "grad_norm": 9.917450904846191, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.864942193031311, + "num_tokens": 856831148.0, + "step": 22455 + }, + { + "epoch": 2.856634016028495, + "ewc_loss": 0.08228927850723267, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000444474775576964, + "grad_norm": 9.77326488494873, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8605685830116272, + "num_tokens": 856867091.0, + "step": 22456 + }, + { + "epoch": 2.8567612263070856, + "ewc_loss": 0.08273789286613464, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004538437642622739, + "grad_norm": 9.793303489685059, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8535608053207397, + "num_tokens": 856906118.0, + "step": 22457 + }, + { + "epoch": 2.856888436585676, + "ewc_loss": 0.08221606910228729, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044862550566904247, + "grad_norm": 9.681386947631836, + "learning_rate": 1e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8386460542678833, + "num_tokens": 856942694.0, + "step": 22458 + }, + { + "epoch": 2.8570156468642667, + "ewc_loss": 0.08248061686754227, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004512710147537291, + "grad_norm": 9.770071983337402, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8554422855377197, + "num_tokens": 856977530.0, + "step": 22459 + }, + { + "epoch": 2.857142857142857, + "ewc_loss": 0.08215446025133133, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044800943578593433, + "grad_norm": 9.671109199523926, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8671287894248962, + "num_tokens": 857012417.0, + "step": 22460 + }, + { + "epoch": 2.8572700674214477, + "ewc_loss": 0.08262532949447632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004527181154116988, + "grad_norm": 9.761140823364258, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8580710887908936, + "num_tokens": 857053556.0, + "step": 22461 + }, + { + "epoch": 2.8573972777000383, + "ewc_loss": 0.08221283555030823, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044859322952106595, + "grad_norm": 9.696651458740234, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8691610097885132, + "num_tokens": 857094514.0, + "step": 22462 + }, + { + "epoch": 2.857524487978629, + "ewc_loss": 0.08248867094516754, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004513515450526029, + "grad_norm": 9.712512016296387, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8665697574615479, + "num_tokens": 857133670.0, + "step": 22463 + }, + { + "epoch": 2.8576516982572193, + "ewc_loss": 0.08290205150842667, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004506025288719684, + "grad_norm": 9.755324363708496, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8566399812698364, + "num_tokens": 857167934.0, + "step": 22464 + }, + { + "epoch": 2.85777890853581, + "ewc_loss": 0.08232224732637405, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044968732981942594, + "grad_norm": 9.789291381835938, + "learning_rate": 1e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8471624851226807, + "num_tokens": 857208263.0, + "step": 22465 + }, + { + "epoch": 2.8579061188144, + "ewc_loss": 0.08224151283502579, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004488799604587257, + "grad_norm": 9.607938766479492, + "learning_rate": 1e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8466458916664124, + "num_tokens": 857255107.0, + "step": 22466 + }, + { + "epoch": 2.858033329092991, + "ewc_loss": 0.08264109492301941, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045287582906894386, + "grad_norm": 9.773895263671875, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8622292280197144, + "num_tokens": 857298368.0, + "step": 22467 + }, + { + "epoch": 2.858160539371581, + "ewc_loss": 0.08205650746822357, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004470299172680825, + "grad_norm": 9.656611442565918, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8643366694450378, + "num_tokens": 857336899.0, + "step": 22468 + }, + { + "epoch": 2.858287749650172, + "ewc_loss": 0.08271491527557373, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000453613989520818, + "grad_norm": 9.79254150390625, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8680567741394043, + "num_tokens": 857378482.0, + "step": 22469 + }, + { + "epoch": 2.858414959928762, + "ewc_loss": 0.08209936320781708, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004474584711715579, + "grad_norm": 9.708242416381836, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8736168146133423, + "num_tokens": 857413203.0, + "step": 22470 + }, + { + "epoch": 2.858542170207353, + "ewc_loss": 0.08238199353218079, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004502848023548722, + "grad_norm": 9.814263343811035, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8622146844863892, + "num_tokens": 857446706.0, + "step": 22471 + }, + { + "epoch": 2.858669380485943, + "ewc_loss": 0.08182387053966522, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044470353168435395, + "grad_norm": 9.66329288482666, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8686644434928894, + "num_tokens": 857483415.0, + "step": 22472 + }, + { + "epoch": 2.8587965907645336, + "ewc_loss": 0.08244171738624573, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004508820129558444, + "grad_norm": 9.783143043518066, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8698166608810425, + "num_tokens": 857525323.0, + "step": 22473 + }, + { + "epoch": 2.858923801043124, + "ewc_loss": 0.08217521011829376, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004433341382537037, + "grad_norm": 9.678218841552734, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8802993297576904, + "num_tokens": 857564400.0, + "step": 22474 + }, + { + "epoch": 2.8590510113217147, + "ewc_loss": 0.08241192996501923, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004481427895370871, + "grad_norm": 9.681546211242676, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8628053665161133, + "num_tokens": 857602756.0, + "step": 22475 + }, + { + "epoch": 2.859178221600305, + "ewc_loss": 0.08206348121166229, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044709970825351775, + "grad_norm": 9.741300582885742, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8630599975585938, + "num_tokens": 857636875.0, + "step": 22476 + }, + { + "epoch": 2.8593054318788957, + "ewc_loss": 0.08183027803897858, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004447676183190197, + "grad_norm": 9.637157440185547, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8818557858467102, + "num_tokens": 857671950.0, + "step": 22477 + }, + { + "epoch": 2.8594326421574863, + "ewc_loss": 0.08245767652988434, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004510415601544082, + "grad_norm": 9.747370719909668, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8788498640060425, + "num_tokens": 857705274.0, + "step": 22478 + }, + { + "epoch": 2.859559852436077, + "ewc_loss": 0.08235642313957214, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004451462300494313, + "grad_norm": 9.661847114562988, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.858264684677124, + "num_tokens": 857743730.0, + "step": 22479 + }, + { + "epoch": 2.8596870627146673, + "ewc_loss": 0.08234681934118271, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044993305345997214, + "grad_norm": 9.726248741149902, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8606269955635071, + "num_tokens": 857784055.0, + "step": 22480 + }, + { + "epoch": 2.859814272993258, + "ewc_loss": 0.08259078860282898, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004474899615161121, + "grad_norm": 9.725629806518555, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8702723383903503, + "num_tokens": 857822431.0, + "step": 22481 + }, + { + "epoch": 2.8599414832718484, + "ewc_loss": 0.08273753523826599, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004489574348554015, + "grad_norm": 9.75275707244873, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8762527108192444, + "num_tokens": 857859258.0, + "step": 22482 + }, + { + "epoch": 2.860068693550439, + "ewc_loss": 0.08207046985626221, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044716952834278345, + "grad_norm": 9.729085922241211, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8561263680458069, + "num_tokens": 857901209.0, + "step": 22483 + }, + { + "epoch": 2.8601959038290294, + "ewc_loss": 0.08204114437103271, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004468763363547623, + "grad_norm": 9.764545440673828, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8764908909797668, + "num_tokens": 857935563.0, + "step": 22484 + }, + { + "epoch": 2.86032311410762, + "ewc_loss": 0.08210960030555725, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044756088755093515, + "grad_norm": 9.751531600952148, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8546386957168579, + "num_tokens": 857966777.0, + "step": 22485 + }, + { + "epoch": 2.8604503243862105, + "ewc_loss": 0.08202709257602692, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004467358230613172, + "grad_norm": 9.764755249023438, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.871285080909729, + "num_tokens": 858005078.0, + "step": 22486 + }, + { + "epoch": 2.860577534664801, + "ewc_loss": 0.08194150030612946, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044587987940758467, + "grad_norm": 9.670160293579102, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8529002666473389, + "num_tokens": 858049288.0, + "step": 22487 + }, + { + "epoch": 2.8607047449433916, + "ewc_loss": 0.08210007101297379, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004474655434023589, + "grad_norm": 9.741683006286621, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8865562677383423, + "num_tokens": 858089834.0, + "step": 22488 + }, + { + "epoch": 2.860831955221982, + "ewc_loss": 0.08239925652742386, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044557458022609353, + "grad_norm": 9.73948860168457, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.869922399520874, + "num_tokens": 858124152.0, + "step": 22489 + }, + { + "epoch": 2.8609591655005726, + "ewc_loss": 0.08194875717163086, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004459523770492524, + "grad_norm": 9.689980506896973, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8697731494903564, + "num_tokens": 858163202.0, + "step": 22490 + }, + { + "epoch": 2.8610863757791627, + "ewc_loss": 0.08263924717903137, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004479744820855558, + "grad_norm": 9.727067947387695, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8855785131454468, + "num_tokens": 858199173.0, + "step": 22491 + }, + { + "epoch": 2.8612135860577537, + "ewc_loss": 0.08238089829683304, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004453910223674029, + "grad_norm": 9.745153427124023, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8632259368896484, + "num_tokens": 858239693.0, + "step": 22492 + }, + { + "epoch": 2.8613407963363438, + "ewc_loss": 0.08199340105056763, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004463988880161196, + "grad_norm": 9.719230651855469, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.871179461479187, + "num_tokens": 858280720.0, + "step": 22493 + }, + { + "epoch": 2.8614680066149347, + "ewc_loss": 0.082216277718544, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044862765935249627, + "grad_norm": 9.738414764404297, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8775423169136047, + "num_tokens": 858314145.0, + "step": 22494 + }, + { + "epoch": 2.861595216893525, + "ewc_loss": 0.08204255998134613, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004468904226087034, + "grad_norm": 9.674491882324219, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8662726879119873, + "num_tokens": 858355177.0, + "step": 22495 + }, + { + "epoch": 2.861722427172116, + "ewc_loss": 0.08231917023658752, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044965650886297226, + "grad_norm": 9.758284568786621, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8686138391494751, + "num_tokens": 858393070.0, + "step": 22496 + }, + { + "epoch": 2.861849637450706, + "ewc_loss": 0.08221884071826935, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044621183769777417, + "grad_norm": 9.730245590209961, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8649591207504272, + "num_tokens": 858431827.0, + "step": 22497 + }, + { + "epoch": 2.8619768477292964, + "ewc_loss": 0.0820775255560875, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044724010513164103, + "grad_norm": 9.666747093200684, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8630859851837158, + "num_tokens": 858474990.0, + "step": 22498 + }, + { + "epoch": 2.862104058007887, + "ewc_loss": 0.08233171701431274, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004497820627875626, + "grad_norm": 9.68533706665039, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8779792785644531, + "num_tokens": 858514698.0, + "step": 22499 + }, + { + "epoch": 2.8622312682864774, + "ewc_loss": 0.08224096894264221, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044887454714626074, + "grad_norm": 9.781393051147461, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8654292821884155, + "num_tokens": 858557601.0, + "step": 22500 + }, + { + "epoch": 2.862358478565068, + "ewc_loss": 0.0820351094007492, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044681597501039505, + "grad_norm": 9.670950889587402, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8742693662643433, + "num_tokens": 858596830.0, + "step": 22501 + }, + { + "epoch": 2.8624856888436585, + "ewc_loss": 0.08245126903057098, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004509775317274034, + "grad_norm": 9.806401252746582, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.855271577835083, + "num_tokens": 858637555.0, + "step": 22502 + }, + { + "epoch": 2.862612899122249, + "ewc_loss": 0.08230267465114594, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044705020263791084, + "grad_norm": 9.761680603027344, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8843452334403992, + "num_tokens": 858672813.0, + "step": 22503 + }, + { + "epoch": 2.8627401094008396, + "ewc_loss": 0.08224718272686005, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044893668382428586, + "grad_norm": 9.671825408935547, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8732951879501343, + "num_tokens": 858710743.0, + "step": 22504 + }, + { + "epoch": 2.86286731967943, + "ewc_loss": 0.08240631222724915, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004505279357545078, + "grad_norm": 9.85034465789795, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8777481913566589, + "num_tokens": 858739486.0, + "step": 22505 + }, + { + "epoch": 2.8629945299580206, + "ewc_loss": 0.08171480894088745, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004436129820533097, + "grad_norm": 9.655789375305176, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8462779521942139, + "num_tokens": 858780678.0, + "step": 22506 + }, + { + "epoch": 2.863121740236611, + "ewc_loss": 0.08263857662677765, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004528505669441074, + "grad_norm": 9.801959991455078, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8618379235267639, + "num_tokens": 858826112.0, + "step": 22507 + }, + { + "epoch": 2.8632489505152017, + "ewc_loss": 0.08177037537097931, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004441685741767287, + "grad_norm": 9.688370704650879, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8776742219924927, + "num_tokens": 858862158.0, + "step": 22508 + }, + { + "epoch": 2.863376160793792, + "ewc_loss": 0.08254332095384598, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045189805678091943, + "grad_norm": 9.817187309265137, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8710014224052429, + "num_tokens": 858905270.0, + "step": 22509 + }, + { + "epoch": 2.8635033710723827, + "ewc_loss": 0.08180370926856995, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044450192945078015, + "grad_norm": 9.669392585754395, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8596134185791016, + "num_tokens": 858941802.0, + "step": 22510 + }, + { + "epoch": 2.8636305813509733, + "ewc_loss": 0.08254632353782654, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004519280628301203, + "grad_norm": 9.899153709411621, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8682878613471985, + "num_tokens": 858974672.0, + "step": 22511 + }, + { + "epoch": 2.863757791629564, + "ewc_loss": 0.08161155879497528, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044258046546019614, + "grad_norm": 9.624505996704102, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8635240197181702, + "num_tokens": 859015305.0, + "step": 22512 + }, + { + "epoch": 2.8638850019081543, + "ewc_loss": 0.08284065127372742, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004548713914118707, + "grad_norm": 9.97083854675293, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8576749563217163, + "num_tokens": 859050646.0, + "step": 22513 + }, + { + "epoch": 2.864012212186745, + "ewc_loss": 0.08123338222503662, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004387986264191568, + "grad_norm": 9.579507827758789, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8745294213294983, + "num_tokens": 859090139.0, + "step": 22514 + }, + { + "epoch": 2.8641394224653354, + "ewc_loss": 0.08321153372526169, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045858017983846366, + "grad_norm": 9.991557121276855, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8699673414230347, + "num_tokens": 859130661.0, + "step": 22515 + }, + { + "epoch": 2.8642666327439255, + "ewc_loss": 0.0812886506319046, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043690993334166706, + "grad_norm": 9.55067253112793, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8641552329063416, + "num_tokens": 859166399.0, + "step": 22516 + }, + { + "epoch": 2.8643938430225164, + "ewc_loss": 0.08336535096168518, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004601183463819325, + "grad_norm": 10.021477699279785, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8658304214477539, + "num_tokens": 859208349.0, + "step": 22517 + }, + { + "epoch": 2.8645210533011065, + "ewc_loss": 0.08106400817632675, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043710492900572717, + "grad_norm": 9.558444023132324, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8684914112091064, + "num_tokens": 859243784.0, + "step": 22518 + }, + { + "epoch": 2.8646482635796975, + "ewc_loss": 0.08374960720539093, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046151946298778057, + "grad_norm": 10.071540832519531, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8511815667152405, + "num_tokens": 859278345.0, + "step": 22519 + }, + { + "epoch": 2.8647754738582876, + "ewc_loss": 0.08112289756536484, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004376938159111887, + "grad_norm": 9.520462036132812, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8554028272628784, + "num_tokens": 859317454.0, + "step": 22520 + }, + { + "epoch": 2.864902684136878, + "ewc_loss": 0.08391017466783524, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046556658344343305, + "grad_norm": 10.125547409057617, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8612047433853149, + "num_tokens": 859352076.0, + "step": 22521 + }, + { + "epoch": 2.8650298944154686, + "ewc_loss": 0.08114132285118103, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00043787804315797985, + "grad_norm": 9.568522453308105, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8726010918617249, + "num_tokens": 859389957.0, + "step": 22522 + }, + { + "epoch": 2.865157104694059, + "ewc_loss": 0.08358331024646759, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004622979322448373, + "grad_norm": 10.089365005493164, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8657357692718506, + "num_tokens": 859425167.0, + "step": 22523 + }, + { + "epoch": 2.8652843149726497, + "ewc_loss": 0.08127959072589874, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004392607370391488, + "grad_norm": 9.557185173034668, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8621739149093628, + "num_tokens": 859470999.0, + "step": 22524 + }, + { + "epoch": 2.86541152525124, + "ewc_loss": 0.08354964852333069, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004619613755494356, + "grad_norm": 9.998411178588867, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8715333342552185, + "num_tokens": 859503763.0, + "step": 22525 + }, + { + "epoch": 2.8655387355298307, + "ewc_loss": 0.08168424665927887, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004408658714964986, + "grad_norm": 9.625750541687012, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.875377893447876, + "num_tokens": 859537739.0, + "step": 22526 + }, + { + "epoch": 2.8656659458084213, + "ewc_loss": 0.0832839161157608, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045930405030958354, + "grad_norm": 10.031264305114746, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8763274550437927, + "num_tokens": 859573497.0, + "step": 22527 + }, + { + "epoch": 2.865793156087012, + "ewc_loss": 0.08149169385433197, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044138176599517465, + "grad_norm": 9.662896156311035, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8617401719093323, + "num_tokens": 859606997.0, + "step": 22528 + }, + { + "epoch": 2.8659203663656023, + "ewc_loss": 0.08297158777713776, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045618077274411917, + "grad_norm": 9.900513648986816, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8527912497520447, + "num_tokens": 859642278.0, + "step": 22529 + }, + { + "epoch": 2.866047576644193, + "ewc_loss": 0.08171403408050537, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004436051822267473, + "grad_norm": 9.607178688049316, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8775551319122314, + "num_tokens": 859684728.0, + "step": 22530 + }, + { + "epoch": 2.8661747869227834, + "ewc_loss": 0.08319796621799469, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045600312296301126, + "grad_norm": 9.936019897460938, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8584873080253601, + "num_tokens": 859720946.0, + "step": 22531 + }, + { + "epoch": 2.866301997201374, + "ewc_loss": 0.08190996944904327, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044312310637906194, + "grad_norm": 9.66006851196289, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.861653208732605, + "num_tokens": 859752605.0, + "step": 22532 + }, + { + "epoch": 2.8664292074799644, + "ewc_loss": 0.08286680281162262, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045513283112086356, + "grad_norm": 9.922295570373535, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8773494958877563, + "num_tokens": 859784206.0, + "step": 22533 + }, + { + "epoch": 2.866556417758555, + "ewc_loss": 0.08170270919799805, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004434919392224401, + "grad_norm": 9.690583229064941, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8626226186752319, + "num_tokens": 859828142.0, + "step": 22534 + }, + { + "epoch": 2.8666836280371455, + "ewc_loss": 0.0826791375875473, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004532562452368438, + "grad_norm": 9.84306526184082, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8613193035125732, + "num_tokens": 859870032.0, + "step": 22535 + }, + { + "epoch": 2.866810838315736, + "ewc_loss": 0.08210019022226334, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044502533273771405, + "grad_norm": 9.668160438537598, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.855540931224823, + "num_tokens": 859911417.0, + "step": 22536 + }, + { + "epoch": 2.8669380485943265, + "ewc_loss": 0.08268586546182632, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045332350418902934, + "grad_norm": 9.928369522094727, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8774145841598511, + "num_tokens": 859953929.0, + "step": 22537 + }, + { + "epoch": 2.867065258872917, + "ewc_loss": 0.08210083842277527, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004425904480740428, + "grad_norm": 9.656312942504883, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.868711531162262, + "num_tokens": 859985402.0, + "step": 22538 + }, + { + "epoch": 2.8671924691515076, + "ewc_loss": 0.08268168568611145, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045328171108849347, + "grad_norm": 9.912520408630371, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8635499477386475, + "num_tokens": 860017374.0, + "step": 22539 + }, + { + "epoch": 2.867319679430098, + "ewc_loss": 0.08149370551109314, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004414019058458507, + "grad_norm": 9.625574111938477, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8758499622344971, + "num_tokens": 860058217.0, + "step": 22540 + }, + { + "epoch": 2.867446889708688, + "ewc_loss": 0.0826750248670578, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.000453215092420578, + "grad_norm": 9.893913269042969, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8487287759780884, + "num_tokens": 860092317.0, + "step": 22541 + }, + { + "epoch": 2.867574099987279, + "ewc_loss": 0.08147686719894409, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004412335401866585, + "grad_norm": 9.665731430053711, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8628552556037903, + "num_tokens": 860125067.0, + "step": 22542 + }, + { + "epoch": 2.8677013102658693, + "ewc_loss": 0.08298179507255554, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045384137774817646, + "grad_norm": 9.902998924255371, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8794971704483032, + "num_tokens": 860163805.0, + "step": 22543 + }, + { + "epoch": 2.8678285205444602, + "ewc_loss": 0.08177432417869568, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044176672236062586, + "grad_norm": 9.695433616638184, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8523110151290894, + "num_tokens": 860200178.0, + "step": 22544 + }, + { + "epoch": 2.8679557308230503, + "ewc_loss": 0.08271860331296921, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045120948925614357, + "grad_norm": 9.803396224975586, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8645460605621338, + "num_tokens": 860235283.0, + "step": 22545 + }, + { + "epoch": 2.868082941101641, + "ewc_loss": 0.08209178596735, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044494130997918546, + "grad_norm": 9.666101455688477, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.883866548538208, + "num_tokens": 860276776.0, + "step": 22546 + }, + { + "epoch": 2.8682101513802314, + "ewc_loss": 0.08262297511100769, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045025322469882667, + "grad_norm": 9.722074508666992, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8710016012191772, + "num_tokens": 860318492.0, + "step": 22547 + }, + { + "epoch": 2.868337361658822, + "ewc_loss": 0.08237811923027039, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044780466123484075, + "grad_norm": 9.772262573242188, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8515033721923828, + "num_tokens": 860358556.0, + "step": 22548 + }, + { + "epoch": 2.8684645719374124, + "ewc_loss": 0.0822126492857933, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00044859133777208626, + "grad_norm": 9.837250709533691, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8547869920730591, + "num_tokens": 860391114.0, + "step": 22549 + }, + { + "epoch": 2.868591782216003, + "ewc_loss": 0.0822397992014885, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044642144348472357, + "grad_norm": 9.642560005187988, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8611070513725281, + "num_tokens": 860430696.0, + "step": 22550 + }, + { + "epoch": 2.8687189924945935, + "ewc_loss": 0.08281786739826202, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004522021336015314, + "grad_norm": 9.81827163696289, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8731481432914734, + "num_tokens": 860471055.0, + "step": 22551 + }, + { + "epoch": 2.868846202773184, + "ewc_loss": 0.08202289044857025, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044425230589695275, + "grad_norm": 9.70301342010498, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8599497079849243, + "num_tokens": 860512472.0, + "step": 22552 + }, + { + "epoch": 2.8689734130517746, + "ewc_loss": 0.08283372223377228, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045236069126985967, + "grad_norm": 9.860198974609375, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8601862192153931, + "num_tokens": 860541014.0, + "step": 22553 + }, + { + "epoch": 2.869100623330365, + "ewc_loss": 0.08191046118736267, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044312799582257867, + "grad_norm": 9.609073638916016, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8811028003692627, + "num_tokens": 860585025.0, + "step": 22554 + }, + { + "epoch": 2.8692278336089556, + "ewc_loss": 0.08298248797655106, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004538483335636556, + "grad_norm": 9.891166687011719, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.873318076133728, + "num_tokens": 860616361.0, + "step": 22555 + }, + { + "epoch": 2.869355043887546, + "ewc_loss": 0.081759512424469, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044161861296743155, + "grad_norm": 9.609637260437012, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8564473390579224, + "num_tokens": 860655976.0, + "step": 22556 + }, + { + "epoch": 2.8694822541661367, + "ewc_loss": 0.08319069445133209, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000455930334283039, + "grad_norm": 9.905938148498535, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8529741764068604, + "num_tokens": 860703949.0, + "step": 22557 + }, + { + "epoch": 2.869609464444727, + "ewc_loss": 0.08180888742208481, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004421123303472996, + "grad_norm": 9.670064926147461, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8656172752380371, + "num_tokens": 860739935.0, + "step": 22558 + }, + { + "epoch": 2.8697366747233177, + "ewc_loss": 0.08306244760751724, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045464790309779346, + "grad_norm": 9.845891952514648, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8691751956939697, + "num_tokens": 860782483.0, + "step": 22559 + }, + { + "epoch": 2.8698638850019083, + "ewc_loss": 0.08225616067647934, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004441436321940273, + "grad_norm": 9.659492492675781, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8549069762229919, + "num_tokens": 860816821.0, + "step": 22560 + }, + { + "epoch": 2.869991095280499, + "ewc_loss": 0.08296526223421097, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004536760679911822, + "grad_norm": 9.893102645874023, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8542327284812927, + "num_tokens": 860851313.0, + "step": 22561 + }, + { + "epoch": 2.8701183055590893, + "ewc_loss": 0.08190547674894333, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004430781991686672, + "grad_norm": 9.648200035095215, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8737101554870605, + "num_tokens": 860890540.0, + "step": 22562 + }, + { + "epoch": 2.87024551583768, + "ewc_loss": 0.08306538313627243, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004546772688627243, + "grad_norm": 9.918731689453125, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8575150966644287, + "num_tokens": 860929686.0, + "step": 22563 + }, + { + "epoch": 2.87037272611627, + "ewc_loss": 0.08173947781324387, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004414182039909065, + "grad_norm": 9.573699951171875, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8672338724136353, + "num_tokens": 860968186.0, + "step": 22564 + }, + { + "epoch": 2.870499936394861, + "ewc_loss": 0.08317780494689941, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045580146252177656, + "grad_norm": 9.81870174407959, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.86143559217453, + "num_tokens": 861007252.0, + "step": 22565 + }, + { + "epoch": 2.870627146673451, + "ewc_loss": 0.08195832371711731, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004436066374182701, + "grad_norm": 9.627056121826172, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8671079277992249, + "num_tokens": 861049898.0, + "step": 22566 + }, + { + "epoch": 2.870754356952042, + "ewc_loss": 0.08321452885866165, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004561687237583101, + "grad_norm": 9.90245246887207, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8721314668655396, + "num_tokens": 861087711.0, + "step": 22567 + }, + { + "epoch": 2.870881567230632, + "ewc_loss": 0.08153489232063293, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004418137832544744, + "grad_norm": 9.560800552368164, + "learning_rate": 1e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8450954556465149, + "num_tokens": 861128566.0, + "step": 22568 + }, + { + "epoch": 2.871008777509223, + "ewc_loss": 0.08313848078250885, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00045784961548633873, + "grad_norm": 9.989274024963379, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8704035878181458, + "num_tokens": 861156299.0, + "step": 22569 + }, + { + "epoch": 2.871135987787813, + "ewc_loss": 0.08166894316673279, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004407128435559571, + "grad_norm": 9.549649238586426, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8767507076263428, + "num_tokens": 861189376.0, + "step": 22570 + }, + { + "epoch": 2.8712631980664036, + "ewc_loss": 0.08400043845176697, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046158640179783106, + "grad_norm": 10.016398429870605, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8604964017868042, + "num_tokens": 861227156.0, + "step": 22571 + }, + { + "epoch": 2.871390408344994, + "ewc_loss": 0.08151675760746002, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004391910624690354, + "grad_norm": 9.499967575073242, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8655626773834229, + "num_tokens": 861268088.0, + "step": 22572 + }, + { + "epoch": 2.8715176186235847, + "ewc_loss": 0.08390848338603973, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004631082993000746, + "grad_norm": 9.951090812683105, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.866969108581543, + "num_tokens": 861308892.0, + "step": 22573 + }, + { + "epoch": 2.871644828902175, + "ewc_loss": 0.08153968304395676, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.0004418616881594062, + "grad_norm": 9.544145584106445, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8684816956520081, + "num_tokens": 861350108.0, + "step": 22574 + }, + { + "epoch": 2.8717720391807657, + "ewc_loss": 0.08361488580703735, + "ewc_loss_diag": 3.743171691894531e-05, + "ewc_loss_parallel": 0.00046261370880529284, + "grad_norm": 9.949809074401855, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8764069080352783, + "num_tokens": 861390553.0, + "step": 22575 + }, + { + "epoch": 2.8718992494593563, + "ewc_loss": 0.08219816535711288, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044356368016451597, + "grad_norm": 10.205047607421875, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8661341667175293, + "num_tokens": 861425878.0, + "step": 22576 + }, + { + "epoch": 2.872026459737947, + "ewc_loss": 0.08205446600914001, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044456805335357785, + "grad_norm": 9.634045600891113, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8806232213973999, + "num_tokens": 861460935.0, + "step": 22577 + }, + { + "epoch": 2.8721536700165373, + "ewc_loss": 0.08317112922668457, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045573472743853927, + "grad_norm": 9.837602615356445, + "learning_rate": 1e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8484102487564087, + "num_tokens": 861500446.0, + "step": 22578 + }, + { + "epoch": 2.872280880295128, + "ewc_loss": 0.08179299533367157, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004419533652253449, + "grad_norm": 9.621903419494629, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8527661561965942, + "num_tokens": 861538000.0, + "step": 22579 + }, + { + "epoch": 2.8724080905737184, + "ewc_loss": 0.08325508236885071, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004565742565318942, + "grad_norm": 9.878998756408691, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8678164482116699, + "num_tokens": 861578052.0, + "step": 22580 + }, + { + "epoch": 2.872535300852309, + "ewc_loss": 0.08188262581825256, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044284973409958184, + "grad_norm": 9.585591316223145, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8618433475494385, + "num_tokens": 861619065.0, + "step": 22581 + }, + { + "epoch": 2.8726625111308994, + "ewc_loss": 0.08325119316577911, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004565353156067431, + "grad_norm": 9.874378204345703, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8688265085220337, + "num_tokens": 861660112.0, + "step": 22582 + }, + { + "epoch": 2.87278972140949, + "ewc_loss": 0.08195430040359497, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044356644502840936, + "grad_norm": 9.563846588134766, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8681557178497314, + "num_tokens": 861699312.0, + "step": 22583 + }, + { + "epoch": 2.8729169316880805, + "ewc_loss": 0.08353669941425323, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004593904013745487, + "grad_norm": 9.917594909667969, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8827213644981384, + "num_tokens": 861735167.0, + "step": 22584 + }, + { + "epoch": 2.873044141966671, + "ewc_loss": 0.08184526115655899, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004424760409165174, + "grad_norm": 9.582513809204102, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8592506051063538, + "num_tokens": 861778594.0, + "step": 22585 + }, + { + "epoch": 2.8731713522452615, + "ewc_loss": 0.0834968090057373, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045899153337813914, + "grad_norm": 9.89084529876709, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8636302351951599, + "num_tokens": 861814754.0, + "step": 22586 + }, + { + "epoch": 2.873298562523852, + "ewc_loss": 0.0820370763540268, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044439415796659887, + "grad_norm": 9.638298034667969, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.849892258644104, + "num_tokens": 861849019.0, + "step": 22587 + }, + { + "epoch": 2.8734257728024426, + "ewc_loss": 0.08403979241847992, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045953859807923436, + "grad_norm": 9.994155883789062, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8615783452987671, + "num_tokens": 861889471.0, + "step": 22588 + }, + { + "epoch": 2.8735529830810327, + "ewc_loss": 0.08175067603588104, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000441530195530504, + "grad_norm": 9.532231330871582, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8578689694404602, + "num_tokens": 861926811.0, + "step": 22589 + }, + { + "epoch": 2.8736801933596237, + "ewc_loss": 0.08410652726888657, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046508872765116394, + "grad_norm": 10.012219429016113, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8632852435112, + "num_tokens": 861963628.0, + "step": 22590 + }, + { + "epoch": 2.8738074036382137, + "ewc_loss": 0.08175672590732574, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000441590731497854, + "grad_norm": 9.546165466308594, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8516888618469238, + "num_tokens": 862002915.0, + "step": 22591 + }, + { + "epoch": 2.8739346139168047, + "ewc_loss": 0.08407637476921082, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004647872119676322, + "grad_norm": 9.935200691223145, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8706260919570923, + "num_tokens": 862047276.0, + "step": 22592 + }, + { + "epoch": 2.874061824195395, + "ewc_loss": 0.08222275972366333, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004438096657395363, + "grad_norm": 9.57748794555664, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.872439980506897, + "num_tokens": 862091241.0, + "step": 22593 + }, + { + "epoch": 2.8741890344739858, + "ewc_loss": 0.08390051126480103, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046302852570079267, + "grad_norm": 9.972817420959473, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8681775331497192, + "num_tokens": 862126025.0, + "step": 22594 + }, + { + "epoch": 2.874316244752576, + "ewc_loss": 0.08211363852024078, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004451598506420851, + "grad_norm": 9.648828506469727, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8551300764083862, + "num_tokens": 862160822.0, + "step": 22595 + }, + { + "epoch": 2.8744434550311664, + "ewc_loss": 0.08376356214284897, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004616590740624815, + "grad_norm": 9.974932670593262, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8693103790283203, + "num_tokens": 862196808.0, + "step": 22596 + }, + { + "epoch": 2.874570665309757, + "ewc_loss": 0.08212773501873016, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004453007422853261, + "grad_norm": 9.69215202331543, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8490643501281738, + "num_tokens": 862233722.0, + "step": 22597 + }, + { + "epoch": 2.8746978755883474, + "ewc_loss": 0.08349200338125229, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045894348295405507, + "grad_norm": 9.890304565429688, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8823622465133667, + "num_tokens": 862271914.0, + "step": 22598 + }, + { + "epoch": 2.874825085866938, + "ewc_loss": 0.0822131484746933, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044615493970923126, + "grad_norm": 9.62725830078125, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8626013994216919, + "num_tokens": 862313696.0, + "step": 22599 + }, + { + "epoch": 2.8749522961455285, + "ewc_loss": 0.08342976868152618, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004583211266435683, + "grad_norm": 9.926300048828125, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8592531085014343, + "num_tokens": 862355366.0, + "step": 22600 + }, + { + "epoch": 2.875079506424119, + "ewc_loss": 0.08222484588623047, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044627190800383687, + "grad_norm": 9.64663028717041, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.856743335723877, + "num_tokens": 862395630.0, + "step": 22601 + }, + { + "epoch": 2.8752067167027096, + "ewc_loss": 0.08337172865867615, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045774070895276964, + "grad_norm": 9.932600021362305, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8644469976425171, + "num_tokens": 862429650.0, + "step": 22602 + }, + { + "epoch": 2.8753339269813, + "ewc_loss": 0.08203620463609695, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044438548502512276, + "grad_norm": 9.65512752532959, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8634127378463745, + "num_tokens": 862470921.0, + "step": 22603 + }, + { + "epoch": 2.8754611372598906, + "ewc_loss": 0.08353978395462036, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004594213096424937, + "grad_norm": 9.959851264953613, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8709285855293274, + "num_tokens": 862505204.0, + "step": 22604 + }, + { + "epoch": 2.875588347538481, + "ewc_loss": 0.08201149106025696, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004441383935045451, + "grad_norm": 9.617457389831543, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.863508939743042, + "num_tokens": 862548686.0, + "step": 22605 + }, + { + "epoch": 2.8757155578170717, + "ewc_loss": 0.0835212916135788, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004592363547999412, + "grad_norm": 9.940167427062988, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.878697395324707, + "num_tokens": 862585920.0, + "step": 22606 + }, + { + "epoch": 2.875842768095662, + "ewc_loss": 0.08189171552658081, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004429406253620982, + "grad_norm": 9.644757270812988, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8591547012329102, + "num_tokens": 862619364.0, + "step": 22607 + }, + { + "epoch": 2.8759699783742527, + "ewc_loss": 0.08347570151090622, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045878044329583645, + "grad_norm": 9.990528106689453, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8601839542388916, + "num_tokens": 862661317.0, + "step": 22608 + }, + { + "epoch": 2.8760971886528433, + "ewc_loss": 0.08203017711639404, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044432521099224687, + "grad_norm": 9.726845741271973, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8694044351577759, + "num_tokens": 862698236.0, + "step": 22609 + }, + { + "epoch": 2.876224398931434, + "ewc_loss": 0.08322009444236755, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004562243411783129, + "grad_norm": 9.959017753601074, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8768229484558105, + "num_tokens": 862739224.0, + "step": 22610 + }, + { + "epoch": 2.8763516092100243, + "ewc_loss": 0.08191490918397903, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044317252468317747, + "grad_norm": 9.627838134765625, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8804449439048767, + "num_tokens": 862776534.0, + "step": 22611 + }, + { + "epoch": 2.876478819488615, + "ewc_loss": 0.08314444124698639, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045546781620942056, + "grad_norm": 9.898824691772461, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8575650453567505, + "num_tokens": 862815690.0, + "step": 22612 + }, + { + "epoch": 2.8766060297672054, + "ewc_loss": 0.08212736248970032, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004452970460988581, + "grad_norm": 9.596023559570312, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8649933338165283, + "num_tokens": 862851021.0, + "step": 22613 + }, + { + "epoch": 2.8767332400457954, + "ewc_loss": 0.08355712890625, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045959471026435494, + "grad_norm": 10.005964279174805, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8725310564041138, + "num_tokens": 862885122.0, + "step": 22614 + }, + { + "epoch": 2.8768604503243864, + "ewc_loss": 0.08183600008487701, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044238343252800405, + "grad_norm": 9.636600494384766, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8718918561935425, + "num_tokens": 862922233.0, + "step": 22615 + }, + { + "epoch": 2.8769876606029765, + "ewc_loss": 0.08349194377660751, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004589428717736155, + "grad_norm": 9.965174674987793, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8770892024040222, + "num_tokens": 862961355.0, + "step": 22616 + }, + { + "epoch": 2.8771148708815675, + "ewc_loss": 0.08169873058795929, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004410106921568513, + "grad_norm": 9.568115234375, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8744627237319946, + "num_tokens": 862998523.0, + "step": 22617 + }, + { + "epoch": 2.8772420811601576, + "ewc_loss": 0.0835375189781189, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004593986668623984, + "grad_norm": 9.937454223632812, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8647836446762085, + "num_tokens": 863039684.0, + "step": 22618 + }, + { + "epoch": 2.877369291438748, + "ewc_loss": 0.08192272484302521, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044325063936412334, + "grad_norm": 9.615804672241211, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8697479963302612, + "num_tokens": 863077649.0, + "step": 22619 + }, + { + "epoch": 2.8774965017173386, + "ewc_loss": 0.08352985978126526, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004593220364768058, + "grad_norm": 9.896756172180176, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.862067699432373, + "num_tokens": 863116816.0, + "step": 22620 + }, + { + "epoch": 2.877623711995929, + "ewc_loss": 0.08218024671077728, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044582586269825697, + "grad_norm": 9.735311508178711, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8764094114303589, + "num_tokens": 863155306.0, + "step": 22621 + }, + { + "epoch": 2.8777509222745197, + "ewc_loss": 0.08305531740188599, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000454576569609344, + "grad_norm": 9.842870712280273, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8726719617843628, + "num_tokens": 863194381.0, + "step": 22622 + }, + { + "epoch": 2.87787813255311, + "ewc_loss": 0.08257625997066498, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004497860209085047, + "grad_norm": 9.806074142456055, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8634592890739441, + "num_tokens": 863231470.0, + "step": 22623 + }, + { + "epoch": 2.8780053428317007, + "ewc_loss": 0.08284313976764679, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045245481305755675, + "grad_norm": 9.860030174255371, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8579643964767456, + "num_tokens": 863265617.0, + "step": 22624 + }, + { + "epoch": 2.8781325531102913, + "ewc_loss": 0.08254488557577133, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004494723107200116, + "grad_norm": 9.805316925048828, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.856697678565979, + "num_tokens": 863300111.0, + "step": 22625 + }, + { + "epoch": 2.878259763388882, + "ewc_loss": 0.08267400413751602, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004507634730543941, + "grad_norm": 9.801682472229004, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8732544183731079, + "num_tokens": 863338626.0, + "step": 22626 + }, + { + "epoch": 2.8783869736674723, + "ewc_loss": 0.08259987831115723, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045002216938883066, + "grad_norm": 9.773618698120117, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8572734594345093, + "num_tokens": 863374672.0, + "step": 22627 + }, + { + "epoch": 2.878514183946063, + "ewc_loss": 0.08254985511302948, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044952199095860124, + "grad_norm": 9.753100395202637, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8592361211776733, + "num_tokens": 863412717.0, + "step": 22628 + }, + { + "epoch": 2.8786413942246534, + "ewc_loss": 0.08264826238155365, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004505060787778348, + "grad_norm": 9.740763664245605, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8735741972923279, + "num_tokens": 863447776.0, + "step": 22629 + }, + { + "epoch": 2.878768604503244, + "ewc_loss": 0.08253653347492218, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004493888118304312, + "grad_norm": 9.686156272888184, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8824021816253662, + "num_tokens": 863485697.0, + "step": 22630 + }, + { + "epoch": 2.8788958147818344, + "ewc_loss": 0.08263912796974182, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045041469275020063, + "grad_norm": 9.785026550292969, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.861601710319519, + "num_tokens": 863528706.0, + "step": 22631 + }, + { + "epoch": 2.879023025060425, + "ewc_loss": 0.08233289420604706, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044735235860571265, + "grad_norm": 9.648396492004395, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8477463126182556, + "num_tokens": 863569181.0, + "step": 22632 + }, + { + "epoch": 2.8791502353390155, + "ewc_loss": 0.08299626410007477, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004539860237855464, + "grad_norm": 9.760066032409668, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8494297862052917, + "num_tokens": 863602985.0, + "step": 22633 + }, + { + "epoch": 2.879277445617606, + "ewc_loss": 0.08243681490421295, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004483915981836617, + "grad_norm": 9.722493171691895, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8703027963638306, + "num_tokens": 863634590.0, + "step": 22634 + }, + { + "epoch": 2.8794046558961965, + "ewc_loss": 0.08291460573673248, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045316945761442184, + "grad_norm": 9.695611953735352, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8653444647789001, + "num_tokens": 863677671.0, + "step": 22635 + }, + { + "epoch": 2.879531866174787, + "ewc_loss": 0.08287781476974487, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004528015560936183, + "grad_norm": 9.753371238708496, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.867858350276947, + "num_tokens": 863716102.0, + "step": 22636 + }, + { + "epoch": 2.8796590764533776, + "ewc_loss": 0.08272279798984528, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004512514278758317, + "grad_norm": 9.727497100830078, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8584038615226746, + "num_tokens": 863751560.0, + "step": 22637 + }, + { + "epoch": 2.879786286731968, + "ewc_loss": 0.08272252231836319, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045124866301193833, + "grad_norm": 9.65744686126709, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8797637224197388, + "num_tokens": 863792373.0, + "step": 22638 + }, + { + "epoch": 2.879913497010558, + "ewc_loss": 0.08290040493011475, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045302751823328435, + "grad_norm": 9.710750579833984, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8621389865875244, + "num_tokens": 863829969.0, + "step": 22639 + }, + { + "epoch": 2.880040707289149, + "ewc_loss": 0.08287316560745239, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045275507727637887, + "grad_norm": 9.743332862854004, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8726682662963867, + "num_tokens": 863867080.0, + "step": 22640 + }, + { + "epoch": 2.8801679175677393, + "ewc_loss": 0.08286772668361664, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045270068221725523, + "grad_norm": 9.717517852783203, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8691790699958801, + "num_tokens": 863904230.0, + "step": 22641 + }, + { + "epoch": 2.8802951278463302, + "ewc_loss": 0.08278544247150421, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004518778878264129, + "grad_norm": 9.747054100036621, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8694170713424683, + "num_tokens": 863947309.0, + "step": 22642 + }, + { + "epoch": 2.8804223381249203, + "ewc_loss": 0.08273501694202423, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004513736057560891, + "grad_norm": 9.724662780761719, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8717453479766846, + "num_tokens": 863985557.0, + "step": 22643 + }, + { + "epoch": 2.880549548403511, + "ewc_loss": 0.08277627825737, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004517862107604742, + "grad_norm": 9.729373931884766, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8543699383735657, + "num_tokens": 864021842.0, + "step": 22644 + }, + { + "epoch": 2.8806767586821014, + "ewc_loss": 0.08268614113330841, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045088480692356825, + "grad_norm": 9.717108726501465, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8583148717880249, + "num_tokens": 864060765.0, + "step": 22645 + }, + { + "epoch": 2.880803968960692, + "ewc_loss": 0.0830143541097641, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045172555837780237, + "grad_norm": 9.81401252746582, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8674880862236023, + "num_tokens": 864100624.0, + "step": 22646 + }, + { + "epoch": 2.8809311792392824, + "ewc_loss": 0.08256345987319946, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044965805136598647, + "grad_norm": 9.72488021850586, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8763047456741333, + "num_tokens": 864136599.0, + "step": 22647 + }, + { + "epoch": 2.881058389517873, + "ewc_loss": 0.08292567729949951, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045328016858547926, + "grad_norm": 9.776789665222168, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8697744607925415, + "num_tokens": 864169876.0, + "step": 22648 + }, + { + "epoch": 2.8811855997964635, + "ewc_loss": 0.08264271914958954, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004504505777731538, + "grad_norm": 9.68901252746582, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8645374178886414, + "num_tokens": 864213709.0, + "step": 22649 + }, + { + "epoch": 2.881312810075054, + "ewc_loss": 0.08309361338615417, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045495954691432416, + "grad_norm": 9.8612699508667, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.870917797088623, + "num_tokens": 864245581.0, + "step": 22650 + }, + { + "epoch": 2.8814400203536445, + "ewc_loss": 0.08233630657196045, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044738652650266886, + "grad_norm": 9.641073226928711, + "learning_rate": 1e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8515962362289429, + "num_tokens": 864287081.0, + "step": 22651 + }, + { + "epoch": 2.881567230632235, + "ewc_loss": 0.08332537114620209, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004572772013489157, + "grad_norm": 9.910221099853516, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8488105535507202, + "num_tokens": 864321480.0, + "step": 22652 + }, + { + "epoch": 2.8816944409108256, + "ewc_loss": 0.08213192224502563, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044534271000884473, + "grad_norm": 9.62974739074707, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8718308806419373, + "num_tokens": 864358580.0, + "step": 22653 + }, + { + "epoch": 2.881821651189416, + "ewc_loss": 0.08383168280124664, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004598989035002887, + "grad_norm": 9.930350303649902, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8547821640968323, + "num_tokens": 864396277.0, + "step": 22654 + }, + { + "epoch": 2.8819488614680067, + "ewc_loss": 0.08181436359882355, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044216704554855824, + "grad_norm": 9.563821792602539, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8615775108337402, + "num_tokens": 864433282.0, + "step": 22655 + }, + { + "epoch": 2.882076071746597, + "ewc_loss": 0.08354628086090088, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004594862402882427, + "grad_norm": 9.96912956237793, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8576833009719849, + "num_tokens": 864469305.0, + "step": 22656 + }, + { + "epoch": 2.8822032820251877, + "ewc_loss": 0.08181753754615784, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044219885603524745, + "grad_norm": 9.593987464904785, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8661972880363464, + "num_tokens": 864507246.0, + "step": 22657 + }, + { + "epoch": 2.8823304923037782, + "ewc_loss": 0.08358877897262573, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004599112435244024, + "grad_norm": 9.93398666381836, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8607516288757324, + "num_tokens": 864547315.0, + "step": 22658 + }, + { + "epoch": 2.8824577025823688, + "ewc_loss": 0.08181455731391907, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004421690246090293, + "grad_norm": 9.653555870056152, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8590950965881348, + "num_tokens": 864590813.0, + "step": 22659 + }, + { + "epoch": 2.8825849128609593, + "ewc_loss": 0.08332887291908264, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045731221325695515, + "grad_norm": 9.878682136535645, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8555421233177185, + "num_tokens": 864625078.0, + "step": 22660 + }, + { + "epoch": 2.88271212313955, + "ewc_loss": 0.08193746954202652, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044339813757687807, + "grad_norm": 9.58961296081543, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8728455305099487, + "num_tokens": 864663714.0, + "step": 22661 + }, + { + "epoch": 2.88283933341814, + "ewc_loss": 0.08334825187921524, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004575059574563056, + "grad_norm": 9.978821754455566, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8641887307167053, + "num_tokens": 864697026.0, + "step": 22662 + }, + { + "epoch": 2.882966543696731, + "ewc_loss": 0.08206713199615479, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044469471322372556, + "grad_norm": 9.634525299072266, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8733634352684021, + "num_tokens": 864732337.0, + "step": 22663 + }, + { + "epoch": 2.883093753975321, + "ewc_loss": 0.08347778767347336, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004588013107422739, + "grad_norm": 9.866399765014648, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8586481809616089, + "num_tokens": 864776598.0, + "step": 22664 + }, + { + "epoch": 2.883220964253912, + "ewc_loss": 0.08213435113430023, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044536698260344565, + "grad_norm": 9.605207443237305, + "learning_rate": 1e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8517909646034241, + "num_tokens": 864817279.0, + "step": 22665 + }, + { + "epoch": 2.883348174532502, + "ewc_loss": 0.08358033001422882, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045982678420841694, + "grad_norm": 9.903953552246094, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8722039461135864, + "num_tokens": 864854914.0, + "step": 22666 + }, + { + "epoch": 2.883475384811093, + "ewc_loss": 0.08220880478620529, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044611148769035935, + "grad_norm": 9.670069694519043, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8688176274299622, + "num_tokens": 864892914.0, + "step": 22667 + }, + { + "epoch": 2.883602595089683, + "ewc_loss": 0.08334871381521225, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045751058496534824, + "grad_norm": 9.894768714904785, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8627445697784424, + "num_tokens": 864934189.0, + "step": 22668 + }, + { + "epoch": 2.8837298053682736, + "ewc_loss": 0.08218204975128174, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004458439361769706, + "grad_norm": 9.590996742248535, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8744939565658569, + "num_tokens": 864972085.0, + "step": 22669 + }, + { + "epoch": 2.883857015646864, + "ewc_loss": 0.08351285755634308, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000459152041003108, + "grad_norm": 9.991812705993652, + "learning_rate": 1e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8487937450408936, + "num_tokens": 865008674.0, + "step": 22670 + }, + { + "epoch": 2.8839842259254547, + "ewc_loss": 0.08205186575651169, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044454209273681045, + "grad_norm": 9.674777030944824, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.854691207408905, + "num_tokens": 865046004.0, + "step": 22671 + }, + { + "epoch": 2.884111436204045, + "ewc_loss": 0.08347902446985245, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004563722759485245, + "grad_norm": 9.856430053710938, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8684878349304199, + "num_tokens": 865082409.0, + "step": 22672 + }, + { + "epoch": 2.8842386464826357, + "ewc_loss": 0.08236387372016907, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044522082316689193, + "grad_norm": 9.712968826293945, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8639162182807922, + "num_tokens": 865124457.0, + "step": 22673 + }, + { + "epoch": 2.8843658567612263, + "ewc_loss": 0.08305265754461288, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045455002691596746, + "grad_norm": 9.865641593933105, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8775510787963867, + "num_tokens": 865162290.0, + "step": 22674 + }, + { + "epoch": 2.884493067039817, + "ewc_loss": 0.08222515881061554, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004462750512175262, + "grad_norm": 9.70659065246582, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8612124919891357, + "num_tokens": 865201686.0, + "step": 22675 + }, + { + "epoch": 2.8846202773184073, + "ewc_loss": 0.08287718892097473, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000452795356977731, + "grad_norm": 9.888558387756348, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8534063100814819, + "num_tokens": 865236876.0, + "step": 22676 + }, + { + "epoch": 2.884747487596998, + "ewc_loss": 0.08202709257602692, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004442943900357932, + "grad_norm": 9.634295463562012, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8689603805541992, + "num_tokens": 865276994.0, + "step": 22677 + }, + { + "epoch": 2.8848746978755884, + "ewc_loss": 0.08314384520053864, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004554618790280074, + "grad_norm": 9.879084587097168, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8650381565093994, + "num_tokens": 865311990.0, + "step": 22678 + }, + { + "epoch": 2.885001908154179, + "ewc_loss": 0.0820758268237114, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044478170457296073, + "grad_norm": 9.673377990722656, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8704535365104675, + "num_tokens": 865352324.0, + "step": 22679 + }, + { + "epoch": 2.8851291184327694, + "ewc_loss": 0.08304750919342041, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045449857134371996, + "grad_norm": 9.893533706665039, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8654274344444275, + "num_tokens": 865391428.0, + "step": 22680 + }, + { + "epoch": 2.88525632871136, + "ewc_loss": 0.08201465755701065, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004441700002644211, + "grad_norm": 9.679598808288574, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8616539239883423, + "num_tokens": 865428478.0, + "step": 22681 + }, + { + "epoch": 2.8853835389899505, + "ewc_loss": 0.08318881690502167, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045591159141622484, + "grad_norm": 9.93537712097168, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8645066022872925, + "num_tokens": 865466531.0, + "step": 22682 + }, + { + "epoch": 2.885510749268541, + "ewc_loss": 0.08205723762512207, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004445958184078336, + "grad_norm": 9.693613052368164, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.872539758682251, + "num_tokens": 865503177.0, + "step": 22683 + }, + { + "epoch": 2.8856379595471315, + "ewc_loss": 0.08281956613063812, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004522190720308572, + "grad_norm": 9.909151077270508, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8570430278778076, + "num_tokens": 865539454.0, + "step": 22684 + }, + { + "epoch": 2.885765169825722, + "ewc_loss": 0.08191578090190887, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004431812558323145, + "grad_norm": 9.622198104858398, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8834450244903564, + "num_tokens": 865571937.0, + "step": 22685 + }, + { + "epoch": 2.8858923801043126, + "ewc_loss": 0.08321313560009003, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045615481212735176, + "grad_norm": 10.017459869384766, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8683793544769287, + "num_tokens": 865609457.0, + "step": 22686 + }, + { + "epoch": 2.8860195903829027, + "ewc_loss": 0.08154764771461487, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043949991231784225, + "grad_norm": 9.641191482543945, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8523834943771362, + "num_tokens": 865647391.0, + "step": 22687 + }, + { + "epoch": 2.8861468006614936, + "ewc_loss": 0.08336571604013443, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000457680580439046, + "grad_norm": 10.148179054260254, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.863700807094574, + "num_tokens": 865680578.0, + "step": 22688 + }, + { + "epoch": 2.8862740109400837, + "ewc_loss": 0.08119910955429077, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043601455399766564, + "grad_norm": 9.569360733032227, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.861316442489624, + "num_tokens": 865714610.0, + "step": 22689 + }, + { + "epoch": 2.8864012212186747, + "ewc_loss": 0.08376457542181015, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046166917309165, + "grad_norm": 10.144580841064453, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8607279062271118, + "num_tokens": 865755672.0, + "step": 22690 + }, + { + "epoch": 2.886528431497265, + "ewc_loss": 0.08098863065242767, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00043390976497903466, + "grad_norm": 9.548617362976074, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8720587491989136, + "num_tokens": 865790111.0, + "step": 22691 + }, + { + "epoch": 2.8866556417758558, + "ewc_loss": 0.08376112580299377, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046163471415638924, + "grad_norm": 10.170323371887207, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.864376425743103, + "num_tokens": 865825580.0, + "step": 22692 + }, + { + "epoch": 2.886782852054446, + "ewc_loss": 0.08125191926956177, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004365426138974726, + "grad_norm": 9.514215469360352, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8696206212043762, + "num_tokens": 865869099.0, + "step": 22693 + }, + { + "epoch": 2.8869100623330364, + "ewc_loss": 0.08392016589641571, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004632251220755279, + "grad_norm": 10.261008262634277, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8559702038764954, + "num_tokens": 865906957.0, + "step": 22694 + }, + { + "epoch": 2.887037272611627, + "ewc_loss": 0.08159768581390381, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004375588905531913, + "grad_norm": 9.503204345703125, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8758388757705688, + "num_tokens": 865945712.0, + "step": 22695 + }, + { + "epoch": 2.8871644828902174, + "ewc_loss": 0.0843932032585144, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046795548405498266, + "grad_norm": 10.27786636352539, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.85179203748703, + "num_tokens": 865984469.0, + "step": 22696 + }, + { + "epoch": 2.887291693168808, + "ewc_loss": 0.08142806589603424, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00043586271931417286, + "grad_norm": 9.516520500183105, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8665627241134644, + "num_tokens": 866024624.0, + "step": 22697 + }, + { + "epoch": 2.8874189034473985, + "ewc_loss": 0.08487274497747421, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047030948917381465, + "grad_norm": 10.267053604125977, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8619956374168396, + "num_tokens": 866062446.0, + "step": 22698 + }, + { + "epoch": 2.887546113725989, + "ewc_loss": 0.0815257653594017, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004392811097204685, + "grad_norm": 9.548178672790527, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8750256896018982, + "num_tokens": 866100896.0, + "step": 22699 + }, + { + "epoch": 2.8876733240045795, + "ewc_loss": 0.08433009684085846, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046732439659535885, + "grad_norm": 10.264313697814941, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8507879972457886, + "num_tokens": 866134668.0, + "step": 22700 + }, + { + "epoch": 2.88780053428317, + "ewc_loss": 0.08164817094802856, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044050515862181783, + "grad_norm": 9.635397911071777, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8574581742286682, + "num_tokens": 866173834.0, + "step": 22701 + }, + { + "epoch": 2.8879277445617606, + "ewc_loss": 0.08444789052009583, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004636195080820471, + "grad_norm": 10.059795379638672, + "learning_rate": 1e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.8464512228965759, + "num_tokens": 866211691.0, + "step": 22702 + }, + { + "epoch": 2.888054954840351, + "ewc_loss": 0.08210192620754242, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044504267862066627, + "grad_norm": 9.748173713684082, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8637477159500122, + "num_tokens": 866240878.0, + "step": 22703 + }, + { + "epoch": 2.8881821651189417, + "ewc_loss": 0.08352481573820114, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004568301956169307, + "grad_norm": 9.886677742004395, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8515172004699707, + "num_tokens": 866289552.0, + "step": 22704 + }, + { + "epoch": 2.888309375397532, + "ewc_loss": 0.0827648714184761, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004492307489272207, + "grad_norm": 9.84875202178955, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8552997708320618, + "num_tokens": 866330361.0, + "step": 22705 + }, + { + "epoch": 2.8884365856761227, + "ewc_loss": 0.08253840357065201, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000449407467385754, + "grad_norm": 9.816572189331055, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8678160905838013, + "num_tokens": 866371557.0, + "step": 22706 + }, + { + "epoch": 2.8885637959547132, + "ewc_loss": 0.08272671699523926, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004512906598392874, + "grad_norm": 9.840958595275879, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8661676645278931, + "num_tokens": 866410627.0, + "step": 22707 + }, + { + "epoch": 2.8886910062333038, + "ewc_loss": 0.08265134692192078, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044809546670876443, + "grad_norm": 9.790122985839844, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8626974821090698, + "num_tokens": 866446036.0, + "step": 22708 + }, + { + "epoch": 2.8888182165118943, + "ewc_loss": 0.08259615302085876, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044998491648584604, + "grad_norm": 9.753950119018555, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.855459988117218, + "num_tokens": 866489655.0, + "step": 22709 + }, + { + "epoch": 2.888945426790485, + "ewc_loss": 0.08259141445159912, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004499375354498625, + "grad_norm": 9.777368545532227, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8567700386047363, + "num_tokens": 866532750.0, + "step": 22710 + }, + { + "epoch": 2.8890726370690754, + "ewc_loss": 0.0825214758515358, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004492381995078176, + "grad_norm": 9.74821949005127, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8712828159332275, + "num_tokens": 866570748.0, + "step": 22711 + }, + { + "epoch": 2.8891998473476654, + "ewc_loss": 0.08254892379045486, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004495126777328551, + "grad_norm": 9.806396484375, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8599069714546204, + "num_tokens": 866609422.0, + "step": 22712 + }, + { + "epoch": 2.8893270576262564, + "ewc_loss": 0.08269977569580078, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004485798126552254, + "grad_norm": 9.76740550994873, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8657265305519104, + "num_tokens": 866650865.0, + "step": 22713 + }, + { + "epoch": 2.8894542679048465, + "ewc_loss": 0.08285696059465408, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045015165233053267, + "grad_norm": 9.771627426147461, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8629378080368042, + "num_tokens": 866687605.0, + "step": 22714 + }, + { + "epoch": 2.8895814781834375, + "ewc_loss": 0.08238764107227325, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044789983076043427, + "grad_norm": 9.709549903869629, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8551050424575806, + "num_tokens": 866722974.0, + "step": 22715 + }, + { + "epoch": 2.8897086884620276, + "ewc_loss": 0.08264551311731339, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045047857565805316, + "grad_norm": 9.773194313049316, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8626312017440796, + "num_tokens": 866760694.0, + "step": 22716 + }, + { + "epoch": 2.889835898740618, + "ewc_loss": 0.08239999413490295, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044802334741689265, + "grad_norm": 9.746227264404297, + "learning_rate": 1e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8472172617912292, + "num_tokens": 866795102.0, + "step": 22717 + }, + { + "epoch": 2.8899631090192086, + "ewc_loss": 0.0826418548822403, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045044193393550813, + "grad_norm": 9.770469665527344, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8787630796432495, + "num_tokens": 866831282.0, + "step": 22718 + }, + { + "epoch": 2.890090319297799, + "ewc_loss": 0.08234953880310059, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044751877430826426, + "grad_norm": 9.653705596923828, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8563777208328247, + "num_tokens": 866872661.0, + "step": 22719 + }, + { + "epoch": 2.8902175295763897, + "ewc_loss": 0.08284015953540802, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045242501073516905, + "grad_norm": 9.756248474121094, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8552265167236328, + "num_tokens": 866917576.0, + "step": 22720 + }, + { + "epoch": 2.89034473985498, + "ewc_loss": 0.0824819803237915, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044884326052851975, + "grad_norm": 9.714436531066895, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8684091567993164, + "num_tokens": 866957245.0, + "step": 22721 + }, + { + "epoch": 2.8904719501335707, + "ewc_loss": 0.0829925686120987, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004539490910246968, + "grad_norm": 9.787250518798828, + "learning_rate": 1e-06, + "loss": 0.558, + "mean_token_accuracy": 0.8350040912628174, + "num_tokens": 866998695.0, + "step": 22722 + }, + { + "epoch": 2.8905991604121613, + "ewc_loss": 0.08249558508396149, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044897926272824407, + "grad_norm": 9.6901273727417, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8494031429290771, + "num_tokens": 867033502.0, + "step": 22723 + }, + { + "epoch": 2.890726370690752, + "ewc_loss": 0.08341965079307556, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045577858691103756, + "grad_norm": 9.89528751373291, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8472773432731628, + "num_tokens": 867076032.0, + "step": 22724 + }, + { + "epoch": 2.8908535809693423, + "ewc_loss": 0.08226283639669418, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004466518003027886, + "grad_norm": 9.685264587402344, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.864453911781311, + "num_tokens": 867107429.0, + "step": 22725 + }, + { + "epoch": 2.890980791247933, + "ewc_loss": 0.08344404399394989, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004584638518281281, + "grad_norm": 9.931635856628418, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8681336641311646, + "num_tokens": 867139432.0, + "step": 22726 + }, + { + "epoch": 2.8911080015265234, + "ewc_loss": 0.08224408328533173, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044646431342698634, + "grad_norm": 9.706246376037598, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.860339879989624, + "num_tokens": 867180588.0, + "step": 22727 + }, + { + "epoch": 2.891235211805114, + "ewc_loss": 0.08364495635032654, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004580315435305238, + "grad_norm": 9.941508293151855, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8612704277038574, + "num_tokens": 867215170.0, + "step": 22728 + }, + { + "epoch": 2.8913624220837044, + "ewc_loss": 0.08232767879962921, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044485877151601017, + "grad_norm": 9.671770095825195, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8799847364425659, + "num_tokens": 867254958.0, + "step": 22729 + }, + { + "epoch": 2.891489632362295, + "ewc_loss": 0.08342985808849335, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004583220579661429, + "grad_norm": 9.897220611572266, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8623438477516174, + "num_tokens": 867291135.0, + "step": 22730 + }, + { + "epoch": 2.8916168426408855, + "ewc_loss": 0.08245938271284103, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044617586536332965, + "grad_norm": 9.663352012634277, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8719288110733032, + "num_tokens": 867326926.0, + "step": 22731 + }, + { + "epoch": 2.891744052919476, + "ewc_loss": 0.0834406167268753, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000458429625723511, + "grad_norm": 9.992884635925293, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8549748659133911, + "num_tokens": 867361142.0, + "step": 22732 + }, + { + "epoch": 2.8918712631980665, + "ewc_loss": 0.0819491446018219, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044351493124850094, + "grad_norm": 9.640222549438477, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.881528913974762, + "num_tokens": 867398344.0, + "step": 22733 + }, + { + "epoch": 2.891998473476657, + "ewc_loss": 0.08357072621583939, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045973071246407926, + "grad_norm": 9.875223159790039, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8727887868881226, + "num_tokens": 867437642.0, + "step": 22734 + }, + { + "epoch": 2.8921256837552476, + "ewc_loss": 0.08256146311759949, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044719662400893867, + "grad_norm": 9.68815803527832, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8531546592712402, + "num_tokens": 867471646.0, + "step": 22735 + }, + { + "epoch": 2.892252894033838, + "ewc_loss": 0.08344857394695282, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045850922469981015, + "grad_norm": 9.901040077209473, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8658207058906555, + "num_tokens": 867509682.0, + "step": 22736 + }, + { + "epoch": 2.892380104312428, + "ewc_loss": 0.08265353739261627, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004481174109969288, + "grad_norm": 9.676219940185547, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8596042990684509, + "num_tokens": 867544425.0, + "step": 22737 + }, + { + "epoch": 2.892507314591019, + "ewc_loss": 0.08346664905548096, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000456248497357592, + "grad_norm": 9.846965789794922, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8568158149719238, + "num_tokens": 867584978.0, + "step": 22738 + }, + { + "epoch": 2.8926345248696093, + "ewc_loss": 0.08279979228973389, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004495799366850406, + "grad_norm": 9.723010063171387, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8654927015304565, + "num_tokens": 867620360.0, + "step": 22739 + }, + { + "epoch": 2.8927617351482002, + "ewc_loss": 0.08327455073595047, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004543275572359562, + "grad_norm": 9.86542797088623, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8726573586463928, + "num_tokens": 867655246.0, + "step": 22740 + }, + { + "epoch": 2.8928889454267903, + "ewc_loss": 0.08274553716182709, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004490373539738357, + "grad_norm": 9.657504081726074, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8545911908149719, + "num_tokens": 867692792.0, + "step": 22741 + }, + { + "epoch": 2.893016155705381, + "ewc_loss": 0.08364257216453552, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004580077948048711, + "grad_norm": 9.883554458618164, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8672456741333008, + "num_tokens": 867732365.0, + "step": 22742 + }, + { + "epoch": 2.8931433659839714, + "ewc_loss": 0.08240503072738647, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044563229312188923, + "grad_norm": 9.71047306060791, + "learning_rate": 1e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8507457375526428, + "num_tokens": 867764800.0, + "step": 22743 + }, + { + "epoch": 2.893270576262562, + "ewc_loss": 0.08347433805465698, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004563253605738282, + "grad_norm": 9.784770965576172, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8572934865951538, + "num_tokens": 867807107.0, + "step": 22744 + }, + { + "epoch": 2.8933977865411524, + "ewc_loss": 0.0831364318728447, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045050494372844696, + "grad_norm": 9.692831039428711, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8545311093330383, + "num_tokens": 867850225.0, + "step": 22745 + }, + { + "epoch": 2.893524996819743, + "ewc_loss": 0.08324719965457916, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045405406854115427, + "grad_norm": 9.801103591918945, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8601917028427124, + "num_tokens": 867890457.0, + "step": 22746 + }, + { + "epoch": 2.8936522070983335, + "ewc_loss": 0.08279676735401154, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004495496687013656, + "grad_norm": 9.671113014221191, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8694634437561035, + "num_tokens": 867925880.0, + "step": 22747 + }, + { + "epoch": 2.893779417376924, + "ewc_loss": 0.08338287472724915, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004554108309093863, + "grad_norm": 9.820281982421875, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8798109292984009, + "num_tokens": 867963923.0, + "step": 22748 + }, + { + "epoch": 2.8939066276555145, + "ewc_loss": 0.08272766321897507, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044885865645483136, + "grad_norm": 9.676607131958008, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8672223091125488, + "num_tokens": 868000489.0, + "step": 22749 + }, + { + "epoch": 2.894033837934105, + "ewc_loss": 0.08355662226676941, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004571483004838228, + "grad_norm": 9.823436737060547, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8675114512443542, + "num_tokens": 868034001.0, + "step": 22750 + }, + { + "epoch": 2.8941610482126956, + "ewc_loss": 0.0826813280582428, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004483952943701297, + "grad_norm": 9.80958080291748, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8686675429344177, + "num_tokens": 868074956.0, + "step": 22751 + }, + { + "epoch": 2.894288258491286, + "ewc_loss": 0.0831194818019867, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045277687604539096, + "grad_norm": 9.745523452758789, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8728213310241699, + "num_tokens": 868110727.0, + "step": 22752 + }, + { + "epoch": 2.8944154687698767, + "ewc_loss": 0.08302170783281326, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045179910375736654, + "grad_norm": 9.738103866577148, + "learning_rate": 1e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8444019556045532, + "num_tokens": 868151480.0, + "step": 22753 + }, + { + "epoch": 2.894542679048467, + "ewc_loss": 0.08314159512519836, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004529979487415403, + "grad_norm": 9.831697463989258, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.855628252029419, + "num_tokens": 868190620.0, + "step": 22754 + }, + { + "epoch": 2.8946698893270577, + "ewc_loss": 0.08311556279659271, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045029621105641127, + "grad_norm": 9.748857498168945, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8623987436294556, + "num_tokens": 868230598.0, + "step": 22755 + }, + { + "epoch": 2.8947970996056482, + "ewc_loss": 0.08278556913137436, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004518791101872921, + "grad_norm": 9.831372261047363, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8567367196083069, + "num_tokens": 868265933.0, + "step": 22756 + }, + { + "epoch": 2.8949243098842388, + "ewc_loss": 0.08296199887990952, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004512020095717162, + "grad_norm": 9.833148956298828, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8665950298309326, + "num_tokens": 868300282.0, + "step": 22757 + }, + { + "epoch": 2.8950515201628293, + "ewc_loss": 0.08289197087287903, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045050171320326626, + "grad_norm": 9.779541969299316, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8793742060661316, + "num_tokens": 868331031.0, + "step": 22758 + }, + { + "epoch": 2.89517873044142, + "ewc_loss": 0.08275123685598373, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004490943974815309, + "grad_norm": 9.703917503356934, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.865409255027771, + "num_tokens": 868368268.0, + "step": 22759 + }, + { + "epoch": 2.89530594072001, + "ewc_loss": 0.08300478756427765, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000451629952294752, + "grad_norm": 9.810537338256836, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.847968578338623, + "num_tokens": 868407551.0, + "step": 22760 + }, + { + "epoch": 2.895433150998601, + "ewc_loss": 0.08253371715545654, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044936061021871865, + "grad_norm": 9.782581329345703, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.861663818359375, + "num_tokens": 868449301.0, + "step": 22761 + }, + { + "epoch": 2.895560361277191, + "ewc_loss": 0.08281679451465607, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004497499903663993, + "grad_norm": 14.997787475585938, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8615392446517944, + "num_tokens": 868482853.0, + "step": 22762 + }, + { + "epoch": 2.895687571555782, + "ewc_loss": 0.0884447693824768, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005060296971350908, + "grad_norm": 10.24243450164795, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8594985604286194, + "num_tokens": 868523960.0, + "step": 22763 + }, + { + "epoch": 2.895814781834372, + "ewc_loss": 0.0871564969420433, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004931470029987395, + "grad_norm": 10.54481029510498, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.856667160987854, + "num_tokens": 868566447.0, + "step": 22764 + }, + { + "epoch": 2.895941992112963, + "ewc_loss": 0.08279635012149811, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004495455650612712, + "grad_norm": 9.731071472167969, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8741168975830078, + "num_tokens": 868608342.0, + "step": 22765 + }, + { + "epoch": 2.896069202391553, + "ewc_loss": 0.08786432445049286, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005002252873964608, + "grad_norm": 10.5493803024292, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.875, + "num_tokens": 868647354.0, + "step": 22766 + }, + { + "epoch": 2.8961964126701436, + "ewc_loss": 0.08314985781908035, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045308060362003744, + "grad_norm": 9.783849716186523, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.846506655216217, + "num_tokens": 868690716.0, + "step": 22767 + }, + { + "epoch": 2.896323622948734, + "ewc_loss": 0.08598996698856354, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00048392306780442595, + "grad_norm": 10.36780834197998, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8708219528198242, + "num_tokens": 868728318.0, + "step": 22768 + }, + { + "epoch": 2.8964508332273247, + "ewc_loss": 0.08290300518274307, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004530535079538822, + "grad_norm": 9.788352966308594, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8788677453994751, + "num_tokens": 868764323.0, + "step": 22769 + }, + { + "epoch": 2.896578043505915, + "ewc_loss": 0.08577421307563782, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004793241387233138, + "grad_norm": 10.250810623168945, + "learning_rate": 1e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8458908796310425, + "num_tokens": 868801371.0, + "step": 22770 + }, + { + "epoch": 2.8967052537845057, + "ewc_loss": 0.08293106406927109, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045089266495779157, + "grad_norm": 9.824679374694824, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8701209425926208, + "num_tokens": 868846679.0, + "step": 22771 + }, + { + "epoch": 2.8968324640630962, + "ewc_loss": 0.08494327962398529, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004710148205049336, + "grad_norm": 10.209394454956055, + "learning_rate": 1e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.849817156791687, + "num_tokens": 868880076.0, + "step": 22772 + }, + { + "epoch": 2.8969596743416868, + "ewc_loss": 0.08302747458219528, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045185675844550133, + "grad_norm": 9.791643142700195, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8643670082092285, + "num_tokens": 868917086.0, + "step": 22773 + }, + { + "epoch": 2.8970868846202773, + "ewc_loss": 0.08416542410850525, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004656776727642864, + "grad_norm": 10.051607131958008, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8703181743621826, + "num_tokens": 868954952.0, + "step": 22774 + }, + { + "epoch": 2.897214094898868, + "ewc_loss": 0.08291579782962799, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045074004447087646, + "grad_norm": 9.816608428955078, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.859340488910675, + "num_tokens": 868990911.0, + "step": 22775 + }, + { + "epoch": 2.8973413051774584, + "ewc_loss": 0.08387084305286407, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046273189946077764, + "grad_norm": 9.989141464233398, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8750747442245483, + "num_tokens": 869033398.0, + "step": 22776 + }, + { + "epoch": 2.897468515456049, + "ewc_loss": 0.08266066014766693, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045063003199175, + "grad_norm": 9.751947402954102, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8516372442245483, + "num_tokens": 869070513.0, + "step": 22777 + }, + { + "epoch": 2.8975957257346394, + "ewc_loss": 0.08383023738861084, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004598843806888908, + "grad_norm": 9.921319961547852, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8714662790298462, + "num_tokens": 869113494.0, + "step": 22778 + }, + { + "epoch": 2.89772293601323, + "ewc_loss": 0.0825611799955368, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044963520485907793, + "grad_norm": 9.776739120483398, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8746165037155151, + "num_tokens": 869147433.0, + "step": 22779 + }, + { + "epoch": 2.8978501462918205, + "ewc_loss": 0.08356393873691559, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004572214384097606, + "grad_norm": 9.909725189208984, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8613294363021851, + "num_tokens": 869186763.0, + "step": 22780 + }, + { + "epoch": 2.897977356570411, + "ewc_loss": 0.08281648904085159, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044974693446420133, + "grad_norm": 9.79149341583252, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8775355219841003, + "num_tokens": 869222624.0, + "step": 22781 + }, + { + "epoch": 2.8981045668490015, + "ewc_loss": 0.08340060710906982, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004555881314445287, + "grad_norm": 9.87878131866455, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8861871361732483, + "num_tokens": 869263027.0, + "step": 22782 + }, + { + "epoch": 2.898231777127592, + "ewc_loss": 0.08254027366638184, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044942612294107676, + "grad_norm": 9.720466613769531, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8603674173355103, + "num_tokens": 869304232.0, + "step": 22783 + }, + { + "epoch": 2.8983589874061826, + "ewc_loss": 0.08308716118335724, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004548950237222016, + "grad_norm": 9.872845649719238, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.870211124420166, + "num_tokens": 869340141.0, + "step": 22784 + }, + { + "epoch": 2.8984861976847727, + "ewc_loss": 0.0825849175453186, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044987257570028305, + "grad_norm": 9.76840591430664, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8562659621238708, + "num_tokens": 869382428.0, + "step": 22785 + }, + { + "epoch": 2.8986134079633636, + "ewc_loss": 0.08311107754707336, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045513425720855594, + "grad_norm": 9.830689430236816, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8711801171302795, + "num_tokens": 869418890.0, + "step": 22786 + }, + { + "epoch": 2.8987406182419537, + "ewc_loss": 0.08252918720245361, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004493153537623584, + "grad_norm": 9.775758743286133, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8648964762687683, + "num_tokens": 869453704.0, + "step": 22787 + }, + { + "epoch": 2.8988678285205447, + "ewc_loss": 0.08298924565315247, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045391591265797615, + "grad_norm": 9.835970878601074, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8601841330528259, + "num_tokens": 869491845.0, + "step": 22788 + }, + { + "epoch": 2.898995038799135, + "ewc_loss": 0.08280468732118607, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045207032235339284, + "grad_norm": 9.859015464782715, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8756071329116821, + "num_tokens": 869523393.0, + "step": 22789 + }, + { + "epoch": 2.8991222490777258, + "ewc_loss": 0.08269941806793213, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045101766590960324, + "grad_norm": 9.782086372375488, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8633008003234863, + "num_tokens": 869561927.0, + "step": 22790 + }, + { + "epoch": 2.899249459356316, + "ewc_loss": 0.08295156806707382, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004535391053650528, + "grad_norm": 9.746733665466309, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8631453514099121, + "num_tokens": 869599568.0, + "step": 22791 + }, + { + "epoch": 2.8993766696349064, + "ewc_loss": 0.08297903090715408, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000453813758213073, + "grad_norm": 9.829574584960938, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8674050569534302, + "num_tokens": 869636329.0, + "step": 22792 + }, + { + "epoch": 2.899503879913497, + "ewc_loss": 0.08275564014911652, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045157980639487505, + "grad_norm": 10.795650482177734, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8588967323303223, + "num_tokens": 869679720.0, + "step": 22793 + }, + { + "epoch": 2.8996310901920874, + "ewc_loss": 0.08130374550819397, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004370608658064157, + "grad_norm": 14.749605178833008, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8667237758636475, + "num_tokens": 869719527.0, + "step": 22794 + }, + { + "epoch": 2.899758300470678, + "ewc_loss": 0.08432746678590775, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004648566828109324, + "grad_norm": 9.648229598999023, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8693341016769409, + "num_tokens": 869755139.0, + "step": 22795 + }, + { + "epoch": 2.8998855107492685, + "ewc_loss": 0.08832083642482758, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0005023489356972277, + "grad_norm": 10.659660339355469, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8592408895492554, + "num_tokens": 869791860.0, + "step": 22796 + }, + { + "epoch": 2.900012721027859, + "ewc_loss": 0.08195126056671143, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004386531945783645, + "grad_norm": 9.475873947143555, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8859081268310547, + "num_tokens": 869827294.0, + "step": 22797 + }, + { + "epoch": 2.9001399313064495, + "ewc_loss": 0.08923359215259552, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0005114765954203904, + "grad_norm": 10.680896759033203, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8591469526290894, + "num_tokens": 869859398.0, + "step": 22798 + }, + { + "epoch": 2.90026714158504, + "ewc_loss": 0.08320111036300659, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045115171815268695, + "grad_norm": 9.670395851135254, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8641313314437866, + "num_tokens": 869898215.0, + "step": 22799 + }, + { + "epoch": 2.9003943518636306, + "ewc_loss": 0.08775491267442703, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004942483501508832, + "grad_norm": 10.40666389465332, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8542253971099854, + "num_tokens": 869939787.0, + "step": 22800 + }, + { + "epoch": 2.900521562142221, + "ewc_loss": 0.0841463953256607, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00045816312194801867, + "grad_norm": 9.785744667053223, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8552188277244568, + "num_tokens": 869976790.0, + "step": 22801 + }, + { + "epoch": 2.9006487724208116, + "ewc_loss": 0.0865749716758728, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004824489587917924, + "grad_norm": 10.2142972946167, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8692222833633423, + "num_tokens": 870013547.0, + "step": 22802 + }, + { + "epoch": 2.900775982699402, + "ewc_loss": 0.08380308002233505, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004571714380290359, + "grad_norm": 9.790658950805664, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8771167993545532, + "num_tokens": 870049540.0, + "step": 22803 + }, + { + "epoch": 2.9009031929779927, + "ewc_loss": 0.0852193683385849, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004713342641480267, + "grad_norm": 10.023159980773926, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8733530640602112, + "num_tokens": 870088247.0, + "step": 22804 + }, + { + "epoch": 2.9010304032565832, + "ewc_loss": 0.08424952626228333, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004591945034917444, + "grad_norm": 9.809624671936035, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8652462959289551, + "num_tokens": 870129112.0, + "step": 22805 + }, + { + "epoch": 2.9011576135351738, + "ewc_loss": 0.084555983543396, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00046470051165670156, + "grad_norm": 10.00186538696289, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8650176525115967, + "num_tokens": 870162082.0, + "step": 22806 + }, + { + "epoch": 2.9012848238137643, + "ewc_loss": 0.0839950293302536, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004566495190374553, + "grad_norm": 9.740194320678711, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.875077486038208, + "num_tokens": 870199868.0, + "step": 22807 + }, + { + "epoch": 2.901412034092355, + "ewc_loss": 0.08451245725154877, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004642652056645602, + "grad_norm": 9.91256046295166, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.870776355266571, + "num_tokens": 870238121.0, + "step": 22808 + }, + { + "epoch": 2.9015392443709453, + "ewc_loss": 0.08366512507200241, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00045335045433603227, + "grad_norm": 9.67110538482666, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8558812141418457, + "num_tokens": 870278008.0, + "step": 22809 + }, + { + "epoch": 2.9016664546495354, + "ewc_loss": 0.08448608964681625, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004640015249606222, + "grad_norm": 9.904502868652344, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8605707883834839, + "num_tokens": 870317671.0, + "step": 22810 + }, + { + "epoch": 2.9017936649281264, + "ewc_loss": 0.08344608545303345, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004536015330813825, + "grad_norm": 9.750956535339355, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8706457018852234, + "num_tokens": 870352893.0, + "step": 22811 + }, + { + "epoch": 2.9019208752067165, + "ewc_loss": 0.08388923853635788, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045803302782587707, + "grad_norm": 9.873556137084961, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.872068464756012, + "num_tokens": 870387513.0, + "step": 22812 + }, + { + "epoch": 2.9020480854853075, + "ewc_loss": 0.08337543904781342, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004528949793893844, + "grad_norm": 9.66802978515625, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.872907280921936, + "num_tokens": 870428649.0, + "step": 22813 + }, + { + "epoch": 2.9021752957638975, + "ewc_loss": 0.08439819514751434, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004606811562553048, + "grad_norm": 9.928099632263184, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8796172142028809, + "num_tokens": 870468329.0, + "step": 22814 + }, + { + "epoch": 2.902302506042488, + "ewc_loss": 0.08320900797843933, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004487892729230225, + "grad_norm": 9.647591590881348, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8646436929702759, + "num_tokens": 870509273.0, + "step": 22815 + }, + { + "epoch": 2.9024297163210786, + "ewc_loss": 0.08447033166885376, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004614025237970054, + "grad_norm": 9.900871276855469, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8595001697540283, + "num_tokens": 870544206.0, + "step": 22816 + }, + { + "epoch": 2.902556926599669, + "ewc_loss": 0.08318936824798584, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00044859288027510047, + "grad_norm": 9.639262199401855, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8809583187103271, + "num_tokens": 870578708.0, + "step": 22817 + }, + { + "epoch": 2.9026841368782597, + "ewc_loss": 0.08441243320703506, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004608235612977296, + "grad_norm": 9.875163078308105, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8723258972167969, + "num_tokens": 870620599.0, + "step": 22818 + }, + { + "epoch": 2.90281134715685, + "ewc_loss": 0.08317701518535614, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004484693636186421, + "grad_norm": 9.651850700378418, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.877988874912262, + "num_tokens": 870657828.0, + "step": 22819 + }, + { + "epoch": 2.9029385574354407, + "ewc_loss": 0.08378748595714569, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046189824934117496, + "grad_norm": 9.916279792785645, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8595490455627441, + "num_tokens": 870694786.0, + "step": 22820 + }, + { + "epoch": 2.9030657677140312, + "ewc_loss": 0.08305400609970093, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00044723923201672733, + "grad_norm": 9.63782787322998, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8835639953613281, + "num_tokens": 870733850.0, + "step": 22821 + }, + { + "epoch": 2.9031929779926218, + "ewc_loss": 0.08433711528778076, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00046251178719103336, + "grad_norm": 9.877320289611816, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8660439252853394, + "num_tokens": 870771556.0, + "step": 22822 + }, + { + "epoch": 2.9033201882712123, + "ewc_loss": 0.08327294886112213, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00044942874228581786, + "grad_norm": 9.689603805541992, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.875677227973938, + "num_tokens": 870809335.0, + "step": 22823 + }, + { + "epoch": 2.903447398549803, + "ewc_loss": 0.08386294543743134, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045777010382153094, + "grad_norm": 9.85801887512207, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8609201908111572, + "num_tokens": 870844239.0, + "step": 22824 + }, + { + "epoch": 2.9035746088283934, + "ewc_loss": 0.08343939483165741, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.000451093161245808, + "grad_norm": 9.63425350189209, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8654321432113647, + "num_tokens": 870879540.0, + "step": 22825 + }, + { + "epoch": 2.903701819106984, + "ewc_loss": 0.08407765626907349, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004599171516019851, + "grad_norm": 9.883118629455566, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8642253279685974, + "num_tokens": 870917219.0, + "step": 22826 + }, + { + "epoch": 2.9038290293855744, + "ewc_loss": 0.08286403119564056, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004477808834053576, + "grad_norm": 9.618195533752441, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8668476343154907, + "num_tokens": 870952332.0, + "step": 22827 + }, + { + "epoch": 2.903956239664165, + "ewc_loss": 0.08364908397197723, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046051430399529636, + "grad_norm": 9.845690727233887, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8520814180374146, + "num_tokens": 870991609.0, + "step": 22828 + }, + { + "epoch": 2.9040834499427555, + "ewc_loss": 0.08305282145738602, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004496688488870859, + "grad_norm": 9.651172637939453, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8686797618865967, + "num_tokens": 871027456.0, + "step": 22829 + }, + { + "epoch": 2.904210660221346, + "ewc_loss": 0.08404717594385147, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004596123762894422, + "grad_norm": 9.867854118347168, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8562942743301392, + "num_tokens": 871071833.0, + "step": 22830 + }, + { + "epoch": 2.9043378704999365, + "ewc_loss": 0.08318221569061279, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004485213721636683, + "grad_norm": 9.61587142944336, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.863661527633667, + "num_tokens": 871105446.0, + "step": 22831 + }, + { + "epoch": 2.904465080778527, + "ewc_loss": 0.0841461718082428, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.000460602343082428, + "grad_norm": 9.917975425720215, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8633852005004883, + "num_tokens": 871141408.0, + "step": 22832 + }, + { + "epoch": 2.9045922910571176, + "ewc_loss": 0.08271268010139465, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.000446267455117777, + "grad_norm": 9.61236572265625, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8673272728919983, + "num_tokens": 871173296.0, + "step": 22833 + }, + { + "epoch": 2.904719501335708, + "ewc_loss": 0.08413537591695786, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00046049439697526395, + "grad_norm": 9.840386390686035, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8612797260284424, + "num_tokens": 871210711.0, + "step": 22834 + }, + { + "epoch": 2.904846711614298, + "ewc_loss": 0.08239205926656723, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044794403947889805, + "grad_norm": 9.670412063598633, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.861264169216156, + "num_tokens": 871251179.0, + "step": 22835 + }, + { + "epoch": 2.904973921892889, + "ewc_loss": 0.08416386693716049, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00045833789044991136, + "grad_norm": 9.766374588012695, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8847333788871765, + "num_tokens": 871295758.0, + "step": 22836 + }, + { + "epoch": 2.9051011321714793, + "ewc_loss": 0.0834130197763443, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00045082945143803954, + "grad_norm": 9.710236549377441, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8490327596664429, + "num_tokens": 871331145.0, + "step": 22837 + }, + { + "epoch": 2.9052283424500702, + "ewc_loss": 0.08373419940471649, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004540412628557533, + "grad_norm": 9.742929458618164, + "learning_rate": 1e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8402685523033142, + "num_tokens": 871372819.0, + "step": 22838 + }, + { + "epoch": 2.9053555527286603, + "ewc_loss": 0.08375825732946396, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004542818060144782, + "grad_norm": 11.659618377685547, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8594223856925964, + "num_tokens": 871414223.0, + "step": 22839 + }, + { + "epoch": 2.905482763007251, + "ewc_loss": 0.08211056143045425, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.00043780484702438116, + "grad_norm": 9.2945556640625, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.86899733543396, + "num_tokens": 871455013.0, + "step": 22840 + }, + { + "epoch": 2.9056099732858414, + "ewc_loss": 0.08871663361787796, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0005038655363023281, + "grad_norm": 10.528313636779785, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8640220165252686, + "num_tokens": 871495462.0, + "step": 22841 + }, + { + "epoch": 2.905737183564432, + "ewc_loss": 0.08200570940971375, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004367562651168555, + "grad_norm": 9.313618659973145, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8673872947692871, + "num_tokens": 871534999.0, + "step": 22842 + }, + { + "epoch": 2.9058643938430224, + "ewc_loss": 0.0891990214586258, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0005086893797852099, + "grad_norm": 10.573687553405762, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8654106855392456, + "num_tokens": 871569816.0, + "step": 22843 + }, + { + "epoch": 2.905991604121613, + "ewc_loss": 0.08301384747028351, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004492791194934398, + "grad_norm": 9.546043395996094, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8676964044570923, + "num_tokens": 871610441.0, + "step": 22844 + }, + { + "epoch": 2.9061188144002035, + "ewc_loss": 0.08823392540216446, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004990384913980961, + "grad_norm": 10.444913864135742, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8509373664855957, + "num_tokens": 871649005.0, + "step": 22845 + }, + { + "epoch": 2.906246024678794, + "ewc_loss": 0.08362063765525818, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045534694800153375, + "grad_norm": 9.660989761352539, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 871689263.0, + "step": 22846 + }, + { + "epoch": 2.9063732349573845, + "ewc_loss": 0.08690740168094635, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.000488214660435915, + "grad_norm": 10.269864082336426, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8739053010940552, + "num_tokens": 871725440.0, + "step": 22847 + }, + { + "epoch": 2.906500445235975, + "ewc_loss": 0.08382557332515717, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.000457396381534636, + "grad_norm": 9.769044876098633, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8623567819595337, + "num_tokens": 871767839.0, + "step": 22848 + }, + { + "epoch": 2.9066276555145656, + "ewc_loss": 0.08572721481323242, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00047641273704357445, + "grad_norm": 10.102148056030273, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8690574765205383, + "num_tokens": 871801516.0, + "step": 22849 + }, + { + "epoch": 2.906754865793156, + "ewc_loss": 0.08349935710430145, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004590169992297888, + "grad_norm": 9.835237503051758, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8609066605567932, + "num_tokens": 871840359.0, + "step": 22850 + }, + { + "epoch": 2.9068820760717466, + "ewc_loss": 0.08419743180274963, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004659977275878191, + "grad_norm": 9.977703094482422, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8694865703582764, + "num_tokens": 871879396.0, + "step": 22851 + }, + { + "epoch": 2.907009286350337, + "ewc_loss": 0.08370622247457504, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004586442664731294, + "grad_norm": 9.823840141296387, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.859017014503479, + "num_tokens": 871915043.0, + "step": 22852 + }, + { + "epoch": 2.9071364966289277, + "ewc_loss": 0.08397822827100754, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004613643104676157, + "grad_norm": 9.9061861038208, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8596769571304321, + "num_tokens": 871953391.0, + "step": 22853 + }, + { + "epoch": 2.9072637069075182, + "ewc_loss": 0.083470419049263, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045628624502569437, + "grad_norm": 9.77296257019043, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8681627511978149, + "num_tokens": 871989879.0, + "step": 22854 + }, + { + "epoch": 2.9073909171861088, + "ewc_loss": 0.08327914774417877, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004568149452097714, + "grad_norm": 9.818086624145508, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8674169182777405, + "num_tokens": 872022428.0, + "step": 22855 + }, + { + "epoch": 2.9075181274646993, + "ewc_loss": 0.08320730924606323, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045609657536260784, + "grad_norm": 9.788793563842773, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8706471920013428, + "num_tokens": 872060644.0, + "step": 22856 + }, + { + "epoch": 2.90764533774329, + "ewc_loss": 0.08315469324588776, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045557040721178055, + "grad_norm": 9.807693481445312, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8617935180664062, + "num_tokens": 872098230.0, + "step": 22857 + }, + { + "epoch": 2.90777254802188, + "ewc_loss": 0.08375909924507141, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004542901588138193, + "grad_norm": 9.82010269165039, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.853718638420105, + "num_tokens": 872131217.0, + "step": 22858 + }, + { + "epoch": 2.907899758300471, + "ewc_loss": 0.0831117331981659, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004526993725448847, + "grad_norm": 9.792651176452637, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8728079199790955, + "num_tokens": 872163912.0, + "step": 22859 + }, + { + "epoch": 2.908026968579061, + "ewc_loss": 0.08299440145492554, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004539673973340541, + "grad_norm": 9.80122184753418, + "learning_rate": 1e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.8412714004516602, + "num_tokens": 872200694.0, + "step": 22860 + }, + { + "epoch": 2.908154178857652, + "ewc_loss": 0.08287335932254791, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045275702723301947, + "grad_norm": 9.744508743286133, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8652934432029724, + "num_tokens": 872241156.0, + "step": 22861 + }, + { + "epoch": 2.908281389136242, + "ewc_loss": 0.08303381502628326, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004543615796137601, + "grad_norm": 9.855561256408691, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8669086694717407, + "num_tokens": 872273048.0, + "step": 22862 + }, + { + "epoch": 2.908408599414833, + "ewc_loss": 0.08264876902103424, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045051114284433424, + "grad_norm": 9.724784851074219, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8693206310272217, + "num_tokens": 872312165.0, + "step": 22863 + }, + { + "epoch": 2.908535809693423, + "ewc_loss": 0.08323617279529572, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045394376502372324, + "grad_norm": 9.715958595275879, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8673806190490723, + "num_tokens": 872352740.0, + "step": 22864 + }, + { + "epoch": 2.9086630199720136, + "ewc_loss": 0.08296757936477661, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004536992055363953, + "grad_norm": 9.817276954650879, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8517848253250122, + "num_tokens": 872397800.0, + "step": 22865 + }, + { + "epoch": 2.908790230250604, + "ewc_loss": 0.08306752890348434, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045225731446407735, + "grad_norm": 9.742249488830566, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8776422739028931, + "num_tokens": 872429307.0, + "step": 22866 + }, + { + "epoch": 2.9089174405291947, + "ewc_loss": 0.08318287134170532, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045341072836890817, + "grad_norm": 9.746393203735352, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8613578081130981, + "num_tokens": 872465897.0, + "step": 22867 + }, + { + "epoch": 2.909044650807785, + "ewc_loss": 0.08307188749313354, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004523008828982711, + "grad_norm": 9.75255298614502, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8553497195243835, + "num_tokens": 872503252.0, + "step": 22868 + }, + { + "epoch": 2.9091718610863757, + "ewc_loss": 0.08304200321435928, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004520020738709718, + "grad_norm": 9.693889617919922, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8678559064865112, + "num_tokens": 872546276.0, + "step": 22869 + }, + { + "epoch": 2.9092990713649662, + "ewc_loss": 0.08300669491291046, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045409039012156427, + "grad_norm": 9.758482933044434, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8632113933563232, + "num_tokens": 872585690.0, + "step": 22870 + }, + { + "epoch": 2.9094262816435568, + "ewc_loss": 0.08273576945066452, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045138111454434693, + "grad_norm": 9.742443084716797, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8547371029853821, + "num_tokens": 872621718.0, + "step": 22871 + }, + { + "epoch": 2.9095534919221473, + "ewc_loss": 0.08308371901512146, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045241921907290816, + "grad_norm": 9.66937255859375, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8685804009437561, + "num_tokens": 872659349.0, + "step": 22872 + }, + { + "epoch": 2.909680702200738, + "ewc_loss": 0.08298175036907196, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004538409411907196, + "grad_norm": 9.724353790283203, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8617345690727234, + "num_tokens": 872704468.0, + "step": 22873 + }, + { + "epoch": 2.9098079124793284, + "ewc_loss": 0.08280074596405029, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045203088666312397, + "grad_norm": 9.76552677154541, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8651407957077026, + "num_tokens": 872740003.0, + "step": 22874 + }, + { + "epoch": 2.909935122757919, + "ewc_loss": 0.08284762501716614, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004524997202679515, + "grad_norm": 9.732934951782227, + "learning_rate": 1e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8436218500137329, + "num_tokens": 872780362.0, + "step": 22875 + }, + { + "epoch": 2.9100623330365094, + "ewc_loss": 0.08325716853141785, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004541536618489772, + "grad_norm": 9.754541397094727, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8726822137832642, + "num_tokens": 872823719.0, + "step": 22876 + }, + { + "epoch": 2.9101895433151, + "ewc_loss": 0.0830225944519043, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045180792221799493, + "grad_norm": 9.768293380737305, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8680688738822937, + "num_tokens": 872856276.0, + "step": 22877 + }, + { + "epoch": 2.9103167535936905, + "ewc_loss": 0.08316958695650101, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045327789848670363, + "grad_norm": 9.76015853881836, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8573093414306641, + "num_tokens": 872893688.0, + "step": 22878 + }, + { + "epoch": 2.910443963872281, + "ewc_loss": 0.0831829085946083, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045341113582253456, + "grad_norm": 9.79326057434082, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8839787244796753, + "num_tokens": 872931169.0, + "step": 22879 + }, + { + "epoch": 2.9105711741508715, + "ewc_loss": 0.08312179893255234, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045280001359060407, + "grad_norm": 9.752896308898926, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8602902889251709, + "num_tokens": 872970836.0, + "step": 22880 + }, + { + "epoch": 2.910698384429462, + "ewc_loss": 0.083079993724823, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000452381995273754, + "grad_norm": 11.708656311035156, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8793886303901672, + "num_tokens": 873010879.0, + "step": 22881 + }, + { + "epoch": 2.9108255947080526, + "ewc_loss": 0.08168971538543701, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00043847921187989414, + "grad_norm": 9.332277297973633, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.866378128528595, + "num_tokens": 873058018.0, + "step": 22882 + }, + { + "epoch": 2.9109528049866427, + "ewc_loss": 0.08811706304550171, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005027526640333235, + "grad_norm": 10.68371295928955, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8551289439201355, + "num_tokens": 873096259.0, + "step": 22883 + }, + { + "epoch": 2.9110800152652336, + "ewc_loss": 0.08153779804706573, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00043696005013771355, + "grad_norm": 9.4087553024292, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8549357652664185, + "num_tokens": 873131955.0, + "step": 22884 + }, + { + "epoch": 2.9112072255438237, + "ewc_loss": 0.08893224596977234, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005109044723212719, + "grad_norm": 10.651611328125, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8603388071060181, + "num_tokens": 873171024.0, + "step": 22885 + }, + { + "epoch": 2.9113344358224147, + "ewc_loss": 0.0827779471874237, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044936148333363235, + "grad_norm": 9.609189987182617, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8688617944717407, + "num_tokens": 873210568.0, + "step": 22886 + }, + { + "epoch": 2.9114616461010048, + "ewc_loss": 0.08754976093769073, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004970796289853752, + "grad_norm": 10.48644733428955, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8683211207389832, + "num_tokens": 873250025.0, + "step": 22887 + }, + { + "epoch": 2.9115888563795957, + "ewc_loss": 0.08334434032440186, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045502546709030867, + "grad_norm": 9.797113418579102, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8576388359069824, + "num_tokens": 873285284.0, + "step": 22888 + }, + { + "epoch": 2.911716066658186, + "ewc_loss": 0.08607716858386993, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004823537601623684, + "grad_norm": 10.310712814331055, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8674303889274597, + "num_tokens": 873320102.0, + "step": 22889 + }, + { + "epoch": 2.9118432769367764, + "ewc_loss": 0.08351245522499084, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000456706591648981, + "grad_norm": 9.81502914428711, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8686855435371399, + "num_tokens": 873354668.0, + "step": 22890 + }, + { + "epoch": 2.911970487215367, + "ewc_loss": 0.08491462469100952, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004707282641902566, + "grad_norm": 10.145833969116211, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8618146181106567, + "num_tokens": 873387415.0, + "step": 22891 + }, + { + "epoch": 2.9120976974939574, + "ewc_loss": 0.08353307098150253, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000456912734080106, + "grad_norm": 9.845406532287598, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8839014768600464, + "num_tokens": 873428679.0, + "step": 22892 + }, + { + "epoch": 2.912224907772548, + "ewc_loss": 0.08424396812915802, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046402172301895916, + "grad_norm": 9.986040115356445, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.865426242351532, + "num_tokens": 873466973.0, + "step": 22893 + }, + { + "epoch": 2.9123521180511385, + "ewc_loss": 0.08306674659252167, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004546908603515476, + "grad_norm": 9.761682510375977, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8793026208877563, + "num_tokens": 873510957.0, + "step": 22894 + }, + { + "epoch": 2.912479328329729, + "ewc_loss": 0.08374527841806412, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046147621469572186, + "grad_norm": 10.055622100830078, + "learning_rate": 1e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8437206745147705, + "num_tokens": 873546456.0, + "step": 22895 + }, + { + "epoch": 2.9126065386083195, + "ewc_loss": 0.08282947540283203, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045231819967739284, + "grad_norm": 9.747751235961914, + "learning_rate": 1e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.849524736404419, + "num_tokens": 873591972.0, + "step": 22896 + }, + { + "epoch": 2.91273374888691, + "ewc_loss": 0.0839671865105629, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046369529445655644, + "grad_norm": 10.126087188720703, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.862837553024292, + "num_tokens": 873629790.0, + "step": 22897 + }, + { + "epoch": 2.9128609591655006, + "ewc_loss": 0.0822240561246872, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044626399176195264, + "grad_norm": 9.635527610778809, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.865610659122467, + "num_tokens": 873667419.0, + "step": 22898 + }, + { + "epoch": 2.912988169444091, + "ewc_loss": 0.0841946229338646, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004659696714952588, + "grad_norm": 10.051397323608398, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8747568130493164, + "num_tokens": 873705834.0, + "step": 22899 + }, + { + "epoch": 2.9131153797226816, + "ewc_loss": 0.082365021109581, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00044767369399778545, + "grad_norm": 9.716375350952148, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8565064072608948, + "num_tokens": 873748859.0, + "step": 22900 + }, + { + "epoch": 2.913242590001272, + "ewc_loss": 0.0838630199432373, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046265366836450994, + "grad_norm": 9.975860595703125, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8548336029052734, + "num_tokens": 873788068.0, + "step": 22901 + }, + { + "epoch": 2.9133698002798627, + "ewc_loss": 0.08264052867889404, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004504287207964808, + "grad_norm": 9.70120620727539, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8723094463348389, + "num_tokens": 873827820.0, + "step": 22902 + }, + { + "epoch": 2.9134970105584532, + "ewc_loss": 0.08392511308193207, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004608331946656108, + "grad_norm": 9.93855094909668, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8629597425460815, + "num_tokens": 873866271.0, + "step": 22903 + }, + { + "epoch": 2.9136242208370438, + "ewc_loss": 0.082684226334095, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004508656857069582, + "grad_norm": 9.750244140625, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8606898784637451, + "num_tokens": 873907409.0, + "step": 22904 + }, + { + "epoch": 2.9137514311156343, + "ewc_loss": 0.08331097662448883, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004571332538034767, + "grad_norm": 9.862228393554688, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8720093369483948, + "num_tokens": 873947940.0, + "step": 22905 + }, + { + "epoch": 2.913878641394225, + "ewc_loss": 0.08302263915538788, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045180838787928224, + "grad_norm": 9.732515335083008, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8509951829910278, + "num_tokens": 873990062.0, + "step": 22906 + }, + { + "epoch": 2.9140058516728153, + "ewc_loss": 0.0834282785654068, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045586476335301995, + "grad_norm": 9.859658241271973, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8754695653915405, + "num_tokens": 874026660.0, + "step": 22907 + }, + { + "epoch": 2.9141330619514054, + "ewc_loss": 0.08276151120662689, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045163859613239765, + "grad_norm": 9.765106201171875, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.872061014175415, + "num_tokens": 874064935.0, + "step": 22908 + }, + { + "epoch": 2.9142602722299964, + "ewc_loss": 0.08317124843597412, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045573594979941845, + "grad_norm": 9.836119651794434, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8646121025085449, + "num_tokens": 874102807.0, + "step": 22909 + }, + { + "epoch": 2.9143874825085865, + "ewc_loss": 0.08288279920816422, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004528514400590211, + "grad_norm": 9.773447036743164, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8675607442855835, + "num_tokens": 874139688.0, + "step": 22910 + }, + { + "epoch": 2.9145146927871775, + "ewc_loss": 0.08320613205432892, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004560847592074424, + "grad_norm": 9.86745548248291, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.87065589427948, + "num_tokens": 874184505.0, + "step": 22911 + }, + { + "epoch": 2.9146419030657675, + "ewc_loss": 0.08292972296476364, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004533206520136446, + "grad_norm": 9.772564888000488, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8654845356941223, + "num_tokens": 874223210.0, + "step": 22912 + }, + { + "epoch": 2.914769113344358, + "ewc_loss": 0.08326227217912674, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004566461720969528, + "grad_norm": 9.916125297546387, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8522725105285645, + "num_tokens": 874263939.0, + "step": 22913 + }, + { + "epoch": 2.9148963236229486, + "ewc_loss": 0.08274249732494354, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004514484608080238, + "grad_norm": 9.784643173217773, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8733843564987183, + "num_tokens": 874303946.0, + "step": 22914 + }, + { + "epoch": 2.915023533901539, + "ewc_loss": 0.08335082232952118, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004575316561385989, + "grad_norm": 9.884956359863281, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8661308288574219, + "num_tokens": 874343701.0, + "step": 22915 + }, + { + "epoch": 2.9151507441801296, + "ewc_loss": 0.08271624147891998, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004511857987381518, + "grad_norm": 9.785622596740723, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.867581844329834, + "num_tokens": 874380425.0, + "step": 22916 + }, + { + "epoch": 2.91527795445872, + "ewc_loss": 0.08311191201210022, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000455142610007897, + "grad_norm": 9.806365966796875, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8664068579673767, + "num_tokens": 874417643.0, + "step": 22917 + }, + { + "epoch": 2.9154051647373107, + "ewc_loss": 0.0829658955335617, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004536823835223913, + "grad_norm": 9.77814769744873, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8572312593460083, + "num_tokens": 874463939.0, + "step": 22918 + }, + { + "epoch": 2.9155323750159012, + "ewc_loss": 0.08312922716140747, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045531574869528413, + "grad_norm": 9.855111122131348, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8614450693130493, + "num_tokens": 874503156.0, + "step": 22919 + }, + { + "epoch": 2.9156595852944918, + "ewc_loss": 0.08294100314378738, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004534334584604949, + "grad_norm": 9.732580184936523, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8539078235626221, + "num_tokens": 874546754.0, + "step": 22920 + }, + { + "epoch": 2.9157867955730823, + "ewc_loss": 0.083218514919281, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004562085960060358, + "grad_norm": 9.87625503540039, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8683885335922241, + "num_tokens": 874590781.0, + "step": 22921 + }, + { + "epoch": 2.915914005851673, + "ewc_loss": 0.08272357285022736, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004512591112870723, + "grad_norm": 9.821481704711914, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.875800371170044, + "num_tokens": 874629592.0, + "step": 22922 + }, + { + "epoch": 2.9160412161302633, + "ewc_loss": 0.08323588967323303, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045638237497769296, + "grad_norm": 9.806770324707031, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8710064888000488, + "num_tokens": 874667372.0, + "step": 22923 + }, + { + "epoch": 2.916168426408854, + "ewc_loss": 0.08306718617677689, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.000454695284133777, + "grad_norm": 9.797154426574707, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8622581362724304, + "num_tokens": 874704687.0, + "step": 22924 + }, + { + "epoch": 2.9162956366874444, + "ewc_loss": 0.08334222435951233, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004550043086055666, + "grad_norm": 9.961344718933105, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8639551997184753, + "num_tokens": 874740471.0, + "step": 22925 + }, + { + "epoch": 2.916422846966035, + "ewc_loss": 0.08279384672641754, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00045196188148111105, + "grad_norm": 9.69692611694336, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8703054189682007, + "num_tokens": 874780747.0, + "step": 22926 + }, + { + "epoch": 2.9165500572446255, + "ewc_loss": 0.0836266353726387, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004602897970471531, + "grad_norm": 9.982776641845703, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8692119121551514, + "num_tokens": 874814204.0, + "step": 22927 + }, + { + "epoch": 2.916677267523216, + "ewc_loss": 0.08221977949142456, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.0004462212382350117, + "grad_norm": 9.7025728225708, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8649516105651855, + "num_tokens": 874851872.0, + "step": 22928 + }, + { + "epoch": 2.9168044778018065, + "ewc_loss": 0.08410686999559402, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004626507288776338, + "grad_norm": 9.920671463012695, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8502280712127686, + "num_tokens": 874884215.0, + "step": 22929 + }, + { + "epoch": 2.916931688080397, + "ewc_loss": 0.0828123688697815, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044970575254410505, + "grad_norm": 9.736065864562988, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8624005317687988, + "num_tokens": 874919047.0, + "step": 22930 + }, + { + "epoch": 2.917058898358987, + "ewc_loss": 0.08379361033439636, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004595181089825928, + "grad_norm": 9.996868133544922, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8586500883102417, + "num_tokens": 874956469.0, + "step": 22931 + }, + { + "epoch": 2.917186108637578, + "ewc_loss": 0.08288789540529251, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004504609969444573, + "grad_norm": 9.719196319580078, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8760840892791748, + "num_tokens": 874994633.0, + "step": 22932 + }, + { + "epoch": 2.917313318916168, + "ewc_loss": 0.08401307463645935, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046171274152584374, + "grad_norm": 9.993690490722656, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8779038190841675, + "num_tokens": 875028530.0, + "step": 22933 + }, + { + "epoch": 2.917440529194759, + "ewc_loss": 0.08251743018627167, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004467563412617892, + "grad_norm": 9.7184419631958, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8575731515884399, + "num_tokens": 875059785.0, + "step": 22934 + }, + { + "epoch": 2.9175677394733492, + "ewc_loss": 0.08396422117948532, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046122423373162746, + "grad_norm": 9.927495002746582, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8635055422782898, + "num_tokens": 875096057.0, + "step": 22935 + }, + { + "epoch": 2.91769494975194, + "ewc_loss": 0.08286955952644348, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004502776719164103, + "grad_norm": 9.671287536621094, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8713183403015137, + "num_tokens": 875141988.0, + "step": 22936 + }, + { + "epoch": 2.9178221600305303, + "ewc_loss": 0.0841032862663269, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004626148729585111, + "grad_norm": 10.018186569213867, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8701131939888, + "num_tokens": 875176973.0, + "step": 22937 + }, + { + "epoch": 2.917949370309121, + "ewc_loss": 0.08260461688041687, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044762814650312066, + "grad_norm": 9.628090858459473, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8732597827911377, + "num_tokens": 875209271.0, + "step": 22938 + }, + { + "epoch": 2.9180765805877114, + "ewc_loss": 0.08433511853218079, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046493319678120315, + "grad_norm": 10.049470901489258, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8698415756225586, + "num_tokens": 875247460.0, + "step": 22939 + }, + { + "epoch": 2.918203790866302, + "ewc_loss": 0.08228158950805664, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004443979705683887, + "grad_norm": 9.590191841125488, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8638893365859985, + "num_tokens": 875286013.0, + "step": 22940 + }, + { + "epoch": 2.9183310011448924, + "ewc_loss": 0.08458550274372101, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004674370284192264, + "grad_norm": 10.038420677185059, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8598384857177734, + "num_tokens": 875324547.0, + "step": 22941 + }, + { + "epoch": 2.918458211423483, + "ewc_loss": 0.08243566751480103, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044593875645659864, + "grad_norm": 9.596278190612793, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.856797456741333, + "num_tokens": 875366655.0, + "step": 22942 + }, + { + "epoch": 2.9185854217020735, + "ewc_loss": 0.08458025753498077, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004673846415244043, + "grad_norm": 10.106431007385254, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8595991730690002, + "num_tokens": 875408571.0, + "step": 22943 + }, + { + "epoch": 2.918712631980664, + "ewc_loss": 0.08248896896839142, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004464716766960919, + "grad_norm": 9.602238655090332, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.859237790107727, + "num_tokens": 875450730.0, + "step": 22944 + }, + { + "epoch": 2.9188398422592545, + "ewc_loss": 0.0846979022026062, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046856101835146546, + "grad_norm": 10.051141738891602, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8679447770118713, + "num_tokens": 875486850.0, + "step": 22945 + }, + { + "epoch": 2.918967052537845, + "ewc_loss": 0.08264358341693878, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044801790500059724, + "grad_norm": 9.714022636413574, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8580499887466431, + "num_tokens": 875528653.0, + "step": 22946 + }, + { + "epoch": 2.9190942628164356, + "ewc_loss": 0.08408566564321518, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046243867836892605, + "grad_norm": 9.885632514953613, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8755365014076233, + "num_tokens": 875564155.0, + "step": 22947 + }, + { + "epoch": 2.919221473095026, + "ewc_loss": 0.08301995694637299, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045178155414760113, + "grad_norm": 9.767815589904785, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8650648593902588, + "num_tokens": 875598548.0, + "step": 22948 + }, + { + "epoch": 2.9193486833736166, + "ewc_loss": 0.08360110968351364, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045759312342852354, + "grad_norm": 9.881040573120117, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8813362121582031, + "num_tokens": 875634587.0, + "step": 22949 + }, + { + "epoch": 2.919475893652207, + "ewc_loss": 0.08309600502252579, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045254206634126604, + "grad_norm": 9.731853485107422, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.86341392993927, + "num_tokens": 875671956.0, + "step": 22950 + }, + { + "epoch": 2.9196031039307977, + "ewc_loss": 0.08348681032657623, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004564500995911658, + "grad_norm": 9.912142753601074, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8669632077217102, + "num_tokens": 875706327.0, + "step": 22951 + }, + { + "epoch": 2.9197303142093882, + "ewc_loss": 0.08308520913124084, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045243409113027155, + "grad_norm": 9.717133522033691, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8716564774513245, + "num_tokens": 875749320.0, + "step": 22952 + }, + { + "epoch": 2.9198575244879788, + "ewc_loss": 0.08347176015377045, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004562996618915349, + "grad_norm": 9.83170223236084, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8753339052200317, + "num_tokens": 875783871.0, + "step": 22953 + }, + { + "epoch": 2.9199847347665693, + "ewc_loss": 0.08297255635261536, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045130756916478276, + "grad_norm": 9.718588829040527, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8649051189422607, + "num_tokens": 875821716.0, + "step": 22954 + }, + { + "epoch": 2.92011194504516, + "ewc_loss": 0.08364004641771317, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004579825035762042, + "grad_norm": 9.912858963012695, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8684217929840088, + "num_tokens": 875854339.0, + "step": 22955 + }, + { + "epoch": 2.92023915532375, + "ewc_loss": 0.08275720477104187, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004491540603339672, + "grad_norm": 9.65390396118164, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8656952381134033, + "num_tokens": 875889456.0, + "step": 22956 + }, + { + "epoch": 2.920366365602341, + "ewc_loss": 0.0840008407831192, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046159038902260363, + "grad_norm": 9.916755676269531, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8565641641616821, + "num_tokens": 875926659.0, + "step": 22957 + }, + { + "epoch": 2.920493575880931, + "ewc_loss": 0.08293494582176208, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004509315185714513, + "grad_norm": 9.692351341247559, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.874259889125824, + "num_tokens": 875972766.0, + "step": 22958 + }, + { + "epoch": 2.920620786159522, + "ewc_loss": 0.08388777077198029, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004604597052093595, + "grad_norm": 9.91193675994873, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8601826429367065, + "num_tokens": 876011824.0, + "step": 22959 + }, + { + "epoch": 2.920747996438112, + "ewc_loss": 0.08293430507183075, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045092508662492037, + "grad_norm": 9.71766471862793, + "learning_rate": 1e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8461101055145264, + "num_tokens": 876050353.0, + "step": 22960 + }, + { + "epoch": 2.920875206716703, + "ewc_loss": 0.08402430266141891, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004618250532075763, + "grad_norm": 9.950922012329102, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8507529497146606, + "num_tokens": 876089358.0, + "step": 22961 + }, + { + "epoch": 2.921002416995293, + "ewc_loss": 0.08266658335924149, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004482478543650359, + "grad_norm": 9.65353012084961, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8508990406990051, + "num_tokens": 876126287.0, + "step": 22962 + }, + { + "epoch": 2.9211296272738836, + "ewc_loss": 0.08390998840332031, + "ewc_loss_diag": 3.7670135498046875e-05, + "ewc_loss_parallel": 0.00046312331687659025, + "grad_norm": 9.933342933654785, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8752989172935486, + "num_tokens": 876165157.0, + "step": 22963 + }, + { + "epoch": 2.921256837552474, + "ewc_loss": 0.08264059573411942, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004479879862628877, + "grad_norm": 9.616605758666992, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8658477067947388, + "num_tokens": 876201391.0, + "step": 22964 + }, + { + "epoch": 2.9213840478310646, + "ewc_loss": 0.08409380912780762, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000462520110886544, + "grad_norm": 9.8944091796875, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8719642162322998, + "num_tokens": 876243004.0, + "step": 22965 + }, + { + "epoch": 2.921511258109655, + "ewc_loss": 0.08284348249435425, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000450016901595518, + "grad_norm": 9.676514625549316, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8681275248527527, + "num_tokens": 876279245.0, + "step": 22966 + }, + { + "epoch": 2.9216384683882457, + "ewc_loss": 0.08409126102924347, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046249464503489435, + "grad_norm": 9.986952781677246, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.864112377166748, + "num_tokens": 876312145.0, + "step": 22967 + }, + { + "epoch": 2.9217656786668362, + "ewc_loss": 0.08261904120445251, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044777244329452515, + "grad_norm": 9.618709564208984, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8663476705551147, + "num_tokens": 876351354.0, + "step": 22968 + }, + { + "epoch": 2.9218928889454268, + "ewc_loss": 0.08423100411891937, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046389209455810487, + "grad_norm": 9.991034507751465, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.854336142539978, + "num_tokens": 876388621.0, + "step": 22969 + }, + { + "epoch": 2.9220200992240173, + "ewc_loss": 0.08253921568393707, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044697424164041877, + "grad_norm": 9.718788146972656, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8580042123794556, + "num_tokens": 876418377.0, + "step": 22970 + }, + { + "epoch": 2.922147309502608, + "ewc_loss": 0.08393905311822891, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046097254380583763, + "grad_norm": 9.930272102355957, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8680387735366821, + "num_tokens": 876453351.0, + "step": 22971 + }, + { + "epoch": 2.9222745197811983, + "ewc_loss": 0.08278985321521759, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004494805180002004, + "grad_norm": 9.884567260742188, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8711796402931213, + "num_tokens": 876484830.0, + "step": 22972 + }, + { + "epoch": 2.922401730059789, + "ewc_loss": 0.083139568567276, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004529776924755424, + "grad_norm": 9.728461265563965, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8761961460113525, + "num_tokens": 876524318.0, + "step": 22973 + }, + { + "epoch": 2.9225289403383794, + "ewc_loss": 0.08330675959587097, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045464964932762086, + "grad_norm": 9.791603088378906, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8624914884567261, + "num_tokens": 876562421.0, + "step": 22974 + }, + { + "epoch": 2.92265615061697, + "ewc_loss": 0.08303671330213547, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045194916310720146, + "grad_norm": 9.750722885131836, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8587442636489868, + "num_tokens": 876605681.0, + "step": 22975 + }, + { + "epoch": 2.9227833608955605, + "ewc_loss": 0.08313177525997162, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045289978152140975, + "grad_norm": 9.753230094909668, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8548640012741089, + "num_tokens": 876647371.0, + "step": 22976 + }, + { + "epoch": 2.922910571174151, + "ewc_loss": 0.08302822709083557, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045186426723375916, + "grad_norm": 9.725325584411621, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8700164556503296, + "num_tokens": 876685563.0, + "step": 22977 + }, + { + "epoch": 2.9230377814527415, + "ewc_loss": 0.08328017592430115, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045438375673256814, + "grad_norm": 9.826937675476074, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8609115481376648, + "num_tokens": 876718404.0, + "step": 22978 + }, + { + "epoch": 2.923164991731332, + "ewc_loss": 0.08280504494905472, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000449632469099015, + "grad_norm": 9.69996452331543, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8665934801101685, + "num_tokens": 876751888.0, + "step": 22979 + }, + { + "epoch": 2.9232922020099226, + "ewc_loss": 0.0836455374956131, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004580374516081065, + "grad_norm": 9.88576889038086, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.869949221611023, + "num_tokens": 876786813.0, + "step": 22980 + }, + { + "epoch": 2.9234194122885127, + "ewc_loss": 0.08269408345222473, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044852285645902157, + "grad_norm": 9.695818901062012, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.874770998954773, + "num_tokens": 876824511.0, + "step": 22981 + }, + { + "epoch": 2.9235466225671036, + "ewc_loss": 0.0836365669965744, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045794775360263884, + "grad_norm": 9.900712966918945, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8679518699645996, + "num_tokens": 876856409.0, + "step": 22982 + }, + { + "epoch": 2.9236738328456937, + "ewc_loss": 0.08275000751018524, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044908211566507816, + "grad_norm": 9.750693321228027, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8493545651435852, + "num_tokens": 876889204.0, + "step": 22983 + }, + { + "epoch": 2.9238010431242847, + "ewc_loss": 0.08338427543640137, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045542482985183597, + "grad_norm": 9.873785972595215, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8665128946304321, + "num_tokens": 876927185.0, + "step": 22984 + }, + { + "epoch": 2.9239282534028748, + "ewc_loss": 0.08288000524044037, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004503820964600891, + "grad_norm": 9.889520645141602, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8624692559242249, + "num_tokens": 876965744.0, + "step": 22985 + }, + { + "epoch": 2.9240554636814657, + "ewc_loss": 0.0828883945941925, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004504660319071263, + "grad_norm": 9.748414039611816, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8713709712028503, + "num_tokens": 876997107.0, + "step": 22986 + }, + { + "epoch": 2.924182673960056, + "ewc_loss": 0.08321808278560638, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045376288471743464, + "grad_norm": 9.794858932495117, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8556710481643677, + "num_tokens": 877037894.0, + "step": 22987 + }, + { + "epoch": 2.9243098842386464, + "ewc_loss": 0.08296197652816772, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004512017476372421, + "grad_norm": 9.767420768737793, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8528367877006531, + "num_tokens": 877078724.0, + "step": 22988 + }, + { + "epoch": 2.924437094517237, + "ewc_loss": 0.08314846456050873, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004530666919890791, + "grad_norm": 9.752375602722168, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8651427030563354, + "num_tokens": 877119685.0, + "step": 22989 + }, + { + "epoch": 2.9245643047958274, + "ewc_loss": 0.08313890546560287, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045297108590602875, + "grad_norm": 9.838173866271973, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8681378364562988, + "num_tokens": 877157055.0, + "step": 22990 + }, + { + "epoch": 2.924691515074418, + "ewc_loss": 0.08269327878952026, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004485147655941546, + "grad_norm": 9.725600242614746, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8707187175750732, + "num_tokens": 877188569.0, + "step": 22991 + }, + { + "epoch": 2.9248187253530085, + "ewc_loss": 0.0832219049334526, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045380109804682434, + "grad_norm": 9.786706924438477, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8569362163543701, + "num_tokens": 877222745.0, + "step": 22992 + }, + { + "epoch": 2.924945935631599, + "ewc_loss": 0.08278843015432358, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004494663153309375, + "grad_norm": 9.716773986816406, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8485391736030579, + "num_tokens": 877262510.0, + "step": 22993 + }, + { + "epoch": 2.9250731459101895, + "ewc_loss": 0.08314242959022522, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045300627243705094, + "grad_norm": 9.77651596069336, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8577550053596497, + "num_tokens": 877300164.0, + "step": 22994 + }, + { + "epoch": 2.92520035618878, + "ewc_loss": 0.0829906016588211, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004514881002251059, + "grad_norm": 9.697220802307129, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8743149638175964, + "num_tokens": 877337187.0, + "step": 22995 + }, + { + "epoch": 2.9253275664673706, + "ewc_loss": 0.08309061825275421, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004524882242549211, + "grad_norm": 9.82441234588623, + "learning_rate": 1e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8462283611297607, + "num_tokens": 877375902.0, + "step": 22996 + }, + { + "epoch": 2.925454776745961, + "ewc_loss": 0.08283976465463638, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044997967779636383, + "grad_norm": 9.698150634765625, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8644614815711975, + "num_tokens": 877414406.0, + "step": 22997 + }, + { + "epoch": 2.9255819870245516, + "ewc_loss": 0.0833144411444664, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045472642523236573, + "grad_norm": 9.745575904846191, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8718688488006592, + "num_tokens": 877448977.0, + "step": 22998 + }, + { + "epoch": 2.925709197303142, + "ewc_loss": 0.08283993601799011, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004499813658185303, + "grad_norm": 9.647501945495605, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8725255131721497, + "num_tokens": 877486766.0, + "step": 22999 + }, + { + "epoch": 2.9258364075817327, + "ewc_loss": 0.08337732404470444, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045535527169704437, + "grad_norm": 9.792057991027832, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8790161609649658, + "num_tokens": 877519320.0, + "step": 23000 + }, + { + "epoch": 2.925963617860323, + "ewc_loss": 0.0829697698354721, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004512796876952052, + "grad_norm": 9.671804428100586, + "learning_rate": 1e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8404138088226318, + "num_tokens": 877557913.0, + "step": 23001 + }, + { + "epoch": 2.9260908281389137, + "ewc_loss": 0.08344622701406479, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045604430488310754, + "grad_norm": 9.798261642456055, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8636026978492737, + "num_tokens": 877591197.0, + "step": 23002 + }, + { + "epoch": 2.9262180384175043, + "ewc_loss": 0.08288701623678207, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045045220758765936, + "grad_norm": 9.674152374267578, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8809667229652405, + "num_tokens": 877625018.0, + "step": 23003 + }, + { + "epoch": 2.926345248696095, + "ewc_loss": 0.08348642289638519, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004564462578855455, + "grad_norm": 9.744484901428223, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8692169189453125, + "num_tokens": 877662433.0, + "step": 23004 + }, + { + "epoch": 2.9264724589746853, + "ewc_loss": 0.08289654552936554, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045054752263240516, + "grad_norm": 9.700772285461426, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.876893937587738, + "num_tokens": 877703576.0, + "step": 23005 + }, + { + "epoch": 2.9265996692532754, + "ewc_loss": 0.08341268450021744, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004557088832370937, + "grad_norm": 9.81554126739502, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8596950769424438, + "num_tokens": 877734025.0, + "step": 23006 + }, + { + "epoch": 2.9267268795318664, + "ewc_loss": 0.0827237218618393, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044881924986839294, + "grad_norm": 9.591224670410156, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8760174512863159, + "num_tokens": 877771942.0, + "step": 23007 + }, + { + "epoch": 2.9268540898104565, + "ewc_loss": 0.08366426825523376, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045822476386092603, + "grad_norm": 9.918174743652344, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8680436611175537, + "num_tokens": 877815662.0, + "step": 23008 + }, + { + "epoch": 2.9269813000890474, + "ewc_loss": 0.08250992000102997, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004466812242753804, + "grad_norm": 9.674295425415039, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8479642868041992, + "num_tokens": 877855398.0, + "step": 23009 + }, + { + "epoch": 2.9271085103676375, + "ewc_loss": 0.08362720906734467, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045785415568389, + "grad_norm": 9.794300079345703, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8589149713516235, + "num_tokens": 877898544.0, + "step": 23010 + }, + { + "epoch": 2.927235720646228, + "ewc_loss": 0.08266995847225189, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044828164391219616, + "grad_norm": 9.761590957641602, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8643159866333008, + "num_tokens": 877937914.0, + "step": 23011 + }, + { + "epoch": 2.9273629309248186, + "ewc_loss": 0.0833544060587883, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045512610813602805, + "grad_norm": 9.749490737915039, + "learning_rate": 1e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.8483135104179382, + "num_tokens": 877979266.0, + "step": 23012 + }, + { + "epoch": 2.927490141203409, + "ewc_loss": 0.08302357792854309, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004518177593126893, + "grad_norm": 9.73766040802002, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8689391613006592, + "num_tokens": 878019762.0, + "step": 23013 + }, + { + "epoch": 2.9276173514819996, + "ewc_loss": 0.0831545740365982, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004531277809292078, + "grad_norm": 9.77379035949707, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8588451147079468, + "num_tokens": 878062174.0, + "step": 23014 + }, + { + "epoch": 2.92774456176059, + "ewc_loss": 0.08325782418251038, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004541603266261518, + "grad_norm": 9.773674964904785, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8577964305877686, + "num_tokens": 878098197.0, + "step": 23015 + }, + { + "epoch": 2.9278717720391807, + "ewc_loss": 0.08309128880500793, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004524949472397566, + "grad_norm": 9.808422088623047, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8701222538948059, + "num_tokens": 878130478.0, + "step": 23016 + }, + { + "epoch": 2.9279989823177712, + "ewc_loss": 0.08312534540891647, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004528354911599308, + "grad_norm": 9.722944259643555, + "learning_rate": 1e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8483500480651855, + "num_tokens": 878175670.0, + "step": 23017 + }, + { + "epoch": 2.9281261925963618, + "ewc_loss": 0.08332201838493347, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004548022407107055, + "grad_norm": 9.845100402832031, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8524429798126221, + "num_tokens": 878216415.0, + "step": 23018 + }, + { + "epoch": 2.9282534028749523, + "ewc_loss": 0.08286310732364655, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045021314872428775, + "grad_norm": 9.769457817077637, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8639366030693054, + "num_tokens": 878249582.0, + "step": 23019 + }, + { + "epoch": 2.928380613153543, + "ewc_loss": 0.08329595625400543, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045454161590896547, + "grad_norm": 9.781364440917969, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8753038644790649, + "num_tokens": 878290859.0, + "step": 23020 + }, + { + "epoch": 2.9285078234321333, + "ewc_loss": 0.08306050300598145, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045218708692118526, + "grad_norm": 9.766067504882812, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8792985677719116, + "num_tokens": 878322463.0, + "step": 23021 + }, + { + "epoch": 2.928635033710724, + "ewc_loss": 0.0831681489944458, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045326355029828846, + "grad_norm": 9.727828979492188, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8749992847442627, + "num_tokens": 878365754.0, + "step": 23022 + }, + { + "epoch": 2.9287622439893144, + "ewc_loss": 0.0832921713590622, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004545037227217108, + "grad_norm": 9.757888793945312, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8779177069664001, + "num_tokens": 878410325.0, + "step": 23023 + }, + { + "epoch": 2.928889454267905, + "ewc_loss": 0.08319519460201263, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000453533953987062, + "grad_norm": 9.869623184204102, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8700443506240845, + "num_tokens": 878451604.0, + "step": 23024 + }, + { + "epoch": 2.9290166645464955, + "ewc_loss": 0.08303387463092804, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004519207577686757, + "grad_norm": 9.776799201965332, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8674144744873047, + "num_tokens": 878485174.0, + "step": 23025 + }, + { + "epoch": 2.929143874825086, + "ewc_loss": 0.08334897458553314, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045507182949222624, + "grad_norm": 9.85363483428955, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8572204113006592, + "num_tokens": 878525867.0, + "step": 23026 + }, + { + "epoch": 2.9292710851036765, + "ewc_loss": 0.08286519348621368, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045023392885923386, + "grad_norm": 9.715188026428223, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8542641401290894, + "num_tokens": 878561327.0, + "step": 23027 + }, + { + "epoch": 2.929398295382267, + "ewc_loss": 0.08342295140028, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004558115324471146, + "grad_norm": 9.868843078613281, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8710982799530029, + "num_tokens": 878603045.0, + "step": 23028 + }, + { + "epoch": 2.929525505660857, + "ewc_loss": 0.08269219100475311, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044850396807305515, + "grad_norm": 9.688607215881348, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8711304068565369, + "num_tokens": 878645051.0, + "step": 23029 + }, + { + "epoch": 2.929652715939448, + "ewc_loss": 0.0835425853729248, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004570078745018691, + "grad_norm": 9.837736129760742, + "learning_rate": 1e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8490580320358276, + "num_tokens": 878682003.0, + "step": 23030 + }, + { + "epoch": 2.929779926218038, + "ewc_loss": 0.08265109360218048, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044809302198700607, + "grad_norm": 9.66423225402832, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8692950010299683, + "num_tokens": 878720509.0, + "step": 23031 + }, + { + "epoch": 2.929907136496629, + "ewc_loss": 0.08365556597709656, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004581376852001995, + "grad_norm": 9.855212211608887, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8721702098846436, + "num_tokens": 878757950.0, + "step": 23032 + }, + { + "epoch": 2.9300343467752192, + "ewc_loss": 0.08253943920135498, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004469764535315335, + "grad_norm": 9.659836769104004, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8582598567008972, + "num_tokens": 878801477.0, + "step": 23033 + }, + { + "epoch": 2.93016155705381, + "ewc_loss": 0.08359198272228241, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004575018829200417, + "grad_norm": 9.819387435913086, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8569825887680054, + "num_tokens": 878838802.0, + "step": 23034 + }, + { + "epoch": 2.9302887673324003, + "ewc_loss": 0.08295749127864838, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004511569277383387, + "grad_norm": 9.71115779876709, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.859112560749054, + "num_tokens": 878875566.0, + "step": 23035 + }, + { + "epoch": 2.930415977610991, + "ewc_loss": 0.08355925977230072, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004571746685542166, + "grad_norm": 9.868995666503906, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8574813604354858, + "num_tokens": 878913463.0, + "step": 23036 + }, + { + "epoch": 2.9305431878895813, + "ewc_loss": 0.08284859359264374, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045006797881796956, + "grad_norm": 9.6622953414917, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8515430688858032, + "num_tokens": 878950769.0, + "step": 23037 + }, + { + "epoch": 2.930670398168172, + "ewc_loss": 0.0837392508983612, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045897450763732195, + "grad_norm": 9.86551284790039, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8746886253356934, + "num_tokens": 878989041.0, + "step": 23038 + }, + { + "epoch": 2.9307976084467624, + "ewc_loss": 0.08284743130207062, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004500563954934478, + "grad_norm": 9.647123336791992, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8733848929405212, + "num_tokens": 879020195.0, + "step": 23039 + }, + { + "epoch": 2.930924818725353, + "ewc_loss": 0.08406218141317368, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046220383956097066, + "grad_norm": 9.857762336730957, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8632388114929199, + "num_tokens": 879057818.0, + "step": 23040 + }, + { + "epoch": 2.9310520290039435, + "ewc_loss": 0.08280317485332489, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004496137553360313, + "grad_norm": 9.66058349609375, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8598415851593018, + "num_tokens": 879094747.0, + "step": 23041 + }, + { + "epoch": 2.931179239282534, + "ewc_loss": 0.08410374075174332, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004626194422598928, + "grad_norm": 9.911336898803711, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.857410728931427, + "num_tokens": 879131127.0, + "step": 23042 + }, + { + "epoch": 2.9313064495611245, + "ewc_loss": 0.08292776346206665, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004508596321102232, + "grad_norm": 9.663361549377441, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8644300103187561, + "num_tokens": 879168541.0, + "step": 23043 + }, + { + "epoch": 2.931433659839715, + "ewc_loss": 0.08406803756952286, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046226242557168007, + "grad_norm": 9.924593925476074, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8608748316764832, + "num_tokens": 879199409.0, + "step": 23044 + }, + { + "epoch": 2.9315608701183056, + "ewc_loss": 0.08269977569580078, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004485798126552254, + "grad_norm": 9.618227005004883, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8782406449317932, + "num_tokens": 879236447.0, + "step": 23045 + }, + { + "epoch": 2.931688080396896, + "ewc_loss": 0.08402073383331299, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004617894010152668, + "grad_norm": 9.875565528869629, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8652130365371704, + "num_tokens": 879275596.0, + "step": 23046 + }, + { + "epoch": 2.9318152906754866, + "ewc_loss": 0.08288480341434479, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004504301177803427, + "grad_norm": 9.647712707519531, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8643598556518555, + "num_tokens": 879315620.0, + "step": 23047 + }, + { + "epoch": 2.931942500954077, + "ewc_loss": 0.08407378196716309, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004623199056368321, + "grad_norm": 9.93989086151123, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8660117387771606, + "num_tokens": 879356220.0, + "step": 23048 + }, + { + "epoch": 2.9320697112326677, + "ewc_loss": 0.08288915455341339, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045047359890304506, + "grad_norm": 9.712469100952148, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.849531352519989, + "num_tokens": 879397544.0, + "step": 23049 + }, + { + "epoch": 2.932196921511258, + "ewc_loss": 0.08395819365978241, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004611640179064125, + "grad_norm": 10.058415412902832, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.875175952911377, + "num_tokens": 879438165.0, + "step": 23050 + }, + { + "epoch": 2.9323241317898487, + "ewc_loss": 0.08273573219776154, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044893933227285743, + "grad_norm": 9.924881935119629, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8681358098983765, + "num_tokens": 879473601.0, + "step": 23051 + }, + { + "epoch": 2.9324513420684393, + "ewc_loss": 0.08336597681045532, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045524179586209357, + "grad_norm": 10.237157821655273, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.858826756477356, + "num_tokens": 879510998.0, + "step": 23052 + }, + { + "epoch": 2.93257855234703, + "ewc_loss": 0.08213809132575989, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004429629188962281, + "grad_norm": 9.572097778320312, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8690944910049438, + "num_tokens": 879551469.0, + "step": 23053 + }, + { + "epoch": 2.93270576262562, + "ewc_loss": 0.08406540751457214, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046223605750128627, + "grad_norm": 10.17336654663086, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.85719233751297, + "num_tokens": 879585620.0, + "step": 23054 + }, + { + "epoch": 2.932832972904211, + "ewc_loss": 0.0816827267408371, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00043840930447913706, + "grad_norm": 9.68889331817627, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8747975826263428, + "num_tokens": 879621749.0, + "step": 23055 + }, + { + "epoch": 2.932960183182801, + "ewc_loss": 0.08427666872739792, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000464348733657971, + "grad_norm": 10.004186630249023, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8667289614677429, + "num_tokens": 879664529.0, + "step": 23056 + }, + { + "epoch": 2.933087393461392, + "ewc_loss": 0.0820787101984024, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044236911344341934, + "grad_norm": 9.663473129272461, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8701698780059814, + "num_tokens": 879696293.0, + "step": 23057 + }, + { + "epoch": 2.933214603739982, + "ewc_loss": 0.08376471698284149, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004592291952576488, + "grad_norm": 9.934505462646484, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.870147168636322, + "num_tokens": 879733562.0, + "step": 23058 + }, + { + "epoch": 2.933341814018573, + "ewc_loss": 0.08251309394836426, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004467129474505782, + "grad_norm": 9.632925987243652, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8620460033416748, + "num_tokens": 879771740.0, + "step": 23059 + }, + { + "epoch": 2.933469024297163, + "ewc_loss": 0.08544120192527771, + "ewc_loss_diag": 3.933906555175781e-05, + "ewc_loss_parallel": 0.00046134559670463204, + "grad_norm": 54.20041275024414, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8603954315185547, + "num_tokens": 879805578.0, + "step": 23060 + }, + { + "epoch": 2.9335962345757536, + "ewc_loss": 0.14621618390083313, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0010813024127855897, + "grad_norm": 16.347801208496094, + "learning_rate": 1e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8641995191574097, + "num_tokens": 879845175.0, + "step": 23061 + }, + { + "epoch": 2.933723444854344, + "ewc_loss": 0.08147146552801132, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00043385528260841966, + "grad_norm": 8.506322860717773, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8770954012870789, + "num_tokens": 879878975.0, + "step": 23062 + }, + { + "epoch": 2.9338506551329346, + "ewc_loss": 0.12650859355926514, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0008866680436767638, + "grad_norm": 15.276741027832031, + "learning_rate": 1e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8572084903717041, + "num_tokens": 879919373.0, + "step": 23063 + }, + { + "epoch": 2.933977865411525, + "ewc_loss": 0.13420800864696503, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000963662110734731, + "grad_norm": 15.14807415008545, + "learning_rate": 1e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.845362663269043, + "num_tokens": 879953821.0, + "step": 23064 + }, + { + "epoch": 2.9341050756901157, + "ewc_loss": 0.09636037051677704, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005851857713423669, + "grad_norm": 10.37230110168457, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.856449544429779, + "num_tokens": 879993505.0, + "step": 23065 + }, + { + "epoch": 2.934232285968706, + "ewc_loss": 0.1048600822687149, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0006701828679069877, + "grad_norm": 12.838350296020508, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8687396049499512, + "num_tokens": 880030124.0, + "step": 23066 + }, + { + "epoch": 2.9343594962472968, + "ewc_loss": 0.10982727259397507, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0007198547245934606, + "grad_norm": 12.60099983215332, + "learning_rate": 1e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8408072590827942, + "num_tokens": 880059480.0, + "step": 23067 + }, + { + "epoch": 2.9344867065258873, + "ewc_loss": 0.09365381300449371, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005581201985478401, + "grad_norm": 10.6556978225708, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8650221228599548, + "num_tokens": 880093402.0, + "step": 23068 + }, + { + "epoch": 2.934613916804478, + "ewc_loss": 0.09691157937049866, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005906978622078896, + "grad_norm": 11.699234962463379, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8614697456359863, + "num_tokens": 880127874.0, + "step": 23069 + }, + { + "epoch": 2.9347411270830683, + "ewc_loss": 0.09535600244998932, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000575142097659409, + "grad_norm": 10.931486129760742, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8740701079368591, + "num_tokens": 880165755.0, + "step": 23070 + }, + { + "epoch": 2.934868337361659, + "ewc_loss": 0.09136936068534851, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005352756706997752, + "grad_norm": 10.767091751098633, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8587886095046997, + "num_tokens": 880203931.0, + "step": 23071 + }, + { + "epoch": 2.9349955476402494, + "ewc_loss": 0.09099449217319489, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005315269227139652, + "grad_norm": 10.77924633026123, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8672427535057068, + "num_tokens": 880242844.0, + "step": 23072 + }, + { + "epoch": 2.93512275791884, + "ewc_loss": 0.08964648842811584, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005180469597689807, + "grad_norm": 10.470905303955078, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8488316535949707, + "num_tokens": 880280068.0, + "step": 23073 + }, + { + "epoch": 2.9352499681974304, + "ewc_loss": 0.08770120143890381, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004985940176993608, + "grad_norm": 10.389050483703613, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8719573020935059, + "num_tokens": 880314044.0, + "step": 23074 + }, + { + "epoch": 2.935377178476021, + "ewc_loss": 0.08797825872898102, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0005013645859435201, + "grad_norm": 10.39029312133789, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8627548217773438, + "num_tokens": 880355077.0, + "step": 23075 + }, + { + "epoch": 2.9355043887546115, + "ewc_loss": 0.08629429340362549, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004845249350182712, + "grad_norm": 10.154071807861328, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8794995546340942, + "num_tokens": 880388906.0, + "step": 23076 + }, + { + "epoch": 2.935631599033202, + "ewc_loss": 0.08644170314073563, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00048599904403090477, + "grad_norm": 10.211551666259766, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8752809762954712, + "num_tokens": 880426834.0, + "step": 23077 + }, + { + "epoch": 2.9357588093117926, + "ewc_loss": 0.08517942577600479, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047337630530819297, + "grad_norm": 10.104828834533691, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.863364577293396, + "num_tokens": 880463477.0, + "step": 23078 + }, + { + "epoch": 2.9358860195903826, + "ewc_loss": 0.0852748304605484, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047433029976673424, + "grad_norm": 10.082484245300293, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8483209609985352, + "num_tokens": 880500884.0, + "step": 23079 + }, + { + "epoch": 2.9360132298689736, + "ewc_loss": 0.08489004522562027, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004704824823420495, + "grad_norm": 10.014890670776367, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8598416447639465, + "num_tokens": 880539724.0, + "step": 23080 + }, + { + "epoch": 2.9361404401475637, + "ewc_loss": 0.084458127617836, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000466163270175457, + "grad_norm": 10.009099960327148, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8579156398773193, + "num_tokens": 880577820.0, + "step": 23081 + }, + { + "epoch": 2.9362676504261547, + "ewc_loss": 0.08440694212913513, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046565147931687534, + "grad_norm": 9.979641914367676, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8695157170295715, + "num_tokens": 880612055.0, + "step": 23082 + }, + { + "epoch": 2.9363948607047448, + "ewc_loss": 0.08417581021785736, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046334011130966246, + "grad_norm": 9.926973342895508, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8733141422271729, + "num_tokens": 880654907.0, + "step": 23083 + }, + { + "epoch": 2.9365220709833357, + "ewc_loss": 0.08387543261051178, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000460336304968223, + "grad_norm": 9.938888549804688, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8803049325942993, + "num_tokens": 880692418.0, + "step": 23084 + }, + { + "epoch": 2.936649281261926, + "ewc_loss": 0.08375896513462067, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004591716278810054, + "grad_norm": 9.89891242980957, + "learning_rate": 1e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8499346375465393, + "num_tokens": 880727742.0, + "step": 23085 + }, + { + "epoch": 2.9367764915405163, + "ewc_loss": 0.08372890949249268, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004588710726238787, + "grad_norm": 9.894171714782715, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8769499063491821, + "num_tokens": 880766673.0, + "step": 23086 + }, + { + "epoch": 2.936903701819107, + "ewc_loss": 0.08350566774606705, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045663872151635587, + "grad_norm": 9.831192970275879, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8559795618057251, + "num_tokens": 880810390.0, + "step": 23087 + }, + { + "epoch": 2.9370309120976974, + "ewc_loss": 0.08381743729114532, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045975641114637256, + "grad_norm": 9.904351234436035, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8608490824699402, + "num_tokens": 880846920.0, + "step": 23088 + }, + { + "epoch": 2.937158122376288, + "ewc_loss": 0.08339491486549377, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045553117524832487, + "grad_norm": 9.844043731689453, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8707951307296753, + "num_tokens": 880885867.0, + "step": 23089 + }, + { + "epoch": 2.9372853326548785, + "ewc_loss": 0.08357551693916321, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004573372134473175, + "grad_norm": 10.171073913574219, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.875019907951355, + "num_tokens": 880928408.0, + "step": 23090 + }, + { + "epoch": 2.937412542933469, + "ewc_loss": 0.08271477371454239, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004487297555897385, + "grad_norm": 10.317231178283691, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8643432855606079, + "num_tokens": 880960681.0, + "step": 23091 + }, + { + "epoch": 2.9375397532120595, + "ewc_loss": 0.08252976834774017, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004468797123990953, + "grad_norm": 9.698713302612305, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8546639680862427, + "num_tokens": 881000358.0, + "step": 23092 + }, + { + "epoch": 2.93766696349065, + "ewc_loss": 0.08409607410430908, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046254272456280887, + "grad_norm": 9.975242614746094, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.86638343334198, + "num_tokens": 881037765.0, + "step": 23093 + }, + { + "epoch": 2.9377941737692406, + "ewc_loss": 0.08228477090597153, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044442975195124745, + "grad_norm": 9.667855262756348, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8788369297981262, + "num_tokens": 881076091.0, + "step": 23094 + }, + { + "epoch": 2.937921384047831, + "ewc_loss": 0.0841725692152977, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004633077187463641, + "grad_norm": 9.996025085449219, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8641483783721924, + "num_tokens": 881112784.0, + "step": 23095 + }, + { + "epoch": 2.9380485943264216, + "ewc_loss": 0.08243604004383087, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004459424817468971, + "grad_norm": 9.73129940032959, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8721780776977539, + "num_tokens": 881149938.0, + "step": 23096 + }, + { + "epoch": 2.938175804605012, + "ewc_loss": 0.08400420844554901, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000461624120362103, + "grad_norm": 9.924931526184082, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8560415506362915, + "num_tokens": 881187541.0, + "step": 23097 + }, + { + "epoch": 2.9383030148836027, + "ewc_loss": 0.08301511406898499, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004517332126852125, + "grad_norm": 9.817434310913086, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8674126863479614, + "num_tokens": 881221920.0, + "step": 23098 + }, + { + "epoch": 2.938430225162193, + "ewc_loss": 0.08366652578115463, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045824729022569954, + "grad_norm": 9.904277801513672, + "learning_rate": 1e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8499203324317932, + "num_tokens": 881264804.0, + "step": 23099 + }, + { + "epoch": 2.9385574354407837, + "ewc_loss": 0.08307361602783203, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004523181414697319, + "grad_norm": 9.731295585632324, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.858664870262146, + "num_tokens": 881304482.0, + "step": 23100 + }, + { + "epoch": 2.9386846457193743, + "ewc_loss": 0.08368892967700958, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045847136061638594, + "grad_norm": 9.959816932678223, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8552858829498291, + "num_tokens": 881340822.0, + "step": 23101 + }, + { + "epoch": 2.938811855997965, + "ewc_loss": 0.08281406760215759, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004497227491810918, + "grad_norm": 9.739051818847656, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8632454872131348, + "num_tokens": 881374352.0, + "step": 23102 + }, + { + "epoch": 2.9389390662765553, + "ewc_loss": 0.08382990956306458, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045988112105987966, + "grad_norm": 9.893346786499023, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8586559295654297, + "num_tokens": 881413176.0, + "step": 23103 + }, + { + "epoch": 2.9390662765551454, + "ewc_loss": 0.08298847079277039, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004514667671173811, + "grad_norm": 9.740341186523438, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8609424829483032, + "num_tokens": 881454309.0, + "step": 23104 + }, + { + "epoch": 2.9391934868337364, + "ewc_loss": 0.08395202457904816, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046110228868201375, + "grad_norm": 9.899641036987305, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8653314113616943, + "num_tokens": 881490702.0, + "step": 23105 + }, + { + "epoch": 2.9393206971123265, + "ewc_loss": 0.08295691013336182, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004511511360760778, + "grad_norm": 9.763176918029785, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8711777329444885, + "num_tokens": 881530394.0, + "step": 23106 + }, + { + "epoch": 2.9394479073909174, + "ewc_loss": 0.08378437161445618, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004594257625285536, + "grad_norm": 9.892891883850098, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8790941834449768, + "num_tokens": 881567286.0, + "step": 23107 + }, + { + "epoch": 2.9395751176695075, + "ewc_loss": 0.08316968381404877, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045327882980927825, + "grad_norm": 9.767642974853516, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8606335520744324, + "num_tokens": 881607017.0, + "step": 23108 + }, + { + "epoch": 2.939702327948098, + "ewc_loss": 0.08373101055622101, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004588921437971294, + "grad_norm": 9.82872486114502, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8642703890800476, + "num_tokens": 881645793.0, + "step": 23109 + }, + { + "epoch": 2.9398295382266886, + "ewc_loss": 0.08321884274482727, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004537705099210143, + "grad_norm": 9.750716209411621, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.867426335811615, + "num_tokens": 881683596.0, + "step": 23110 + }, + { + "epoch": 2.939956748505279, + "ewc_loss": 0.08381064236164093, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045968848280608654, + "grad_norm": 9.9039888381958, + "learning_rate": 1e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8477007150650024, + "num_tokens": 881717607.0, + "step": 23111 + }, + { + "epoch": 2.9400839587838696, + "ewc_loss": 0.08314092457294464, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004529912257567048, + "grad_norm": 9.748802185058594, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8780007362365723, + "num_tokens": 881750884.0, + "step": 23112 + }, + { + "epoch": 2.94021116906246, + "ewc_loss": 0.08389085531234741, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004604906134773046, + "grad_norm": 9.891127586364746, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.86092209815979, + "num_tokens": 881789345.0, + "step": 23113 + }, + { + "epoch": 2.9403383793410507, + "ewc_loss": 0.08313243091106415, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045290638809092343, + "grad_norm": 9.721147537231445, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.858896017074585, + "num_tokens": 881828459.0, + "step": 23114 + }, + { + "epoch": 2.940465589619641, + "ewc_loss": 0.08396352827548981, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004612173652276397, + "grad_norm": 9.937335014343262, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8582316637039185, + "num_tokens": 881865294.0, + "step": 23115 + }, + { + "epoch": 2.9405927998982317, + "ewc_loss": 0.0829332023859024, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004509140853770077, + "grad_norm": 9.674317359924316, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8674209117889404, + "num_tokens": 881904936.0, + "step": 23116 + }, + { + "epoch": 2.9407200101768223, + "ewc_loss": 0.08421291410923004, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046371121425181627, + "grad_norm": 10.013565063476562, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8750590085983276, + "num_tokens": 881936300.0, + "step": 23117 + }, + { + "epoch": 2.940847220455413, + "ewc_loss": 0.08275379240512848, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004491199797485024, + "grad_norm": 9.636465072631836, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8757287263870239, + "num_tokens": 881974092.0, + "step": 23118 + }, + { + "epoch": 2.9409744307340033, + "ewc_loss": 0.08442334085702896, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004658154502976686, + "grad_norm": 10.017496109008789, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8655412793159485, + "num_tokens": 882019339.0, + "step": 23119 + }, + { + "epoch": 2.941101641012594, + "ewc_loss": 0.08263637870550156, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004479458148125559, + "grad_norm": 9.725129127502441, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8672807216644287, + "num_tokens": 882052837.0, + "step": 23120 + }, + { + "epoch": 2.9412288512911844, + "ewc_loss": 0.0844392254948616, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046597429900430143, + "grad_norm": 10.030928611755371, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8692623376846313, + "num_tokens": 882088449.0, + "step": 23121 + }, + { + "epoch": 2.941356061569775, + "ewc_loss": 0.0828152447938919, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004497344489209354, + "grad_norm": 9.74135971069336, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8568816781044006, + "num_tokens": 882124844.0, + "step": 23122 + }, + { + "epoch": 2.9414832718483654, + "ewc_loss": 0.08424942195415497, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046407629270106554, + "grad_norm": 9.989188194274902, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8690521717071533, + "num_tokens": 882162246.0, + "step": 23123 + }, + { + "epoch": 2.941610482126956, + "ewc_loss": 0.08276555687189102, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004492376174312085, + "grad_norm": 9.739459037780762, + "learning_rate": 1e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.854620099067688, + "num_tokens": 882202825.0, + "step": 23124 + }, + { + "epoch": 2.9417376924055465, + "ewc_loss": 0.08411726355552673, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046275468776002526, + "grad_norm": 9.9811429977417, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8511086702346802, + "num_tokens": 882237951.0, + "step": 23125 + }, + { + "epoch": 2.941864902684137, + "ewc_loss": 0.08282360434532166, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004498180642258376, + "grad_norm": 9.737222671508789, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.861911952495575, + "num_tokens": 882279147.0, + "step": 23126 + }, + { + "epoch": 2.941992112962727, + "ewc_loss": 0.08402283489704132, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004618103848770261, + "grad_norm": 9.972719192504883, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8861626982688904, + "num_tokens": 882322777.0, + "step": 23127 + }, + { + "epoch": 2.942119323241318, + "ewc_loss": 0.08283789455890656, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044996096403338015, + "grad_norm": 9.776384353637695, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8691557049751282, + "num_tokens": 882360681.0, + "step": 23128 + }, + { + "epoch": 2.942246533519908, + "ewc_loss": 0.08381088078022003, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000459690869320184, + "grad_norm": 9.900443077087402, + "learning_rate": 1e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8439502716064453, + "num_tokens": 882403823.0, + "step": 23129 + }, + { + "epoch": 2.942373743798499, + "ewc_loss": 0.08294348418712616, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045101690921001136, + "grad_norm": 9.782123565673828, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8731003403663635, + "num_tokens": 882443723.0, + "step": 23130 + }, + { + "epoch": 2.9425009540770892, + "ewc_loss": 0.08360236883163452, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004576057253871113, + "grad_norm": 9.941339492797852, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8495724201202393, + "num_tokens": 882480610.0, + "step": 23131 + }, + { + "epoch": 2.94262816435568, + "ewc_loss": 0.08291656523942947, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004507476696744561, + "grad_norm": 9.796011924743652, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8659586906433105, + "num_tokens": 882518789.0, + "step": 23132 + }, + { + "epoch": 2.9427553746342703, + "ewc_loss": 0.08350692689418793, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004566512943711132, + "grad_norm": 9.927386283874512, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8520712852478027, + "num_tokens": 882556598.0, + "step": 23133 + }, + { + "epoch": 2.942882584912861, + "ewc_loss": 0.08294801414012909, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004510622238740325, + "grad_norm": 9.738542556762695, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8590432405471802, + "num_tokens": 882593788.0, + "step": 23134 + }, + { + "epoch": 2.9430097951914513, + "ewc_loss": 0.08371323347091675, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045871431939303875, + "grad_norm": 9.893346786499023, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8706560134887695, + "num_tokens": 882630861.0, + "step": 23135 + }, + { + "epoch": 2.943137005470042, + "ewc_loss": 0.08299822360277176, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045156426494941115, + "grad_norm": 9.832863807678223, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8715014457702637, + "num_tokens": 882670950.0, + "step": 23136 + }, + { + "epoch": 2.9432642157486324, + "ewc_loss": 0.08349371701478958, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045651919208467007, + "grad_norm": 9.881244659423828, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8537954092025757, + "num_tokens": 882705680.0, + "step": 23137 + }, + { + "epoch": 2.943391426027223, + "ewc_loss": 0.08297869563102722, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004513690364547074, + "grad_norm": 9.763696670532227, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8742051124572754, + "num_tokens": 882737589.0, + "step": 23138 + }, + { + "epoch": 2.9435186363058135, + "ewc_loss": 0.08363401889801025, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004579222295433283, + "grad_norm": 9.907977104187012, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8488602042198181, + "num_tokens": 882774502.0, + "step": 23139 + }, + { + "epoch": 2.943645846584404, + "ewc_loss": 0.08311232179403305, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045270525151863694, + "grad_norm": 9.797964096069336, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8544517159461975, + "num_tokens": 882815184.0, + "step": 23140 + }, + { + "epoch": 2.9437730568629945, + "ewc_loss": 0.08353020250797272, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004568840085994452, + "grad_norm": 9.838050842285156, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8755810260772705, + "num_tokens": 882856793.0, + "step": 23141 + }, + { + "epoch": 2.943900267141585, + "ewc_loss": 0.0831819400191307, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000453401415143162, + "grad_norm": 9.785867691040039, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8748310804367065, + "num_tokens": 882896128.0, + "step": 23142 + }, + { + "epoch": 2.9440274774201756, + "ewc_loss": 0.08345672488212585, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004561492532957345, + "grad_norm": 9.941187858581543, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.871658205986023, + "num_tokens": 882931253.0, + "step": 23143 + }, + { + "epoch": 2.944154687698766, + "ewc_loss": 0.08301223814487457, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004517043998930603, + "grad_norm": 9.693864822387695, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8812837600708008, + "num_tokens": 882975072.0, + "step": 23144 + }, + { + "epoch": 2.9442818979773566, + "ewc_loss": 0.08392852544784546, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046086724614724517, + "grad_norm": 9.956838607788086, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8768681883811951, + "num_tokens": 883013576.0, + "step": 23145 + }, + { + "epoch": 2.944409108255947, + "ewc_loss": 0.08276496827602386, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004492317675612867, + "grad_norm": 9.670027732849121, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8662601113319397, + "num_tokens": 883046052.0, + "step": 23146 + }, + { + "epoch": 2.9445363185345377, + "ewc_loss": 0.0842289924621582, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046387192560359836, + "grad_norm": 10.028782844543457, + "learning_rate": 1e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.8443629741668701, + "num_tokens": 883080398.0, + "step": 23147 + }, + { + "epoch": 2.944663528813128, + "ewc_loss": 0.08254434168338776, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044702543527819216, + "grad_norm": 9.69070816040039, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8559619188308716, + "num_tokens": 883120321.0, + "step": 23148 + }, + { + "epoch": 2.9447907390917187, + "ewc_loss": 0.08426201343536377, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004642021667677909, + "grad_norm": 9.894394874572754, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8711156249046326, + "num_tokens": 883161285.0, + "step": 23149 + }, + { + "epoch": 2.9449179493703093, + "ewc_loss": 0.08283115923404694, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004498936759773642, + "grad_norm": 9.665224075317383, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.863921046257019, + "num_tokens": 883199857.0, + "step": 23150 + }, + { + "epoch": 2.9450451596489, + "ewc_loss": 0.08426359295845032, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000464217911940068, + "grad_norm": 9.991792678833008, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8717579245567322, + "num_tokens": 883238325.0, + "step": 23151 + }, + { + "epoch": 2.94517236992749, + "ewc_loss": 0.08284260332584381, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045000805403105915, + "grad_norm": 9.673069953918457, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8553861975669861, + "num_tokens": 883281814.0, + "step": 23152 + }, + { + "epoch": 2.945299580206081, + "ewc_loss": 0.08415554463863373, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046313743223436177, + "grad_norm": 9.906368255615234, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8692973852157593, + "num_tokens": 883317385.0, + "step": 23153 + }, + { + "epoch": 2.945426790484671, + "ewc_loss": 0.08320660889148712, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004536481574177742, + "grad_norm": 9.750580787658691, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8726058602333069, + "num_tokens": 883353498.0, + "step": 23154 + }, + { + "epoch": 2.945554000763262, + "ewc_loss": 0.08404246717691422, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046200669021345675, + "grad_norm": 9.909491539001465, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8717211484909058, + "num_tokens": 883391709.0, + "step": 23155 + }, + { + "epoch": 2.945681211041852, + "ewc_loss": 0.08314816653728485, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004530636942945421, + "grad_norm": 9.814040184020996, + "learning_rate": 1e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8495437502861023, + "num_tokens": 883426329.0, + "step": 23156 + }, + { + "epoch": 2.945808421320443, + "ewc_loss": 0.08380809426307678, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045966298785060644, + "grad_norm": 9.874309539794922, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8626673817634583, + "num_tokens": 883465797.0, + "step": 23157 + }, + { + "epoch": 2.945935631599033, + "ewc_loss": 0.08322973549365997, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004538793582469225, + "grad_norm": 9.779090881347656, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8660874962806702, + "num_tokens": 883501632.0, + "step": 23158 + }, + { + "epoch": 2.9460628418776236, + "ewc_loss": 0.0836259126663208, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004578411462716758, + "grad_norm": 9.860811233520508, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8747819066047668, + "num_tokens": 883537622.0, + "step": 23159 + }, + { + "epoch": 2.946190052156214, + "ewc_loss": 0.08340960741043091, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045567806228064, + "grad_norm": 9.869950294494629, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8660277128219604, + "num_tokens": 883576458.0, + "step": 23160 + }, + { + "epoch": 2.9463172624348046, + "ewc_loss": 0.08314353227615356, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045301730278879404, + "grad_norm": 9.804685592651367, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8554502725601196, + "num_tokens": 883620929.0, + "step": 23161 + }, + { + "epoch": 2.946444472713395, + "ewc_loss": 0.0835222601890564, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045680461334995925, + "grad_norm": 9.893625259399414, + "learning_rate": 1e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8409543633460999, + "num_tokens": 883652589.0, + "step": 23162 + }, + { + "epoch": 2.9465716829919857, + "ewc_loss": 0.08317900449037552, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004533720784820616, + "grad_norm": 9.816006660461426, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8654792904853821, + "num_tokens": 883691844.0, + "step": 23163 + }, + { + "epoch": 2.946698893270576, + "ewc_loss": 0.08347474783658981, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045632952242158353, + "grad_norm": 9.912589073181152, + "learning_rate": 1e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.8449466824531555, + "num_tokens": 883732838.0, + "step": 23164 + }, + { + "epoch": 2.9468261035491667, + "ewc_loss": 0.08313256502151489, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045290772686712444, + "grad_norm": 9.767975807189941, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8632086515426636, + "num_tokens": 883771415.0, + "step": 23165 + }, + { + "epoch": 2.9469533138277573, + "ewc_loss": 0.08367224782705307, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004583045083563775, + "grad_norm": 9.912657737731934, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8500484228134155, + "num_tokens": 883808807.0, + "step": 23166 + }, + { + "epoch": 2.947080524106348, + "ewc_loss": 0.08297626674175262, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045134464744478464, + "grad_norm": 9.738301277160645, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8795783519744873, + "num_tokens": 883844300.0, + "step": 23167 + }, + { + "epoch": 2.9472077343849383, + "ewc_loss": 0.08384332060813904, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046001520240679383, + "grad_norm": 9.97120475769043, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8800439834594727, + "num_tokens": 883879337.0, + "step": 23168 + }, + { + "epoch": 2.947334944663529, + "ewc_loss": 0.08268357813358307, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004484178207349032, + "grad_norm": 9.702964782714844, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8545734882354736, + "num_tokens": 883925810.0, + "step": 23169 + }, + { + "epoch": 2.9474621549421194, + "ewc_loss": 0.08395159244537354, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046109798131510615, + "grad_norm": 9.970739364624023, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8735226392745972, + "num_tokens": 883964753.0, + "step": 23170 + }, + { + "epoch": 2.94758936522071, + "ewc_loss": 0.08258756995201111, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044745768536813557, + "grad_norm": 9.720702171325684, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8565367460250854, + "num_tokens": 884005737.0, + "step": 23171 + }, + { + "epoch": 2.9477165754993004, + "ewc_loss": 0.08417533338069916, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046333539648912847, + "grad_norm": 10.000768661499023, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8657021522521973, + "num_tokens": 884043414.0, + "step": 23172 + }, + { + "epoch": 2.947843785777891, + "ewc_loss": 0.08274589478969574, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004490409337449819, + "grad_norm": 9.780109405517578, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8703720569610596, + "num_tokens": 884077399.0, + "step": 23173 + }, + { + "epoch": 2.9479709960564815, + "ewc_loss": 0.08392427861690521, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046082481276243925, + "grad_norm": 9.945982933044434, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8631860017776489, + "num_tokens": 884116539.0, + "step": 23174 + }, + { + "epoch": 2.948098206335072, + "ewc_loss": 0.08276523649692535, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004492343869060278, + "grad_norm": 9.828277587890625, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8705456256866455, + "num_tokens": 884149756.0, + "step": 23175 + }, + { + "epoch": 2.9482254166136626, + "ewc_loss": 0.08363228291273117, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004579048545565456, + "grad_norm": 9.994802474975586, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8841316103935242, + "num_tokens": 884181960.0, + "step": 23176 + }, + { + "epoch": 2.9483526268922526, + "ewc_loss": 0.08273144066333771, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044889646233059466, + "grad_norm": 9.755528450012207, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8598552942276001, + "num_tokens": 884223562.0, + "step": 23177 + }, + { + "epoch": 2.9484798371708436, + "ewc_loss": 0.08377018570899963, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045928393956273794, + "grad_norm": 10.013846397399902, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8693970441818237, + "num_tokens": 884264541.0, + "step": 23178 + }, + { + "epoch": 2.9486070474494337, + "ewc_loss": 0.08255548030138016, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004471368156373501, + "grad_norm": 9.77817440032959, + "learning_rate": 1e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8507076501846313, + "num_tokens": 884301171.0, + "step": 23179 + }, + { + "epoch": 2.9487342577280247, + "ewc_loss": 0.08359271287918091, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045750915887765586, + "grad_norm": 9.988622665405273, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8625576496124268, + "num_tokens": 884332481.0, + "step": 23180 + }, + { + "epoch": 2.9488614680066147, + "ewc_loss": 0.08278307318687439, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00044697130215354264, + "grad_norm": 9.763240814208984, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.885838508605957, + "num_tokens": 884371616.0, + "step": 23181 + }, + { + "epoch": 2.9489886782852053, + "ewc_loss": 0.08396738767623901, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004588145238813013, + "grad_norm": 9.941277503967285, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8539655208587646, + "num_tokens": 884411245.0, + "step": 23182 + }, + { + "epoch": 2.949115888563796, + "ewc_loss": 0.0826793760061264, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004483757948037237, + "grad_norm": 9.751852035522461, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8618472814559937, + "num_tokens": 884445712.0, + "step": 23183 + }, + { + "epoch": 2.9492430988423863, + "ewc_loss": 0.08370999246835709, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045868195593357086, + "grad_norm": 9.967694282531738, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8696203231811523, + "num_tokens": 884484935.0, + "step": 23184 + }, + { + "epoch": 2.949370309120977, + "ewc_loss": 0.08248589187860489, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004464409430511296, + "grad_norm": 9.747135162353516, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8650597333908081, + "num_tokens": 884518714.0, + "step": 23185 + }, + { + "epoch": 2.9494975193995674, + "ewc_loss": 0.08371308445930481, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004587128642015159, + "grad_norm": 9.926694869995117, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8579456806182861, + "num_tokens": 884560430.0, + "step": 23186 + }, + { + "epoch": 2.949624729678158, + "ewc_loss": 0.08312321454286575, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004503727541305125, + "grad_norm": 9.760965347290039, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.856147825717926, + "num_tokens": 884595937.0, + "step": 23187 + }, + { + "epoch": 2.9497519399567484, + "ewc_loss": 0.08375584334135056, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004566990537568927, + "grad_norm": 9.934803009033203, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8595308065414429, + "num_tokens": 884633201.0, + "step": 23188 + }, + { + "epoch": 2.949879150235339, + "ewc_loss": 0.0828462690114975, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004500446957536042, + "grad_norm": 9.780123710632324, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.86739182472229, + "num_tokens": 884672602.0, + "step": 23189 + }, + { + "epoch": 2.9500063605139295, + "ewc_loss": 0.0835128054022789, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004567100841086358, + "grad_norm": 9.900872230529785, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8664549589157104, + "num_tokens": 884708378.0, + "step": 23190 + }, + { + "epoch": 2.95013357079252, + "ewc_loss": 0.08271467685699463, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004487287951633334, + "grad_norm": 9.706578254699707, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8610034584999084, + "num_tokens": 884747527.0, + "step": 23191 + }, + { + "epoch": 2.9502607810711106, + "ewc_loss": 0.08367234468460083, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045830552699044347, + "grad_norm": 9.866740226745605, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8843806982040405, + "num_tokens": 884781738.0, + "step": 23192 + }, + { + "epoch": 2.950387991349701, + "ewc_loss": 0.08301430195569992, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004517250636126846, + "grad_norm": 9.833874702453613, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8512402176856995, + "num_tokens": 884818884.0, + "step": 23193 + }, + { + "epoch": 2.9505152016282916, + "ewc_loss": 0.08334121108055115, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004549940931610763, + "grad_norm": 9.8599271774292, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8740543723106384, + "num_tokens": 884851751.0, + "step": 23194 + }, + { + "epoch": 2.950642411906882, + "ewc_loss": 0.08305545151233673, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004521365335676819, + "grad_norm": 9.784016609191895, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8680946230888367, + "num_tokens": 884892935.0, + "step": 23195 + }, + { + "epoch": 2.9507696221854727, + "ewc_loss": 0.08339906483888626, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004555726773105562, + "grad_norm": 9.837876319885254, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8704112768173218, + "num_tokens": 884935820.0, + "step": 23196 + }, + { + "epoch": 2.950896832464063, + "ewc_loss": 0.0831051915884018, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045263394713401794, + "grad_norm": 9.735820770263672, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8625212907791138, + "num_tokens": 884976042.0, + "step": 23197 + }, + { + "epoch": 2.9510240427426537, + "ewc_loss": 0.08337637782096863, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045534581295214593, + "grad_norm": 9.852184295654297, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8835809230804443, + "num_tokens": 885013511.0, + "step": 23198 + }, + { + "epoch": 2.9511512530212443, + "ewc_loss": 0.08293764293193817, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045095841051079333, + "grad_norm": 9.831308364868164, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8526047468185425, + "num_tokens": 885047291.0, + "step": 23199 + }, + { + "epoch": 2.951278463299835, + "ewc_loss": 0.08314866572618484, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004530686710495502, + "grad_norm": 9.8816556930542, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8605812191963196, + "num_tokens": 885083116.0, + "step": 23200 + }, + { + "epoch": 2.9514056735784253, + "ewc_loss": 0.08293527364730835, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045093471999280155, + "grad_norm": 9.763640403747559, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8616563677787781, + "num_tokens": 885123282.0, + "step": 23201 + }, + { + "epoch": 2.9515328838570154, + "ewc_loss": 0.08328105509281158, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045439257519319654, + "grad_norm": 9.851616859436035, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8690465688705444, + "num_tokens": 885166277.0, + "step": 23202 + }, + { + "epoch": 2.9516600941356064, + "ewc_loss": 0.08281093835830688, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004496914625633508, + "grad_norm": 9.724072456359863, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8766169548034668, + "num_tokens": 885206414.0, + "step": 23203 + }, + { + "epoch": 2.9517873044141965, + "ewc_loss": 0.08340372145175934, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004556192725431174, + "grad_norm": 9.893385887145996, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8729292750358582, + "num_tokens": 885240977.0, + "step": 23204 + }, + { + "epoch": 2.9519145146927874, + "ewc_loss": 0.08264902979135513, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044807232916355133, + "grad_norm": 9.730036735534668, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8815188407897949, + "num_tokens": 885283190.0, + "step": 23205 + }, + { + "epoch": 2.9520417249713775, + "ewc_loss": 0.0835629254579544, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004572112811729312, + "grad_norm": 9.869037628173828, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8790386915206909, + "num_tokens": 885321340.0, + "step": 23206 + }, + { + "epoch": 2.952168935249968, + "ewc_loss": 0.08270072937011719, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004485893005039543, + "grad_norm": 9.821599006652832, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8735513687133789, + "num_tokens": 885357524.0, + "step": 23207 + }, + { + "epoch": 2.9522961455285586, + "ewc_loss": 0.08331508934497833, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004547329735942185, + "grad_norm": 9.87244701385498, + "learning_rate": 1e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8405338525772095, + "num_tokens": 885389696.0, + "step": 23208 + }, + { + "epoch": 2.952423355807149, + "ewc_loss": 0.0829099714756012, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004506816912908107, + "grad_norm": 9.737190246582031, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8803048133850098, + "num_tokens": 885429574.0, + "step": 23209 + }, + { + "epoch": 2.9525505660857396, + "ewc_loss": 0.08361546695232391, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045773666352033615, + "grad_norm": 9.940591812133789, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8678708672523499, + "num_tokens": 885459961.0, + "step": 23210 + }, + { + "epoch": 2.95267777636433, + "ewc_loss": 0.0827200710773468, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004487827536650002, + "grad_norm": 9.702940940856934, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8583475351333618, + "num_tokens": 885506725.0, + "step": 23211 + }, + { + "epoch": 2.9528049866429207, + "ewc_loss": 0.08365018665790558, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045808390132151544, + "grad_norm": 9.843944549560547, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8690809011459351, + "num_tokens": 885549409.0, + "step": 23212 + }, + { + "epoch": 2.952932196921511, + "ewc_loss": 0.08283785730600357, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044996061478741467, + "grad_norm": 9.72061824798584, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.872013509273529, + "num_tokens": 885589272.0, + "step": 23213 + }, + { + "epoch": 2.9530594072001017, + "ewc_loss": 0.0837239921092987, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045882194535806775, + "grad_norm": 9.956928253173828, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8484866619110107, + "num_tokens": 885625951.0, + "step": 23214 + }, + { + "epoch": 2.9531866174786923, + "ewc_loss": 0.08274123072624207, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004489943094085902, + "grad_norm": 9.685218811035156, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.868423342704773, + "num_tokens": 885665844.0, + "step": 23215 + }, + { + "epoch": 2.953313827757283, + "ewc_loss": 0.08422844111919403, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004638663958758116, + "grad_norm": 9.982515335083008, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8634446859359741, + "num_tokens": 885705616.0, + "step": 23216 + }, + { + "epoch": 2.9534410380358733, + "ewc_loss": 0.08259713649749756, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044755340786650777, + "grad_norm": 9.652560234069824, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8767616748809814, + "num_tokens": 885752362.0, + "step": 23217 + }, + { + "epoch": 2.953568248314464, + "ewc_loss": 0.08435061573982239, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004650882037822157, + "grad_norm": 9.974388122558594, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.860305666923523, + "num_tokens": 885792485.0, + "step": 23218 + }, + { + "epoch": 2.9536954585930544, + "ewc_loss": 0.08284080028533936, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044999003876000643, + "grad_norm": 9.691187858581543, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.866727888584137, + "num_tokens": 885834809.0, + "step": 23219 + }, + { + "epoch": 2.953822668871645, + "ewc_loss": 0.08430320024490356, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004646140441764146, + "grad_norm": 10.034927368164062, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8700907230377197, + "num_tokens": 885870555.0, + "step": 23220 + }, + { + "epoch": 2.9539498791502354, + "ewc_loss": 0.08284635841846466, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004500456270761788, + "grad_norm": 9.654316902160645, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.867770254611969, + "num_tokens": 885911911.0, + "step": 23221 + }, + { + "epoch": 2.954077089428826, + "ewc_loss": 0.0845697671175003, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004672796931117773, + "grad_norm": 10.028390884399414, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8778684139251709, + "num_tokens": 885941094.0, + "step": 23222 + }, + { + "epoch": 2.9542042997074165, + "ewc_loss": 0.08289346098899841, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045051664346829057, + "grad_norm": 9.714944839477539, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8643426895141602, + "num_tokens": 885978094.0, + "step": 23223 + }, + { + "epoch": 2.954331509986007, + "ewc_loss": 0.08454756438732147, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046705768909305334, + "grad_norm": 10.036389350891113, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8822259902954102, + "num_tokens": 886014859.0, + "step": 23224 + }, + { + "epoch": 2.954458720264597, + "ewc_loss": 0.08298151940107346, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004513972089625895, + "grad_norm": 9.680145263671875, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8727462887763977, + "num_tokens": 886048317.0, + "step": 23225 + }, + { + "epoch": 2.954585930543188, + "ewc_loss": 0.08448018878698349, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004663839063141495, + "grad_norm": 9.969785690307617, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8669919967651367, + "num_tokens": 886081797.0, + "step": 23226 + }, + { + "epoch": 2.954713140821778, + "ewc_loss": 0.08288484811782837, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045043055433779955, + "grad_norm": 9.637134552001953, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8515639305114746, + "num_tokens": 886124111.0, + "step": 23227 + }, + { + "epoch": 2.954840351100369, + "ewc_loss": 0.08459006994962692, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004674827214330435, + "grad_norm": 9.922240257263184, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8765872716903687, + "num_tokens": 886162823.0, + "step": 23228 + }, + { + "epoch": 2.954967561378959, + "ewc_loss": 0.08323109149932861, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045389292063191533, + "grad_norm": 9.7705078125, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8667710423469543, + "num_tokens": 886204706.0, + "step": 23229 + }, + { + "epoch": 2.95509477165755, + "ewc_loss": 0.084255650639534, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004641385457944125, + "grad_norm": 9.927746772766113, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8741681575775146, + "num_tokens": 886241461.0, + "step": 23230 + }, + { + "epoch": 2.9552219819361403, + "ewc_loss": 0.0834817886352539, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045639986637979746, + "grad_norm": 9.797270774841309, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.861146092414856, + "num_tokens": 886279276.0, + "step": 23231 + }, + { + "epoch": 2.955349192214731, + "ewc_loss": 0.08393876254558563, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046096969163045287, + "grad_norm": 9.829256057739258, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.866851270198822, + "num_tokens": 886317703.0, + "step": 23232 + }, + { + "epoch": 2.9554764024933213, + "ewc_loss": 0.08379805088043213, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045956249232403934, + "grad_norm": 9.899118423461914, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.871732234954834, + "num_tokens": 886355235.0, + "step": 23233 + }, + { + "epoch": 2.955603612771912, + "ewc_loss": 0.08345507085323334, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000456132780527696, + "grad_norm": 9.77011775970459, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8900490999221802, + "num_tokens": 886387354.0, + "step": 23234 + }, + { + "epoch": 2.9557308230505024, + "ewc_loss": 0.08394456654787064, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046102769556455314, + "grad_norm": 9.885042190551758, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8642303943634033, + "num_tokens": 886419742.0, + "step": 23235 + }, + { + "epoch": 2.955858033329093, + "ewc_loss": 0.08330088108778, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045459085959009826, + "grad_norm": 9.744755744934082, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8668738603591919, + "num_tokens": 886455218.0, + "step": 23236 + }, + { + "epoch": 2.9559852436076834, + "ewc_loss": 0.08407767117023468, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046235875925049186, + "grad_norm": 9.860916137695312, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8661482334136963, + "num_tokens": 886494454.0, + "step": 23237 + }, + { + "epoch": 2.956112453886274, + "ewc_loss": 0.08341746032238007, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004557566426228732, + "grad_norm": 9.740702629089355, + "learning_rate": 1e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8423903584480286, + "num_tokens": 886532854.0, + "step": 23238 + }, + { + "epoch": 2.9562396641648645, + "ewc_loss": 0.0839962586760521, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004615446086972952, + "grad_norm": 9.883021354675293, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8611103892326355, + "num_tokens": 886564288.0, + "step": 23239 + }, + { + "epoch": 2.956366874443455, + "ewc_loss": 0.08323405683040619, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004539225483313203, + "grad_norm": 9.761960983276367, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8685440421104431, + "num_tokens": 886597155.0, + "step": 23240 + }, + { + "epoch": 2.9564940847220456, + "ewc_loss": 0.0840056836605072, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004616388469003141, + "grad_norm": 9.946562767028809, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8695635795593262, + "num_tokens": 886632015.0, + "step": 23241 + }, + { + "epoch": 2.956621295000636, + "ewc_loss": 0.08301068097352982, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045168885844759643, + "grad_norm": 9.693758964538574, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8817839026451111, + "num_tokens": 886665736.0, + "step": 23242 + }, + { + "epoch": 2.9567485052792266, + "ewc_loss": 0.08425977826118469, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046417981502600014, + "grad_norm": 10.020886421203613, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.864936351776123, + "num_tokens": 886702652.0, + "step": 23243 + }, + { + "epoch": 2.956875715557817, + "ewc_loss": 0.08270345628261566, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004486165416892618, + "grad_norm": 9.606359481811523, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.862707257270813, + "num_tokens": 886743573.0, + "step": 23244 + }, + { + "epoch": 2.9570029258364077, + "ewc_loss": 0.08459139615297318, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046749599277973175, + "grad_norm": 10.042214393615723, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8701494932174683, + "num_tokens": 886777932.0, + "step": 23245 + }, + { + "epoch": 2.957130136114998, + "ewc_loss": 0.08260814845561981, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044766347855329514, + "grad_norm": 9.722156524658203, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8550555109977722, + "num_tokens": 886813211.0, + "step": 23246 + }, + { + "epoch": 2.9572573463935887, + "ewc_loss": 0.08448465913534164, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046642860979773104, + "grad_norm": 10.021862030029297, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8699620962142944, + "num_tokens": 886849416.0, + "step": 23247 + }, + { + "epoch": 2.9573845566721793, + "ewc_loss": 0.0826302170753479, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000447884202003479, + "grad_norm": 9.62817096710205, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.850556492805481, + "num_tokens": 886887983.0, + "step": 23248 + }, + { + "epoch": 2.95751176695077, + "ewc_loss": 0.08453473448753357, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004669293703045696, + "grad_norm": 10.14255428314209, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.849149227142334, + "num_tokens": 886930867.0, + "step": 23249 + }, + { + "epoch": 2.95763897722936, + "ewc_loss": 0.08238779008388519, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000445459911134094, + "grad_norm": 9.558836936950684, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8653048276901245, + "num_tokens": 886971862.0, + "step": 23250 + }, + { + "epoch": 2.957766187507951, + "ewc_loss": 0.08512260764837265, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047280811122618616, + "grad_norm": 10.299607276916504, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8713237047195435, + "num_tokens": 887005238.0, + "step": 23251 + }, + { + "epoch": 2.957893397786541, + "ewc_loss": 0.08201020956039429, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004416840965859592, + "grad_norm": 9.576345443725586, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8581406474113464, + "num_tokens": 887035588.0, + "step": 23252 + }, + { + "epoch": 2.958020608065132, + "ewc_loss": 0.08537831902503967, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047536520287394524, + "grad_norm": 10.215930938720703, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8667435050010681, + "num_tokens": 887079962.0, + "step": 23253 + }, + { + "epoch": 2.958147818343722, + "ewc_loss": 0.08216904103755951, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000443272409029305, + "grad_norm": 9.578714370727539, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8853981494903564, + "num_tokens": 887114572.0, + "step": 23254 + }, + { + "epoch": 2.958275028622313, + "ewc_loss": 0.08533483743667603, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004749304207507521, + "grad_norm": 10.221510887145996, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8699997067451477, + "num_tokens": 887149396.0, + "step": 23255 + }, + { + "epoch": 2.958402238900903, + "ewc_loss": 0.08242622017860413, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044584422721527517, + "grad_norm": 9.573163032531738, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8603441715240479, + "num_tokens": 887191916.0, + "step": 23256 + }, + { + "epoch": 2.9585294491794936, + "ewc_loss": 0.0854063332080841, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047564541455358267, + "grad_norm": 10.16767692565918, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8748818635940552, + "num_tokens": 887228263.0, + "step": 23257 + }, + { + "epoch": 2.958656659458084, + "ewc_loss": 0.0826760083436966, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044834212167188525, + "grad_norm": 9.76104736328125, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8820477724075317, + "num_tokens": 887258688.0, + "step": 23258 + }, + { + "epoch": 2.9587838697366746, + "ewc_loss": 0.08455224335193634, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004671044589485973, + "grad_norm": 10.024574279785156, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8605333566665649, + "num_tokens": 887301565.0, + "step": 23259 + }, + { + "epoch": 2.958911080015265, + "ewc_loss": 0.08305738121271133, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004521558294072747, + "grad_norm": 9.84364128112793, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8626695871353149, + "num_tokens": 887337511.0, + "step": 23260 + }, + { + "epoch": 2.9590382902938557, + "ewc_loss": 0.08370484411716461, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045863044215366244, + "grad_norm": 9.883346557617188, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8683733344078064, + "num_tokens": 887370694.0, + "step": 23261 + }, + { + "epoch": 2.959165500572446, + "ewc_loss": 0.08344031870365143, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045598516589961946, + "grad_norm": 9.853811264038086, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8631228804588318, + "num_tokens": 887412131.0, + "step": 23262 + }, + { + "epoch": 2.9592927108510367, + "ewc_loss": 0.08308786898851395, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045246072113513947, + "grad_norm": 9.809884071350098, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8615861535072327, + "num_tokens": 887447179.0, + "step": 23263 + }, + { + "epoch": 2.9594199211296273, + "ewc_loss": 0.08366093039512634, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004581912944559008, + "grad_norm": 9.88792610168457, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8600426912307739, + "num_tokens": 887486079.0, + "step": 23264 + }, + { + "epoch": 2.959547131408218, + "ewc_loss": 0.08301367610692978, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045171877718530595, + "grad_norm": 9.849654197692871, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.875468373298645, + "num_tokens": 887519092.0, + "step": 23265 + }, + { + "epoch": 2.9596743416868083, + "ewc_loss": 0.08324071019887924, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004539891378954053, + "grad_norm": 9.829361915588379, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8679156303405762, + "num_tokens": 887552008.0, + "step": 23266 + }, + { + "epoch": 2.959801551965399, + "ewc_loss": 0.0831550806760788, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004531328449957073, + "grad_norm": 9.831197738647461, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8610928654670715, + "num_tokens": 887585827.0, + "step": 23267 + }, + { + "epoch": 2.9599287622439894, + "ewc_loss": 0.0830787867307663, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004523699462879449, + "grad_norm": 9.784346580505371, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8642964959144592, + "num_tokens": 887628041.0, + "step": 23268 + }, + { + "epoch": 2.96005597252258, + "ewc_loss": 0.08346156775951385, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000456197711173445, + "grad_norm": 9.805461883544922, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8702062368392944, + "num_tokens": 887667210.0, + "step": 23269 + }, + { + "epoch": 2.9601831828011704, + "ewc_loss": 0.08310765027999878, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004526585398707539, + "grad_norm": 9.883241653442383, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8713470697402954, + "num_tokens": 887705429.0, + "step": 23270 + }, + { + "epoch": 2.960310393079761, + "ewc_loss": 0.08312730491161346, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045285504893399775, + "grad_norm": 9.753622055053711, + "learning_rate": 1e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8468633890151978, + "num_tokens": 887753188.0, + "step": 23271 + }, + { + "epoch": 2.9604376033583515, + "ewc_loss": 0.08360767364501953, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004576588107738644, + "grad_norm": 9.878649711608887, + "learning_rate": 1e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8485959768295288, + "num_tokens": 887795907.0, + "step": 23272 + }, + { + "epoch": 2.960564813636942, + "ewc_loss": 0.08306858688592911, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004522679082583636, + "grad_norm": 9.881709098815918, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8659495711326599, + "num_tokens": 887834859.0, + "step": 23273 + }, + { + "epoch": 2.9606920239155325, + "ewc_loss": 0.08319409191608429, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004535229236353189, + "grad_norm": 9.746305465698242, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8621889352798462, + "num_tokens": 887879041.0, + "step": 23274 + }, + { + "epoch": 2.9608192341941226, + "ewc_loss": 0.08345967531204224, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045617876457981765, + "grad_norm": 9.905052185058594, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8601278066635132, + "num_tokens": 887913752.0, + "step": 23275 + }, + { + "epoch": 2.9609464444727136, + "ewc_loss": 0.08307437598705292, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004523257666733116, + "grad_norm": 9.745535850524902, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8713253736495972, + "num_tokens": 887950946.0, + "step": 23276 + }, + { + "epoch": 2.9610736547513037, + "ewc_loss": 0.08390377461910248, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046061977627687156, + "grad_norm": 9.871893882751465, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8623368740081787, + "num_tokens": 887993155.0, + "step": 23277 + }, + { + "epoch": 2.9612008650298947, + "ewc_loss": 0.08289741724729538, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045055619557388127, + "grad_norm": 9.772180557250977, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8652603030204773, + "num_tokens": 888027831.0, + "step": 23278 + }, + { + "epoch": 2.9613280753084847, + "ewc_loss": 0.08362729847431183, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045785505790263414, + "grad_norm": 9.806717872619629, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8751765489578247, + "num_tokens": 888066941.0, + "step": 23279 + }, + { + "epoch": 2.9614552855870753, + "ewc_loss": 0.08315704762935638, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000453152519185096, + "grad_norm": 9.7568941116333, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8800581693649292, + "num_tokens": 888099739.0, + "step": 23280 + }, + { + "epoch": 2.961582495865666, + "ewc_loss": 0.0833551287651062, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004551333258859813, + "grad_norm": 9.779891967773438, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8666585683822632, + "num_tokens": 888139675.0, + "step": 23281 + }, + { + "epoch": 2.9617097061442563, + "ewc_loss": 0.0833728164434433, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004553102480713278, + "grad_norm": 9.741487503051758, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8532781600952148, + "num_tokens": 888179374.0, + "step": 23282 + }, + { + "epoch": 2.961836916422847, + "ewc_loss": 0.0835784301161766, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004573663172777742, + "grad_norm": 9.828774452209473, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8531811237335205, + "num_tokens": 888214139.0, + "step": 23283 + }, + { + "epoch": 2.9619641267014374, + "ewc_loss": 0.08324019610881805, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004539840156212449, + "grad_norm": 9.754270553588867, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8643022775650024, + "num_tokens": 888246365.0, + "step": 23284 + }, + { + "epoch": 2.962091336980028, + "ewc_loss": 0.08362159132957458, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045779795618727803, + "grad_norm": 9.756505966186523, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8601188659667969, + "num_tokens": 888290903.0, + "step": 23285 + }, + { + "epoch": 2.9622185472586184, + "ewc_loss": 0.08358173072338104, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045739932102151215, + "grad_norm": 9.785638809204102, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8650312423706055, + "num_tokens": 888331329.0, + "step": 23286 + }, + { + "epoch": 2.962345757537209, + "ewc_loss": 0.08356049656867981, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004571869794744998, + "grad_norm": 9.696208000183105, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8588043451309204, + "num_tokens": 888372411.0, + "step": 23287 + }, + { + "epoch": 2.9624729678157995, + "ewc_loss": 0.08388900756835938, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046047207433730364, + "grad_norm": 9.8363676071167, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8650798797607422, + "num_tokens": 888409778.0, + "step": 23288 + }, + { + "epoch": 2.96260017809439, + "ewc_loss": 0.08348791301250458, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045646115904673934, + "grad_norm": 9.757648468017578, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8717511296272278, + "num_tokens": 888453593.0, + "step": 23289 + }, + { + "epoch": 2.9627273883729806, + "ewc_loss": 0.08384228497743607, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046000489965081215, + "grad_norm": 9.764625549316406, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.876836895942688, + "num_tokens": 888488245.0, + "step": 23290 + }, + { + "epoch": 2.962854598651571, + "ewc_loss": 0.08350677043199539, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000456649751868099, + "grad_norm": 9.77297306060791, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8646628856658936, + "num_tokens": 888524306.0, + "step": 23291 + }, + { + "epoch": 2.9629818089301616, + "ewc_loss": 0.08383306860923767, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045991275692358613, + "grad_norm": 9.802179336547852, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8767746686935425, + "num_tokens": 888563791.0, + "step": 23292 + }, + { + "epoch": 2.963109019208752, + "ewc_loss": 0.08377169072628021, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045929892803542316, + "grad_norm": 9.787384986877441, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8703050017356873, + "num_tokens": 888605021.0, + "step": 23293 + }, + { + "epoch": 2.9632362294873427, + "ewc_loss": 0.08380922675132751, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045967433834448457, + "grad_norm": 9.804922103881836, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8651975989341736, + "num_tokens": 888647382.0, + "step": 23294 + }, + { + "epoch": 2.963363439765933, + "ewc_loss": 0.08365625143051147, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004581445246003568, + "grad_norm": 9.77951431274414, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8648656606674194, + "num_tokens": 888682174.0, + "step": 23295 + }, + { + "epoch": 2.9634906500445237, + "ewc_loss": 0.08388678729534149, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004604499554261565, + "grad_norm": 9.804567337036133, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.867316722869873, + "num_tokens": 888724401.0, + "step": 23296 + }, + { + "epoch": 2.9636178603231143, + "ewc_loss": 0.08405515551567078, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045969217899255455, + "grad_norm": 9.83232593536377, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8722717761993408, + "num_tokens": 888766230.0, + "step": 23297 + }, + { + "epoch": 2.963745070601705, + "ewc_loss": 0.08365729451179504, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004581549728754908, + "grad_norm": 9.759349822998047, + "learning_rate": 1e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8436672687530518, + "num_tokens": 888811865.0, + "step": 23298 + }, + { + "epoch": 2.9638722808802953, + "ewc_loss": 0.08425705879926682, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00046171119902282953, + "grad_norm": 9.888167381286621, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8667826652526855, + "num_tokens": 888849549.0, + "step": 23299 + }, + { + "epoch": 2.9639994911588854, + "ewc_loss": 0.08361998200416565, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045778186176903546, + "grad_norm": 9.846902847290039, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8683722615242004, + "num_tokens": 888881305.0, + "step": 23300 + }, + { + "epoch": 2.9641267014374764, + "ewc_loss": 0.08384941518306732, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046007620403543115, + "grad_norm": 9.82752799987793, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8702963590621948, + "num_tokens": 888917862.0, + "step": 23301 + }, + { + "epoch": 2.9642539117160664, + "ewc_loss": 0.08379421383142471, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004595241625793278, + "grad_norm": 9.850234985351562, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8664097785949707, + "num_tokens": 888953883.0, + "step": 23302 + }, + { + "epoch": 2.9643811219946574, + "ewc_loss": 0.08355474472045898, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045712949940934777, + "grad_norm": 9.742485046386719, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8831266760826111, + "num_tokens": 888991052.0, + "step": 23303 + }, + { + "epoch": 2.9645083322732475, + "ewc_loss": 0.08401712775230408, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046175328316166997, + "grad_norm": 9.872734069824219, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8770397901535034, + "num_tokens": 889022523.0, + "step": 23304 + }, + { + "epoch": 2.964635542551838, + "ewc_loss": 0.0833781510591507, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045536350808106363, + "grad_norm": 9.787888526916504, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8559169769287109, + "num_tokens": 889063470.0, + "step": 23305 + }, + { + "epoch": 2.9647627528304286, + "ewc_loss": 0.08412393927574158, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046282142284326255, + "grad_norm": 9.887324333190918, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8737369775772095, + "num_tokens": 889101160.0, + "step": 23306 + }, + { + "epoch": 2.964889963109019, + "ewc_loss": 0.0832781046628952, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045436303480528295, + "grad_norm": 9.77804183959961, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8641554117202759, + "num_tokens": 889139128.0, + "step": 23307 + }, + { + "epoch": 2.9650171733876096, + "ewc_loss": 0.08392481505870819, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004608301678672433, + "grad_norm": 9.908435821533203, + "learning_rate": 1e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8419395685195923, + "num_tokens": 889175574.0, + "step": 23308 + }, + { + "epoch": 2.9651443836662, + "ewc_loss": 0.08341595530509949, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045574159594252706, + "grad_norm": 9.82840633392334, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8626813888549805, + "num_tokens": 889216520.0, + "step": 23309 + }, + { + "epoch": 2.9652715939447907, + "ewc_loss": 0.08372560143470764, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004588380397763103, + "grad_norm": 9.855367660522461, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8768925666809082, + "num_tokens": 889256746.0, + "step": 23310 + }, + { + "epoch": 2.965398804223381, + "ewc_loss": 0.08339804410934448, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004555624909698963, + "grad_norm": 9.921253204345703, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8617401123046875, + "num_tokens": 889298868.0, + "step": 23311 + }, + { + "epoch": 2.9655260145019717, + "ewc_loss": 0.08338434994220734, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004554255574475974, + "grad_norm": 9.816142082214355, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8613572120666504, + "num_tokens": 889337758.0, + "step": 23312 + }, + { + "epoch": 2.9656532247805623, + "ewc_loss": 0.08383263647556305, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004599084204528481, + "grad_norm": 9.948619842529297, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8722916841506958, + "num_tokens": 889375308.0, + "step": 23313 + }, + { + "epoch": 2.965780435059153, + "ewc_loss": 0.08304934203624725, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004520754737313837, + "grad_norm": 9.69884204864502, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8725704550743103, + "num_tokens": 889413735.0, + "step": 23314 + }, + { + "epoch": 2.9659076453377433, + "ewc_loss": 0.08425460010766983, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004641280393116176, + "grad_norm": 9.928515434265137, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8670320510864258, + "num_tokens": 889451585.0, + "step": 23315 + }, + { + "epoch": 2.966034855616334, + "ewc_loss": 0.08297783136367798, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045136039261706173, + "grad_norm": 9.735197067260742, + "learning_rate": 1e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8492119312286377, + "num_tokens": 889487405.0, + "step": 23316 + }, + { + "epoch": 2.9661620658949244, + "ewc_loss": 0.08411350846290588, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004627171438187361, + "grad_norm": 9.955968856811523, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8531990051269531, + "num_tokens": 889523051.0, + "step": 23317 + }, + { + "epoch": 2.966289276173515, + "ewc_loss": 0.08315955102443695, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045317751937545836, + "grad_norm": 9.796504020690918, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8475810289382935, + "num_tokens": 889559822.0, + "step": 23318 + }, + { + "epoch": 2.9664164864521054, + "ewc_loss": 0.08387897908687592, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046037184074521065, + "grad_norm": 9.890856742858887, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8635908365249634, + "num_tokens": 889591598.0, + "step": 23319 + }, + { + "epoch": 2.966543696730696, + "ewc_loss": 0.08328860253095627, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045446804142557085, + "grad_norm": 9.757076263427734, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8766764402389526, + "num_tokens": 889629787.0, + "step": 23320 + }, + { + "epoch": 2.9666709070092865, + "ewc_loss": 0.08406178653240204, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046219993964768946, + "grad_norm": 9.895380973815918, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8728600740432739, + "num_tokens": 889668730.0, + "step": 23321 + }, + { + "epoch": 2.966798117287877, + "ewc_loss": 0.08337819576263428, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045536403195001185, + "grad_norm": 9.86034870147705, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8637374639511108, + "num_tokens": 889705215.0, + "step": 23322 + }, + { + "epoch": 2.966925327566467, + "ewc_loss": 0.08382521569728851, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045983423478901386, + "grad_norm": 9.97903823852539, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8684762716293335, + "num_tokens": 889738528.0, + "step": 23323 + }, + { + "epoch": 2.967052537845058, + "ewc_loss": 0.08323566615581512, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004539386718533933, + "grad_norm": 9.8389892578125, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.869307279586792, + "num_tokens": 889771694.0, + "step": 23324 + }, + { + "epoch": 2.967179748123648, + "ewc_loss": 0.08372028172016144, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045878480887040496, + "grad_norm": 9.862458229064941, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8783213496208191, + "num_tokens": 889806096.0, + "step": 23325 + }, + { + "epoch": 2.967306958402239, + "ewc_loss": 0.08338818699121475, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045546391629613936, + "grad_norm": 9.85060977935791, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8663369417190552, + "num_tokens": 889848514.0, + "step": 23326 + }, + { + "epoch": 2.967434168680829, + "ewc_loss": 0.0834910050034523, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045649209641851485, + "grad_norm": 9.787331581115723, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.885671079158783, + "num_tokens": 889889689.0, + "step": 23327 + }, + { + "epoch": 2.96756137895942, + "ewc_loss": 0.08405932784080505, + "ewc_loss_diag": 3.838539123535156e-05, + "ewc_loss_parallel": 0.0004572924808599055, + "grad_norm": 9.899158477783203, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8689388036727905, + "num_tokens": 889925550.0, + "step": 23328 + }, + { + "epoch": 2.9676885892380103, + "ewc_loss": 0.08321191370487213, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004537012137006968, + "grad_norm": 9.795990943908691, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8516744375228882, + "num_tokens": 889961797.0, + "step": 23329 + }, + { + "epoch": 2.967815799516601, + "ewc_loss": 0.08382651954889297, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045984721509739757, + "grad_norm": 10.036941528320312, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8719176054000854, + "num_tokens": 889997935.0, + "step": 23330 + }, + { + "epoch": 2.9679430097951913, + "ewc_loss": 0.08277714252471924, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044935342157259583, + "grad_norm": 9.782939910888672, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.877479612827301, + "num_tokens": 890037900.0, + "step": 23331 + }, + { + "epoch": 2.968070220073782, + "ewc_loss": 0.08391429483890533, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046072498662397265, + "grad_norm": 9.925846099853516, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8726162910461426, + "num_tokens": 890074814.0, + "step": 23332 + }, + { + "epoch": 2.9681974303523724, + "ewc_loss": 0.08280584216117859, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004496404726523906, + "grad_norm": 9.701713562011719, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8791894912719727, + "num_tokens": 890107091.0, + "step": 23333 + }, + { + "epoch": 2.968324640630963, + "ewc_loss": 0.08404141664505005, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046199618373066187, + "grad_norm": 9.972887992858887, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8734298944473267, + "num_tokens": 890140558.0, + "step": 23334 + }, + { + "epoch": 2.9684518509095534, + "ewc_loss": 0.08266519010066986, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004482339136302471, + "grad_norm": 9.716354370117188, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8648694753646851, + "num_tokens": 890174563.0, + "step": 23335 + }, + { + "epoch": 2.968579061188144, + "ewc_loss": 0.08431480824947357, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046473011025227606, + "grad_norm": 9.991559028625488, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8638086318969727, + "num_tokens": 890210584.0, + "step": 23336 + }, + { + "epoch": 2.9687062714667345, + "ewc_loss": 0.08285074681043625, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004500894865486771, + "grad_norm": 9.702474594116211, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8799065947532654, + "num_tokens": 890251955.0, + "step": 23337 + }, + { + "epoch": 2.968833481745325, + "ewc_loss": 0.0841924399137497, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046350641059689224, + "grad_norm": 10.034886360168457, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8616925477981567, + "num_tokens": 890289095.0, + "step": 23338 + }, + { + "epoch": 2.9689606920239155, + "ewc_loss": 0.0827118456363678, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000448700477136299, + "grad_norm": 9.681455612182617, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8679451942443848, + "num_tokens": 890327864.0, + "step": 23339 + }, + { + "epoch": 2.969087902302506, + "ewc_loss": 0.084347665309906, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004650586924981326, + "grad_norm": 10.065433502197266, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8660752773284912, + "num_tokens": 890364244.0, + "step": 23340 + }, + { + "epoch": 2.9692151125810966, + "ewc_loss": 0.0825488343834877, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044707037159241736, + "grad_norm": 9.83868408203125, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8519218564033508, + "num_tokens": 890397652.0, + "step": 23341 + }, + { + "epoch": 2.969342322859687, + "ewc_loss": 0.08395648002624512, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046114687575027347, + "grad_norm": 9.948053359985352, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.871794581413269, + "num_tokens": 890435598.0, + "step": 23342 + }, + { + "epoch": 2.9694695331382777, + "ewc_loss": 0.0829981341958046, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045156339183449745, + "grad_norm": 9.792176246643066, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8700165748596191, + "num_tokens": 890473445.0, + "step": 23343 + }, + { + "epoch": 2.969596743416868, + "ewc_loss": 0.08349556475877762, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004565376730170101, + "grad_norm": 9.853507995605469, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.870091438293457, + "num_tokens": 890514263.0, + "step": 23344 + }, + { + "epoch": 2.9697239536954587, + "ewc_loss": 0.08313842862844467, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045296631287783384, + "grad_norm": 9.784270286560059, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8709458112716675, + "num_tokens": 890554123.0, + "step": 23345 + }, + { + "epoch": 2.9698511639740492, + "ewc_loss": 0.08349287509918213, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004565107519738376, + "grad_norm": 9.91938591003418, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8648129105567932, + "num_tokens": 890593938.0, + "step": 23346 + }, + { + "epoch": 2.9699783742526398, + "ewc_loss": 0.08304286003112793, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004520106012932956, + "grad_norm": 9.760821342468262, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8576533198356628, + "num_tokens": 890632260.0, + "step": 23347 + }, + { + "epoch": 2.97010558453123, + "ewc_loss": 0.08388411998748779, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004604231799021363, + "grad_norm": 10.077914237976074, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8642064929008484, + "num_tokens": 890666852.0, + "step": 23348 + }, + { + "epoch": 2.970232794809821, + "ewc_loss": 0.0825546532869339, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044712855014950037, + "grad_norm": 9.703169822692871, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8615862727165222, + "num_tokens": 890707959.0, + "step": 23349 + }, + { + "epoch": 2.970360005088411, + "ewc_loss": 0.08422510325908661, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046383304288610816, + "grad_norm": 9.995220184326172, + "learning_rate": 1e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8449697494506836, + "num_tokens": 890748729.0, + "step": 23350 + }, + { + "epoch": 2.970487215367002, + "ewc_loss": 0.08257352560758591, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004473172884900123, + "grad_norm": 9.732830047607422, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8800861835479736, + "num_tokens": 890782116.0, + "step": 23351 + }, + { + "epoch": 2.970614425645592, + "ewc_loss": 0.08395293354988098, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004611113981809467, + "grad_norm": 9.907145500183105, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8536450862884521, + "num_tokens": 890827309.0, + "step": 23352 + }, + { + "epoch": 2.970741635924183, + "ewc_loss": 0.08301703631877899, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000451752363005653, + "grad_norm": 9.781133651733398, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8571367859840393, + "num_tokens": 890863879.0, + "step": 23353 + }, + { + "epoch": 2.970868846202773, + "ewc_loss": 0.08389849960803986, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004605670692399144, + "grad_norm": 9.934630393981934, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8690167665481567, + "num_tokens": 890901184.0, + "step": 23354 + }, + { + "epoch": 2.9709960564813636, + "ewc_loss": 0.08305218815803528, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045210387906990945, + "grad_norm": 9.738286972045898, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8750789165496826, + "num_tokens": 890937779.0, + "step": 23355 + }, + { + "epoch": 2.971123266759954, + "ewc_loss": 0.08386927098035812, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000460274750366807, + "grad_norm": 9.892306327819824, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8796033263206482, + "num_tokens": 890978055.0, + "step": 23356 + }, + { + "epoch": 2.9712504770385446, + "ewc_loss": 0.08310714364051819, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004526534175965935, + "grad_norm": 9.815808296203613, + "learning_rate": 1e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.8338781595230103, + "num_tokens": 891010985.0, + "step": 23357 + }, + { + "epoch": 2.971377687317135, + "ewc_loss": 0.08350994437932968, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004566814750432968, + "grad_norm": 9.876602172851562, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.87134850025177, + "num_tokens": 891045904.0, + "step": 23358 + }, + { + "epoch": 2.9715048975957257, + "ewc_loss": 0.0833628922700882, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045521100400947034, + "grad_norm": 9.794897079467773, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8704396486282349, + "num_tokens": 891086240.0, + "step": 23359 + }, + { + "epoch": 2.971632107874316, + "ewc_loss": 0.08363395929336548, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004579216183628887, + "grad_norm": 9.843754768371582, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8629328608512878, + "num_tokens": 891126482.0, + "step": 23360 + }, + { + "epoch": 2.9717593181529067, + "ewc_loss": 0.08335874229669571, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045516944373957813, + "grad_norm": 9.8130464553833, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8545043468475342, + "num_tokens": 891165982.0, + "step": 23361 + }, + { + "epoch": 2.9718865284314973, + "ewc_loss": 0.08361084014177322, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045769044663757086, + "grad_norm": 9.88692855834961, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8511884808540344, + "num_tokens": 891199101.0, + "step": 23362 + }, + { + "epoch": 2.972013738710088, + "ewc_loss": 0.08349016308784485, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045648362720385194, + "grad_norm": 9.823799133300781, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8645364046096802, + "num_tokens": 891231511.0, + "step": 23363 + }, + { + "epoch": 2.9721409489886783, + "ewc_loss": 0.08360093832015991, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045759143540635705, + "grad_norm": 9.956218719482422, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8543820977210999, + "num_tokens": 891262369.0, + "step": 23364 + }, + { + "epoch": 2.972268159267269, + "ewc_loss": 0.08328604698181152, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004544424591585994, + "grad_norm": 9.79540729522705, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8510117530822754, + "num_tokens": 891304206.0, + "step": 23365 + }, + { + "epoch": 2.9723953695458594, + "ewc_loss": 0.08371530473232269, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004587351286318153, + "grad_norm": 9.86972713470459, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.853496789932251, + "num_tokens": 891348006.0, + "step": 23366 + }, + { + "epoch": 2.97252257982445, + "ewc_loss": 0.0830112099647522, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045169409713707864, + "grad_norm": 9.730761528015137, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8730219602584839, + "num_tokens": 891386851.0, + "step": 23367 + }, + { + "epoch": 2.9726497901030404, + "ewc_loss": 0.08370359987020493, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004586180439218879, + "grad_norm": 9.86898136138916, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8748453855514526, + "num_tokens": 891423582.0, + "step": 23368 + }, + { + "epoch": 2.972777000381631, + "ewc_loss": 0.08302459120750427, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045182794565334916, + "grad_norm": 9.711745262145996, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8822771906852722, + "num_tokens": 891463773.0, + "step": 23369 + }, + { + "epoch": 2.9729042106602215, + "ewc_loss": 0.08368406444787979, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004584226699080318, + "grad_norm": 9.904280662536621, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8756502866744995, + "num_tokens": 891498117.0, + "step": 23370 + }, + { + "epoch": 2.973031420938812, + "ewc_loss": 0.08315630257129669, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045314503950066864, + "grad_norm": 9.747718811035156, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8746611475944519, + "num_tokens": 891529055.0, + "step": 23371 + }, + { + "epoch": 2.9731586312174025, + "ewc_loss": 0.083820641040802, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004597884544637054, + "grad_norm": 9.857483863830566, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8744997382164001, + "num_tokens": 891567370.0, + "step": 23372 + }, + { + "epoch": 2.9732858414959926, + "ewc_loss": 0.08333393931388855, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004549214500002563, + "grad_norm": 9.834771156311035, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8691074848175049, + "num_tokens": 891602737.0, + "step": 23373 + }, + { + "epoch": 2.9734130517745836, + "ewc_loss": 0.08361995220184326, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045778159983456135, + "grad_norm": 9.904788970947266, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8595870733261108, + "num_tokens": 891639713.0, + "step": 23374 + }, + { + "epoch": 2.9735402620531737, + "ewc_loss": 0.08320663869380951, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045364844845607877, + "grad_norm": 9.781340599060059, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8785406351089478, + "num_tokens": 891673931.0, + "step": 23375 + }, + { + "epoch": 2.9736674723317646, + "ewc_loss": 0.083720862865448, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045879065874032676, + "grad_norm": 9.89717960357666, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8638592958450317, + "num_tokens": 891708015.0, + "step": 23376 + }, + { + "epoch": 2.9737946826103547, + "ewc_loss": 0.08304283022880554, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004520103393588215, + "grad_norm": 9.700820922851562, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8619920015335083, + "num_tokens": 891746023.0, + "step": 23377 + }, + { + "epoch": 2.9739218928889453, + "ewc_loss": 0.08393405377864838, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004609226016327739, + "grad_norm": 9.877459526062012, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.864867627620697, + "num_tokens": 891777767.0, + "step": 23378 + }, + { + "epoch": 2.974049103167536, + "ewc_loss": 0.08302197605371475, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000451801810413599, + "grad_norm": 9.758450508117676, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.86911940574646, + "num_tokens": 891810816.0, + "step": 23379 + }, + { + "epoch": 2.9741763134461263, + "ewc_loss": 0.08374525606632233, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045903463615104556, + "grad_norm": 9.836017608642578, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8588504195213318, + "num_tokens": 891848810.0, + "step": 23380 + }, + { + "epoch": 2.974303523724717, + "ewc_loss": 0.08330415189266205, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004546236013993621, + "grad_norm": 9.719021797180176, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8686540722846985, + "num_tokens": 891892553.0, + "step": 23381 + }, + { + "epoch": 2.9744307340033074, + "ewc_loss": 0.0838005393743515, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045958743430674076, + "grad_norm": 9.93538761138916, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8770867586135864, + "num_tokens": 891927539.0, + "step": 23382 + }, + { + "epoch": 2.974557944281898, + "ewc_loss": 0.08303135633468628, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004518956411629915, + "grad_norm": 9.685293197631836, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8695573806762695, + "num_tokens": 891959577.0, + "step": 23383 + }, + { + "epoch": 2.9746851545604884, + "ewc_loss": 0.08447563648223877, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.0004638970422092825, + "grad_norm": 9.946264266967773, + "learning_rate": 1e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8379347324371338, + "num_tokens": 892000799.0, + "step": 23384 + }, + { + "epoch": 2.974812364839079, + "ewc_loss": 0.08292597532272339, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004508417332544923, + "grad_norm": 9.750393867492676, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8587743043899536, + "num_tokens": 892042200.0, + "step": 23385 + }, + { + "epoch": 2.9749395751176695, + "ewc_loss": 0.0839521586894989, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046110356925055385, + "grad_norm": 9.922867774963379, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8556898832321167, + "num_tokens": 892084838.0, + "step": 23386 + }, + { + "epoch": 2.97506678539626, + "ewc_loss": 0.0832604244351387, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004541862872429192, + "grad_norm": 9.683355331420898, + "learning_rate": 1e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8493781685829163, + "num_tokens": 892126442.0, + "step": 23387 + }, + { + "epoch": 2.9751939956748505, + "ewc_loss": 0.08432149887084961, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046479699085466564, + "grad_norm": 10.033232688903809, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8530784249305725, + "num_tokens": 892165019.0, + "step": 23388 + }, + { + "epoch": 2.975321205953441, + "ewc_loss": 0.0828368216753006, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044995025382377207, + "grad_norm": 9.670709609985352, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8681796789169312, + "num_tokens": 892202291.0, + "step": 23389 + }, + { + "epoch": 2.9754484162320316, + "ewc_loss": 0.08444283157587051, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004660103586502373, + "grad_norm": 9.968073844909668, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8564304113388062, + "num_tokens": 892235159.0, + "step": 23390 + }, + { + "epoch": 2.975575626510622, + "ewc_loss": 0.08298070728778839, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045138911809772253, + "grad_norm": 9.807554244995117, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8533210158348083, + "num_tokens": 892271967.0, + "step": 23391 + }, + { + "epoch": 2.9757028367892127, + "ewc_loss": 0.08398254960775375, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000461407529655844, + "grad_norm": 9.910944938659668, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8504838943481445, + "num_tokens": 892306230.0, + "step": 23392 + }, + { + "epoch": 2.975830047067803, + "ewc_loss": 0.08320566266775131, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004536386695690453, + "grad_norm": 9.778980255126953, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8663172721862793, + "num_tokens": 892342118.0, + "step": 23393 + }, + { + "epoch": 2.9759572573463937, + "ewc_loss": 0.08382423222064972, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045982436859048903, + "grad_norm": 9.907121658325195, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.871127188205719, + "num_tokens": 892380764.0, + "step": 23394 + }, + { + "epoch": 2.9760844676249842, + "ewc_loss": 0.0832238644361496, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004538207140285522, + "grad_norm": 9.797613143920898, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8620119690895081, + "num_tokens": 892421113.0, + "step": 23395 + }, + { + "epoch": 2.9762116779035748, + "ewc_loss": 0.08358295261859894, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045741157373413444, + "grad_norm": 9.828190803527832, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8734757900238037, + "num_tokens": 892457512.0, + "step": 23396 + }, + { + "epoch": 2.9763388881821653, + "ewc_loss": 0.08327880501747131, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004543701361399144, + "grad_norm": 9.77214527130127, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8714208602905273, + "num_tokens": 892493599.0, + "step": 23397 + }, + { + "epoch": 2.9764660984607554, + "ewc_loss": 0.08367621153593063, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045834414777345955, + "grad_norm": 9.830591201782227, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8553905487060547, + "num_tokens": 892528474.0, + "step": 23398 + }, + { + "epoch": 2.9765933087393464, + "ewc_loss": 0.08322553336620331, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004538373614195734, + "grad_norm": 9.714025497436523, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8726550340652466, + "num_tokens": 892569591.0, + "step": 23399 + }, + { + "epoch": 2.9767205190179364, + "ewc_loss": 0.08375845849514008, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004591665929183364, + "grad_norm": 9.794997215270996, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8695021867752075, + "num_tokens": 892607185.0, + "step": 23400 + }, + { + "epoch": 2.9768477292965274, + "ewc_loss": 0.08344369381666183, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004560189845506102, + "grad_norm": 9.813796997070312, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8680760264396667, + "num_tokens": 892641453.0, + "step": 23401 + }, + { + "epoch": 2.9769749395751175, + "ewc_loss": 0.08363518118858337, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004579338419716805, + "grad_norm": 9.827520370483398, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8596593737602234, + "num_tokens": 892676802.0, + "step": 23402 + }, + { + "epoch": 2.977102149853708, + "ewc_loss": 0.08359029144048691, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045748494449071586, + "grad_norm": 9.821086883544922, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.877456784248352, + "num_tokens": 892711694.0, + "step": 23403 + }, + { + "epoch": 2.9772293601322986, + "ewc_loss": 0.0834437906742096, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004560199158731848, + "grad_norm": 9.851302146911621, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8562201261520386, + "num_tokens": 892752662.0, + "step": 23404 + }, + { + "epoch": 2.977356570410889, + "ewc_loss": 0.08329129219055176, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004544949915725738, + "grad_norm": 9.792823791503906, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8654226064682007, + "num_tokens": 892790170.0, + "step": 23405 + }, + { + "epoch": 2.9774837806894796, + "ewc_loss": 0.08355531841516495, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045713523286394775, + "grad_norm": 9.89197063446045, + "learning_rate": 1e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.8485653400421143, + "num_tokens": 892831437.0, + "step": 23406 + }, + { + "epoch": 2.97761099096807, + "ewc_loss": 0.08285756409168243, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004501576768234372, + "grad_norm": 9.707986831665039, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8872857689857483, + "num_tokens": 892869022.0, + "step": 23407 + }, + { + "epoch": 2.9777382012466607, + "ewc_loss": 0.08389496803283691, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004605317662935704, + "grad_norm": 9.923203468322754, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8663896322250366, + "num_tokens": 892910738.0, + "step": 23408 + }, + { + "epoch": 2.977865411525251, + "ewc_loss": 0.08285132050514221, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000450095278210938, + "grad_norm": 9.704366683959961, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.874165415763855, + "num_tokens": 892952055.0, + "step": 23409 + }, + { + "epoch": 2.9779926218038417, + "ewc_loss": 0.08398666977882385, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046144871157594025, + "grad_norm": 9.985536575317383, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8573222160339355, + "num_tokens": 892985843.0, + "step": 23410 + }, + { + "epoch": 2.9781198320824323, + "ewc_loss": 0.08271194994449615, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044870158308185637, + "grad_norm": 9.671117782592773, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8555632829666138, + "num_tokens": 893031197.0, + "step": 23411 + }, + { + "epoch": 2.978247042361023, + "ewc_loss": 0.08439194411039352, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046550147817470133, + "grad_norm": 10.003835678100586, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8804197311401367, + "num_tokens": 893068113.0, + "step": 23412 + }, + { + "epoch": 2.9783742526396133, + "ewc_loss": 0.08249212801456451, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004465032834559679, + "grad_norm": 9.621789932250977, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8773030042648315, + "num_tokens": 893109148.0, + "step": 23413 + }, + { + "epoch": 2.978501462918204, + "ewc_loss": 0.0845353752374649, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004669357731472701, + "grad_norm": 9.968286514282227, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8715765476226807, + "num_tokens": 893151406.0, + "step": 23414 + }, + { + "epoch": 2.9786286731967944, + "ewc_loss": 0.0828230082988739, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044981209794059396, + "grad_norm": 9.641783714294434, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8703979849815369, + "num_tokens": 893190209.0, + "step": 23415 + }, + { + "epoch": 2.978755883475385, + "ewc_loss": 0.08453813940286636, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000466963421786204, + "grad_norm": 9.958965301513672, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8648229837417603, + "num_tokens": 893232949.0, + "step": 23416 + }, + { + "epoch": 2.9788830937539754, + "ewc_loss": 0.08309942483901978, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004525763215497136, + "grad_norm": 9.721512794494629, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.865870475769043, + "num_tokens": 893273931.0, + "step": 23417 + }, + { + "epoch": 2.979010304032566, + "ewc_loss": 0.0843178778886795, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046476078568957746, + "grad_norm": 9.88778018951416, + "learning_rate": 1e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8447622060775757, + "num_tokens": 893317465.0, + "step": 23418 + }, + { + "epoch": 2.9791375143111565, + "ewc_loss": 0.0834985077381134, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045656715519726276, + "grad_norm": 9.790315628051758, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8504714965820312, + "num_tokens": 893354089.0, + "step": 23419 + }, + { + "epoch": 2.979264724589747, + "ewc_loss": 0.08389948308467865, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004605768481269479, + "grad_norm": 9.872105598449707, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8619452714920044, + "num_tokens": 893388051.0, + "step": 23420 + }, + { + "epoch": 2.979391934868337, + "ewc_loss": 0.08372725546360016, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045885462895967066, + "grad_norm": 9.792826652526855, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8600214123725891, + "num_tokens": 893426191.0, + "step": 23421 + }, + { + "epoch": 2.979519145146928, + "ewc_loss": 0.08423852920532227, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004639672697521746, + "grad_norm": 9.937857627868652, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8579185009002686, + "num_tokens": 893462134.0, + "step": 23422 + }, + { + "epoch": 2.979646355425518, + "ewc_loss": 0.08344489336013794, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045603091712109745, + "grad_norm": 9.837549209594727, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8602937459945679, + "num_tokens": 893503159.0, + "step": 23423 + }, + { + "epoch": 2.979773565704109, + "ewc_loss": 0.08414503931999207, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046303245471790433, + "grad_norm": 9.937277793884277, + "learning_rate": 1e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.8470171689987183, + "num_tokens": 893538379.0, + "step": 23424 + }, + { + "epoch": 2.979900775982699, + "ewc_loss": 0.08334814012050629, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045506341848522425, + "grad_norm": 9.75948715209961, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8527429103851318, + "num_tokens": 893580431.0, + "step": 23425 + }, + { + "epoch": 2.98002798626129, + "ewc_loss": 0.08399523794651031, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046153442235663533, + "grad_norm": 9.921858787536621, + "learning_rate": 1e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8594534993171692, + "num_tokens": 893622412.0, + "step": 23426 + }, + { + "epoch": 2.9801551965398803, + "ewc_loss": 0.08343924582004547, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045597454300150275, + "grad_norm": 9.760469436645508, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.868392288684845, + "num_tokens": 893658575.0, + "step": 23427 + }, + { + "epoch": 2.980282406818471, + "ewc_loss": 0.08402028679847717, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004617849481292069, + "grad_norm": 9.845983505249023, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8727689981460571, + "num_tokens": 893702812.0, + "step": 23428 + }, + { + "epoch": 2.9804096170970613, + "ewc_loss": 0.08339067548513412, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004554887709673494, + "grad_norm": 9.758331298828125, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.857632577419281, + "num_tokens": 893742785.0, + "step": 23429 + }, + { + "epoch": 2.980536827375652, + "ewc_loss": 0.08430042862892151, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046458636643365026, + "grad_norm": 9.90239143371582, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8722524046897888, + "num_tokens": 893782310.0, + "step": 23430 + }, + { + "epoch": 2.9806640376542424, + "ewc_loss": 0.08351599425077438, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045674198190681636, + "grad_norm": 9.761382102966309, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8820035457611084, + "num_tokens": 893822794.0, + "step": 23431 + }, + { + "epoch": 2.980791247932833, + "ewc_loss": 0.08407524973154068, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046233454486355186, + "grad_norm": 9.876906394958496, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8717854022979736, + "num_tokens": 893870462.0, + "step": 23432 + }, + { + "epoch": 2.9809184582114234, + "ewc_loss": 0.0836808905005455, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000458390946732834, + "grad_norm": 9.809394836425781, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8641924262046814, + "num_tokens": 893910654.0, + "step": 23433 + }, + { + "epoch": 2.981045668490014, + "ewc_loss": 0.08409042656421661, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004624863213393837, + "grad_norm": 9.892961502075195, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8627883195877075, + "num_tokens": 893948819.0, + "step": 23434 + }, + { + "epoch": 2.9811728787686045, + "ewc_loss": 0.0835484117269516, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004570661985781044, + "grad_norm": 9.85926342010498, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8711875081062317, + "num_tokens": 893983325.0, + "step": 23435 + }, + { + "epoch": 2.981300089047195, + "ewc_loss": 0.08386364579200745, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004602184926625341, + "grad_norm": 9.878368377685547, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8639992475509644, + "num_tokens": 894023712.0, + "step": 23436 + }, + { + "epoch": 2.9814272993257855, + "ewc_loss": 0.08377163857221603, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045929840416647494, + "grad_norm": 9.842313766479492, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8753626346588135, + "num_tokens": 894059719.0, + "step": 23437 + }, + { + "epoch": 2.981554509604376, + "ewc_loss": 0.08375363051891327, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045911830966360867, + "grad_norm": 9.847636222839355, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8663859963417053, + "num_tokens": 894100150.0, + "step": 23438 + }, + { + "epoch": 2.9816817198829666, + "ewc_loss": 0.08360085636377335, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004575905913952738, + "grad_norm": 9.811641693115234, + "learning_rate": 1e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8447035551071167, + "num_tokens": 894134336.0, + "step": 23439 + }, + { + "epoch": 2.981808930161557, + "ewc_loss": 0.08386265486478806, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046020859736017883, + "grad_norm": 9.928093910217285, + "learning_rate": 1e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.8511204719543457, + "num_tokens": 894168972.0, + "step": 23440 + }, + { + "epoch": 2.9819361404401477, + "ewc_loss": 0.08326081931591034, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004541901871562004, + "grad_norm": 9.793728828430176, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8624177575111389, + "num_tokens": 894204641.0, + "step": 23441 + }, + { + "epoch": 2.982063350718738, + "ewc_loss": 0.08383937180042267, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004599757376126945, + "grad_norm": 9.861944198608398, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8673363924026489, + "num_tokens": 894248403.0, + "step": 23442 + }, + { + "epoch": 2.9821905609973287, + "ewc_loss": 0.08338092267513275, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045539127313531935, + "grad_norm": 9.755877494812012, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.882355809211731, + "num_tokens": 894284035.0, + "step": 23443 + }, + { + "epoch": 2.9823177712759192, + "ewc_loss": 0.08402305096387863, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004618125385604799, + "grad_norm": 9.922490119934082, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8704094886779785, + "num_tokens": 894321804.0, + "step": 23444 + }, + { + "epoch": 2.9824449815545098, + "ewc_loss": 0.08323357254266739, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004539177753031254, + "grad_norm": 9.707442283630371, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8727243542671204, + "num_tokens": 894355648.0, + "step": 23445 + }, + { + "epoch": 2.9825721918331, + "ewc_loss": 0.08405985683202744, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004621806147042662, + "grad_norm": 9.960855484008789, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8653963804244995, + "num_tokens": 894392947.0, + "step": 23446 + }, + { + "epoch": 2.982699402111691, + "ewc_loss": 0.08325150609016418, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045409705489873886, + "grad_norm": 9.760002136230469, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8546251058578491, + "num_tokens": 894433437.0, + "step": 23447 + }, + { + "epoch": 2.982826612390281, + "ewc_loss": 0.08396066725254059, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004611887561623007, + "grad_norm": 9.865058898925781, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8707354068756104, + "num_tokens": 894481428.0, + "step": 23448 + }, + { + "epoch": 2.982953822668872, + "ewc_loss": 0.08346794545650482, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045626144856214523, + "grad_norm": 9.814273834228516, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8657799959182739, + "num_tokens": 894518055.0, + "step": 23449 + }, + { + "epoch": 2.983081032947462, + "ewc_loss": 0.08387532085180283, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046033525723032653, + "grad_norm": 9.844423294067383, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8594123125076294, + "num_tokens": 894562693.0, + "step": 23450 + }, + { + "epoch": 2.983208243226053, + "ewc_loss": 0.08362279087305069, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004578099469654262, + "grad_norm": 9.810859680175781, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8631513118743896, + "num_tokens": 894604973.0, + "step": 23451 + }, + { + "epoch": 2.983335453504643, + "ewc_loss": 0.08374828845262527, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045906490413472056, + "grad_norm": 9.79824161529541, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8714819550514221, + "num_tokens": 894645672.0, + "step": 23452 + }, + { + "epoch": 2.9834626637832335, + "ewc_loss": 0.08382917940616608, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004598738451022655, + "grad_norm": 9.85543441772461, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8622556328773499, + "num_tokens": 894686702.0, + "step": 23453 + }, + { + "epoch": 2.983589874061824, + "ewc_loss": 0.08363527059555054, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045793477329425514, + "grad_norm": 9.837562561035156, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8737255334854126, + "num_tokens": 894722768.0, + "step": 23454 + }, + { + "epoch": 2.9837170843404146, + "ewc_loss": 0.08381804823875427, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000459762493846938, + "grad_norm": 9.90196418762207, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8776767253875732, + "num_tokens": 894759216.0, + "step": 23455 + }, + { + "epoch": 2.983844294619005, + "ewc_loss": 0.08326476067304611, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045422965195029974, + "grad_norm": 9.763720512390137, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8697944283485413, + "num_tokens": 894799691.0, + "step": 23456 + }, + { + "epoch": 2.9839715048975957, + "ewc_loss": 0.08384126424789429, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004599947133101523, + "grad_norm": 9.896181106567383, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8612947463989258, + "num_tokens": 894835869.0, + "step": 23457 + }, + { + "epoch": 2.984098715176186, + "ewc_loss": 0.08328983932733536, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045448041055351496, + "grad_norm": 9.913039207458496, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8437416553497314, + "num_tokens": 894875706.0, + "step": 23458 + }, + { + "epoch": 2.9842259254547767, + "ewc_loss": 0.08340853452682495, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045566735207103193, + "grad_norm": 9.77295970916748, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8713123798370361, + "num_tokens": 894920135.0, + "step": 23459 + }, + { + "epoch": 2.9843531357333672, + "ewc_loss": 0.08396600931882858, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046124213258735836, + "grad_norm": 9.863508224487305, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.867624819278717, + "num_tokens": 894960825.0, + "step": 23460 + }, + { + "epoch": 2.9844803460119578, + "ewc_loss": 0.0832487940788269, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045406995923258364, + "grad_norm": 9.78917121887207, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.867067277431488, + "num_tokens": 895001935.0, + "step": 23461 + }, + { + "epoch": 2.9846075562905483, + "ewc_loss": 0.08381174504756927, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045969945495016873, + "grad_norm": 9.90351390838623, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8745115995407104, + "num_tokens": 895033559.0, + "step": 23462 + }, + { + "epoch": 2.984734766569139, + "ewc_loss": 0.08326534181833267, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045423544361256063, + "grad_norm": 9.753660202026367, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8661652207374573, + "num_tokens": 895070958.0, + "step": 23463 + }, + { + "epoch": 2.9848619768477294, + "ewc_loss": 0.08413778245449066, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004629598115570843, + "grad_norm": 9.932168960571289, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8643640279769897, + "num_tokens": 895111533.0, + "step": 23464 + }, + { + "epoch": 2.98498918712632, + "ewc_loss": 0.0831817016005516, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045339902862906456, + "grad_norm": 9.712685585021973, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8611778020858765, + "num_tokens": 895149861.0, + "step": 23465 + }, + { + "epoch": 2.9851163974049104, + "ewc_loss": 0.08425063639879227, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046408839989453554, + "grad_norm": 9.915315628051758, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8724326491355896, + "num_tokens": 895192958.0, + "step": 23466 + }, + { + "epoch": 2.985243607683501, + "ewc_loss": 0.08318602293729782, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045344227692112327, + "grad_norm": 9.760858535766602, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8763148784637451, + "num_tokens": 895226710.0, + "step": 23467 + }, + { + "epoch": 2.9853708179620915, + "ewc_loss": 0.08410687744617462, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046265084529295564, + "grad_norm": 9.857498168945312, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8686681985855103, + "num_tokens": 895267695.0, + "step": 23468 + }, + { + "epoch": 2.985498028240682, + "ewc_loss": 0.08345629274845123, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045614494592882693, + "grad_norm": 9.77846622467041, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8804991245269775, + "num_tokens": 895303101.0, + "step": 23469 + }, + { + "epoch": 2.9856252385192725, + "ewc_loss": 0.08388695120811462, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046045149792917073, + "grad_norm": 9.875568389892578, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8709641695022583, + "num_tokens": 895340437.0, + "step": 23470 + }, + { + "epoch": 2.9857524487978626, + "ewc_loss": 0.08340749144554138, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004556569328997284, + "grad_norm": 9.736419677734375, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8818613886833191, + "num_tokens": 895377695.0, + "step": 23471 + }, + { + "epoch": 2.9858796590764536, + "ewc_loss": 0.08408400416374207, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046242206008173525, + "grad_norm": 9.90762996673584, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8573305606842041, + "num_tokens": 895418599.0, + "step": 23472 + }, + { + "epoch": 2.9860068693550437, + "ewc_loss": 0.0833132416009903, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004547144053503871, + "grad_norm": 9.74378490447998, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8674128651618958, + "num_tokens": 895454832.0, + "step": 23473 + }, + { + "epoch": 2.9861340796336346, + "ewc_loss": 0.08428265154361725, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004644086002372205, + "grad_norm": 9.860950469970703, + "learning_rate": 1e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8480534553527832, + "num_tokens": 895490607.0, + "step": 23474 + }, + { + "epoch": 2.9862612899122247, + "ewc_loss": 0.08343599736690521, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045594197581522167, + "grad_norm": 9.738224983215332, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8601663112640381, + "num_tokens": 895531340.0, + "step": 23475 + }, + { + "epoch": 2.9863885001908153, + "ewc_loss": 0.08413630723953247, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046294511412270367, + "grad_norm": 9.866101264953613, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8661270141601562, + "num_tokens": 895568688.0, + "step": 23476 + }, + { + "epoch": 2.986515710469406, + "ewc_loss": 0.08342476189136505, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045582966413348913, + "grad_norm": 9.723645210266113, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8546469211578369, + "num_tokens": 895607316.0, + "step": 23477 + }, + { + "epoch": 2.9866429207479963, + "ewc_loss": 0.08443091809749603, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046589120756834745, + "grad_norm": 9.914770126342773, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8836619853973389, + "num_tokens": 895648694.0, + "step": 23478 + }, + { + "epoch": 2.986770131026587, + "ewc_loss": 0.08338715136051178, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004554535844363272, + "grad_norm": 9.752187728881836, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8607439994812012, + "num_tokens": 895687996.0, + "step": 23479 + }, + { + "epoch": 2.9868973413051774, + "ewc_loss": 0.08427943289279938, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046437635319307446, + "grad_norm": 9.928838729858398, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8702400326728821, + "num_tokens": 895726727.0, + "step": 23480 + }, + { + "epoch": 2.987024551583768, + "ewc_loss": 0.0835220068693161, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004568020813167095, + "grad_norm": 9.751758575439453, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8705201745033264, + "num_tokens": 895760688.0, + "step": 23481 + }, + { + "epoch": 2.9871517618623584, + "ewc_loss": 0.0844041034579277, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046562307397834957, + "grad_norm": 9.920233726501465, + "learning_rate": 1e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8461052179336548, + "num_tokens": 895797303.0, + "step": 23482 + }, + { + "epoch": 2.987278972140949, + "ewc_loss": 0.08352030813694, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045678511378355324, + "grad_norm": 9.7774019241333, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8706046342849731, + "num_tokens": 895834078.0, + "step": 23483 + }, + { + "epoch": 2.9874061824195395, + "ewc_loss": 0.08442084491252899, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004657904792111367, + "grad_norm": 9.874752044677734, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.854904294013977, + "num_tokens": 895872640.0, + "step": 23484 + }, + { + "epoch": 2.98753339269813, + "ewc_loss": 0.08371864259243011, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004587685107253492, + "grad_norm": 9.811945915222168, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8800750970840454, + "num_tokens": 895907646.0, + "step": 23485 + }, + { + "epoch": 2.9876606029767205, + "ewc_loss": 0.08418780565261841, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046346013550646603, + "grad_norm": 9.875594139099121, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8756715059280396, + "num_tokens": 895944183.0, + "step": 23486 + }, + { + "epoch": 2.987787813255311, + "ewc_loss": 0.0839480459690094, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046106247464194894, + "grad_norm": 9.826363563537598, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8677475452423096, + "num_tokens": 895983987.0, + "step": 23487 + }, + { + "epoch": 2.9879150235339016, + "ewc_loss": 0.08411762118339539, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046275820932351053, + "grad_norm": 9.897882461547852, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8536709547042847, + "num_tokens": 896020928.0, + "step": 23488 + }, + { + "epoch": 2.988042233812492, + "ewc_loss": 0.0839126706123352, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004607087466865778, + "grad_norm": 9.821532249450684, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8727878928184509, + "num_tokens": 896057459.0, + "step": 23489 + }, + { + "epoch": 2.9881694440910826, + "ewc_loss": 0.0841594785451889, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004631768388208002, + "grad_norm": 9.884988784790039, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.862289547920227, + "num_tokens": 896095049.0, + "step": 23490 + }, + { + "epoch": 2.988296654369673, + "ewc_loss": 0.0838218480348587, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045980053255334496, + "grad_norm": 9.832984924316406, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8764231204986572, + "num_tokens": 896127433.0, + "step": 23491 + }, + { + "epoch": 2.9884238646482637, + "ewc_loss": 0.08411777764558792, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046275981003418565, + "grad_norm": 9.826386451721191, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8721741437911987, + "num_tokens": 896165662.0, + "step": 23492 + }, + { + "epoch": 2.9885510749268542, + "ewc_loss": 0.083974689245224, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046132897841744125, + "grad_norm": 9.837873458862305, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8620675206184387, + "num_tokens": 896207753.0, + "step": 23493 + }, + { + "epoch": 2.9886782852054448, + "ewc_loss": 0.08392317593097687, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046081384061835706, + "grad_norm": 9.847143173217773, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8637689352035522, + "num_tokens": 896249105.0, + "step": 23494 + }, + { + "epoch": 2.9888054954840353, + "ewc_loss": 0.08389122784137726, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004604942805599421, + "grad_norm": 9.837503433227539, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8655319213867188, + "num_tokens": 896291728.0, + "step": 23495 + }, + { + "epoch": 2.9889327057626254, + "ewc_loss": 0.08412066102027893, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046278868103399873, + "grad_norm": 9.8843994140625, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8549585342407227, + "num_tokens": 896331729.0, + "step": 23496 + }, + { + "epoch": 2.9890599160412163, + "ewc_loss": 0.08390987664461136, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046068080700933933, + "grad_norm": 9.815688133239746, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.855870246887207, + "num_tokens": 896371804.0, + "step": 23497 + }, + { + "epoch": 2.9891871263198064, + "ewc_loss": 0.08425027132034302, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046408476191572845, + "grad_norm": 9.932389259338379, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8616454005241394, + "num_tokens": 896408398.0, + "step": 23498 + }, + { + "epoch": 2.9893143365983974, + "ewc_loss": 0.0835152119398117, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045673418208025396, + "grad_norm": 9.83635139465332, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8809041380882263, + "num_tokens": 896440496.0, + "step": 23499 + }, + { + "epoch": 2.9894415468769875, + "ewc_loss": 0.08412064611911774, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004627884482033551, + "grad_norm": 9.966531753540039, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8681347966194153, + "num_tokens": 896475212.0, + "step": 23500 + }, + { + "epoch": 2.989568757155578, + "ewc_loss": 0.08345048129558563, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004560868546832353, + "grad_norm": 9.800400733947754, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8609157800674438, + "num_tokens": 896517122.0, + "step": 23501 + }, + { + "epoch": 2.9896959674341685, + "ewc_loss": 0.08417443931102753, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004633263743016869, + "grad_norm": 9.982224464416504, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8723605871200562, + "num_tokens": 896557259.0, + "step": 23502 + }, + { + "epoch": 2.989823177712759, + "ewc_loss": 0.08333701640367508, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045495221274904907, + "grad_norm": 9.797772407531738, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8568909764289856, + "num_tokens": 896600237.0, + "step": 23503 + }, + { + "epoch": 2.9899503879913496, + "ewc_loss": 0.08416485786437988, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004632306518033147, + "grad_norm": 9.933560371398926, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8777266144752502, + "num_tokens": 896640564.0, + "step": 23504 + }, + { + "epoch": 2.99007759826994, + "ewc_loss": 0.0834137573838234, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045571959344670177, + "grad_norm": 9.796287536621094, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.87678062915802, + "num_tokens": 896674164.0, + "step": 23505 + }, + { + "epoch": 2.9902048085485307, + "ewc_loss": 0.0841439887881279, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000463021919131279, + "grad_norm": 9.987505912780762, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8596478700637817, + "num_tokens": 896716613.0, + "step": 23506 + }, + { + "epoch": 2.990332018827121, + "ewc_loss": 0.08336726576089859, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004552546888589859, + "grad_norm": 9.828082084655762, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8557824492454529, + "num_tokens": 896755586.0, + "step": 23507 + }, + { + "epoch": 2.9904592291057117, + "ewc_loss": 0.084203340113163, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046361543354578316, + "grad_norm": 9.965194702148438, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8633663654327393, + "num_tokens": 896797641.0, + "step": 23508 + }, + { + "epoch": 2.9905864393843022, + "ewc_loss": 0.08339090645313263, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004554911283776164, + "grad_norm": 9.82686996459961, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8627777099609375, + "num_tokens": 896835113.0, + "step": 23509 + }, + { + "epoch": 2.9907136496628928, + "ewc_loss": 0.08419369161128998, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004635189543478191, + "grad_norm": 10.023796081542969, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8653576970100403, + "num_tokens": 896866496.0, + "step": 23510 + }, + { + "epoch": 2.9908408599414833, + "ewc_loss": 0.0832371711730957, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045395377674140036, + "grad_norm": 9.775127410888672, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8630697727203369, + "num_tokens": 896907535.0, + "step": 23511 + }, + { + "epoch": 2.990968070220074, + "ewc_loss": 0.08419545739889145, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046353662037290633, + "grad_norm": 9.962560653686523, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8829732537269592, + "num_tokens": 896948574.0, + "step": 23512 + }, + { + "epoch": 2.9910952804986644, + "ewc_loss": 0.08325526118278503, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045413465704768896, + "grad_norm": 9.836718559265137, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8599358797073364, + "num_tokens": 896992377.0, + "step": 23513 + }, + { + "epoch": 2.991222490777255, + "ewc_loss": 0.08398433029651642, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046142537030391395, + "grad_norm": 9.924973487854004, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8610763549804688, + "num_tokens": 897035411.0, + "step": 23514 + }, + { + "epoch": 2.9913497010558454, + "ewc_loss": 0.08350356668233871, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045661767944693565, + "grad_norm": 9.81829833984375, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8689749836921692, + "num_tokens": 897074661.0, + "step": 23515 + }, + { + "epoch": 2.991476911334436, + "ewc_loss": 0.08407402038574219, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046232223394326866, + "grad_norm": 9.998127937316895, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8678611516952515, + "num_tokens": 897115550.0, + "step": 23516 + }, + { + "epoch": 2.9916041216130265, + "ewc_loss": 0.0834093689918518, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000455675704870373, + "grad_norm": 9.796845436096191, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8594446182250977, + "num_tokens": 897155091.0, + "step": 23517 + }, + { + "epoch": 2.991731331891617, + "ewc_loss": 0.08455397188663483, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00046468034270219505, + "grad_norm": 10.042009353637695, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8613460659980774, + "num_tokens": 897196551.0, + "step": 23518 + }, + { + "epoch": 2.991858542170207, + "ewc_loss": 0.08306726813316345, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004522547242231667, + "grad_norm": 9.715407371520996, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.875978946685791, + "num_tokens": 897231885.0, + "step": 23519 + }, + { + "epoch": 2.991985752448798, + "ewc_loss": 0.08468630164861679, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004684450395870954, + "grad_norm": 10.071237564086914, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8653608560562134, + "num_tokens": 897267944.0, + "step": 23520 + }, + { + "epoch": 2.992112962727388, + "ewc_loss": 0.08297712355852127, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045135326217859983, + "grad_norm": 9.75485897064209, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8693318963050842, + "num_tokens": 897307330.0, + "step": 23521 + }, + { + "epoch": 2.992240173005979, + "ewc_loss": 0.0846451073884964, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004680330748669803, + "grad_norm": 10.083505630493164, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8663938641548157, + "num_tokens": 897344028.0, + "step": 23522 + }, + { + "epoch": 2.992367383284569, + "ewc_loss": 0.0828409194946289, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004499912611208856, + "grad_norm": 9.718940734863281, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8575588464736938, + "num_tokens": 897391213.0, + "step": 23523 + }, + { + "epoch": 2.99249459356316, + "ewc_loss": 0.08499618619680405, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047154389903880656, + "grad_norm": 10.242880821228027, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8595747351646423, + "num_tokens": 897428222.0, + "step": 23524 + }, + { + "epoch": 2.9926218038417502, + "ewc_loss": 0.08248218148946762, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004464038647711277, + "grad_norm": 9.64979076385498, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.852778434753418, + "num_tokens": 897457737.0, + "step": 23525 + }, + { + "epoch": 2.992749014120341, + "ewc_loss": 0.08530266582965851, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004746086779050529, + "grad_norm": 10.131598472595215, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8745542764663696, + "num_tokens": 897498925.0, + "step": 23526 + }, + { + "epoch": 2.9928762243989313, + "ewc_loss": 0.0827542245388031, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00044912428711540997, + "grad_norm": 9.695006370544434, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8607109785079956, + "num_tokens": 897533157.0, + "step": 23527 + }, + { + "epoch": 2.993003434677522, + "ewc_loss": 0.08504292368888855, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047201121924445033, + "grad_norm": 10.094030380249023, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8671660423278809, + "num_tokens": 897574159.0, + "step": 23528 + }, + { + "epoch": 2.9931306449561124, + "ewc_loss": 0.083023801445961, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045182008761912584, + "grad_norm": 9.686079978942871, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8568998575210571, + "num_tokens": 897606970.0, + "step": 23529 + }, + { + "epoch": 2.993257855234703, + "ewc_loss": 0.08512043207883835, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004727863415610045, + "grad_norm": 10.129707336425781, + "learning_rate": 1e-06, + "loss": 0.555, + "mean_token_accuracy": 0.84046471118927, + "num_tokens": 897647725.0, + "step": 23530 + }, + { + "epoch": 2.9933850655132934, + "ewc_loss": 0.08287795633077621, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045036160736344755, + "grad_norm": 9.640813827514648, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8678262233734131, + "num_tokens": 897687109.0, + "step": 23531 + }, + { + "epoch": 2.993512275791884, + "ewc_loss": 0.08522343635559082, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004738163552246988, + "grad_norm": 10.08315372467041, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8773096799850464, + "num_tokens": 897720994.0, + "step": 23532 + }, + { + "epoch": 2.9936394860704745, + "ewc_loss": 0.08312203735113144, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004528024001047015, + "grad_norm": 9.706006050109863, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8803709149360657, + "num_tokens": 897749855.0, + "step": 23533 + }, + { + "epoch": 2.993766696349065, + "ewc_loss": 0.08504427969455719, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004720248107332736, + "grad_norm": 10.167373657226562, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8620388507843018, + "num_tokens": 897792846.0, + "step": 23534 + }, + { + "epoch": 2.9938939066276555, + "ewc_loss": 0.08320441097021103, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045118475100025535, + "grad_norm": 9.647111892700195, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8683252334594727, + "num_tokens": 897836765.0, + "step": 23535 + }, + { + "epoch": 2.994021116906246, + "ewc_loss": 0.08514110743999481, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047299309517256916, + "grad_norm": 10.083342552185059, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8659543395042419, + "num_tokens": 897880918.0, + "step": 23536 + }, + { + "epoch": 2.9941483271848366, + "ewc_loss": 0.08308269828557968, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045240900362841785, + "grad_norm": 9.718213081359863, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8614426255226135, + "num_tokens": 897916676.0, + "step": 23537 + }, + { + "epoch": 2.994275537463427, + "ewc_loss": 0.0850120335817337, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00047170236939564347, + "grad_norm": 10.09110164642334, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8713929653167725, + "num_tokens": 897953968.0, + "step": 23538 + }, + { + "epoch": 2.9944027477420176, + "ewc_loss": 0.08320062607526779, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004535882908385247, + "grad_norm": 9.748835563659668, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8655969500541687, + "num_tokens": 897993055.0, + "step": 23539 + }, + { + "epoch": 2.994529958020608, + "ewc_loss": 0.08479494601488113, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046953148557804525, + "grad_norm": 10.055530548095703, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8742625713348389, + "num_tokens": 898030202.0, + "step": 23540 + }, + { + "epoch": 2.9946571682991987, + "ewc_loss": 0.08312533050775528, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045283534564077854, + "grad_norm": 9.763601303100586, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8600499629974365, + "num_tokens": 898067234.0, + "step": 23541 + }, + { + "epoch": 2.9947843785777892, + "ewc_loss": 0.08444783091545105, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004660603008233011, + "grad_norm": 10.020791053771973, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8693010807037354, + "num_tokens": 898105423.0, + "step": 23542 + }, + { + "epoch": 2.9949115888563798, + "ewc_loss": 0.08347707986831665, + "ewc_loss_diag": 3.814697265625e-05, + "ewc_loss_parallel": 0.00045391145977191627, + "grad_norm": 9.767867088317871, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8856033682823181, + "num_tokens": 898144550.0, + "step": 23543 + }, + { + "epoch": 2.99503879913497, + "ewc_loss": 0.08429598808288574, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004645419539883733, + "grad_norm": 10.042091369628906, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8691681623458862, + "num_tokens": 898179277.0, + "step": 23544 + }, + { + "epoch": 2.995166009413561, + "ewc_loss": 0.08313168585300446, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004528989375103265, + "grad_norm": 9.7459716796875, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8653199672698975, + "num_tokens": 898218518.0, + "step": 23545 + }, + { + "epoch": 2.995293219692151, + "ewc_loss": 0.0844501182436943, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004660832055378705, + "grad_norm": 9.971997261047363, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8540782928466797, + "num_tokens": 898259266.0, + "step": 23546 + }, + { + "epoch": 2.995420429970742, + "ewc_loss": 0.08322282880544662, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004538103239610791, + "grad_norm": 9.836448669433594, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8633631467819214, + "num_tokens": 898292648.0, + "step": 23547 + }, + { + "epoch": 2.995547640249332, + "ewc_loss": 0.08402375876903534, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004618196689989418, + "grad_norm": 9.953474998474121, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8678250312805176, + "num_tokens": 898328413.0, + "step": 23548 + }, + { + "epoch": 2.995674850527923, + "ewc_loss": 0.08335411548614502, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045512313954532146, + "grad_norm": 9.7253999710083, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8685324192047119, + "num_tokens": 898376963.0, + "step": 23549 + }, + { + "epoch": 2.995802060806513, + "ewc_loss": 0.08434136211872101, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046499562449753284, + "grad_norm": 9.981544494628906, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8772615194320679, + "num_tokens": 898407256.0, + "step": 23550 + }, + { + "epoch": 2.9959292710851035, + "ewc_loss": 0.08307565003633499, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045233851415105164, + "grad_norm": 9.791475296020508, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.868705153465271, + "num_tokens": 898448800.0, + "step": 23551 + }, + { + "epoch": 2.996056481363694, + "ewc_loss": 0.08429384231567383, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004645204753614962, + "grad_norm": 9.924524307250977, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8616920709609985, + "num_tokens": 898488912.0, + "step": 23552 + }, + { + "epoch": 2.9961836916422846, + "ewc_loss": 0.08339251577854156, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000455507222795859, + "grad_norm": 9.829815864562988, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8600425124168396, + "num_tokens": 898530434.0, + "step": 23553 + }, + { + "epoch": 2.996310901920875, + "ewc_loss": 0.08389798551797867, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004605618887580931, + "grad_norm": 9.943036079406738, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8662488460540771, + "num_tokens": 898565044.0, + "step": 23554 + }, + { + "epoch": 2.9964381121994657, + "ewc_loss": 0.0833774283528328, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004553563194349408, + "grad_norm": 9.790997505187988, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8767539858818054, + "num_tokens": 898601798.0, + "step": 23555 + }, + { + "epoch": 2.996565322478056, + "ewc_loss": 0.08400450646877289, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046162711805664003, + "grad_norm": 9.886791229248047, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8564953804016113, + "num_tokens": 898644789.0, + "step": 23556 + }, + { + "epoch": 2.9966925327566467, + "ewc_loss": 0.08350370824337006, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045661916374228895, + "grad_norm": 9.786065101623535, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.865151047706604, + "num_tokens": 898683928.0, + "step": 23557 + }, + { + "epoch": 2.9968197430352372, + "ewc_loss": 0.0840618759393692, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004622008418664336, + "grad_norm": 9.871882438659668, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8601465225219727, + "num_tokens": 898726217.0, + "step": 23558 + }, + { + "epoch": 2.9969469533138278, + "ewc_loss": 0.08358980715274811, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045748005504719913, + "grad_norm": 9.802652359008789, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8748266696929932, + "num_tokens": 898760575.0, + "step": 23559 + }, + { + "epoch": 2.9970741635924183, + "ewc_loss": 0.08399280160665512, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046151006245054305, + "grad_norm": 9.847111701965332, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8627023696899414, + "num_tokens": 898801232.0, + "step": 23560 + }, + { + "epoch": 2.997201373871009, + "ewc_loss": 0.08382347971200943, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045981683069840074, + "grad_norm": 9.850846290588379, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8729204535484314, + "num_tokens": 898837174.0, + "step": 23561 + }, + { + "epoch": 2.9973285841495994, + "ewc_loss": 0.08382519334554672, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045983397285453975, + "grad_norm": 9.875801086425781, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.85639488697052, + "num_tokens": 898875849.0, + "step": 23562 + }, + { + "epoch": 2.99745579442819, + "ewc_loss": 0.08375301212072372, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004591121687553823, + "grad_norm": 9.84937858581543, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8750914931297302, + "num_tokens": 898909975.0, + "step": 23563 + }, + { + "epoch": 2.9975830047067804, + "ewc_loss": 0.08371089398860931, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000458690949017182, + "grad_norm": 9.868083953857422, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8581209182739258, + "num_tokens": 898948250.0, + "step": 23564 + }, + { + "epoch": 2.997710214985371, + "ewc_loss": 0.08382327854633331, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004598148516379297, + "grad_norm": 9.855256080627441, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8826107382774353, + "num_tokens": 898989122.0, + "step": 23565 + }, + { + "epoch": 2.9978374252639615, + "ewc_loss": 0.08387792110443115, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046036121784709394, + "grad_norm": 9.861753463745117, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8619868755340576, + "num_tokens": 899028715.0, + "step": 23566 + }, + { + "epoch": 2.997964635542552, + "ewc_loss": 0.08371306955814362, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004587127477861941, + "grad_norm": 9.887998580932617, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8690569996833801, + "num_tokens": 899064101.0, + "step": 23567 + }, + { + "epoch": 2.9980918458211425, + "ewc_loss": 0.08376787602901459, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045926080201752484, + "grad_norm": 9.8652982711792, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8701145648956299, + "num_tokens": 899101759.0, + "step": 23568 + }, + { + "epoch": 2.9982190560997326, + "ewc_loss": 0.08379107713699341, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000459492759546265, + "grad_norm": 9.767732620239258, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.859165370464325, + "num_tokens": 899142273.0, + "step": 23569 + }, + { + "epoch": 2.9983462663783236, + "ewc_loss": 0.08411674946546555, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046274950727820396, + "grad_norm": 9.968728065490723, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8673182725906372, + "num_tokens": 899175878.0, + "step": 23570 + }, + { + "epoch": 2.9984734766569137, + "ewc_loss": 0.08343850821256638, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004559671215247363, + "grad_norm": 9.806026458740234, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8773629665374756, + "num_tokens": 899208937.0, + "step": 23571 + }, + { + "epoch": 2.9986006869355046, + "ewc_loss": 0.08412976562976837, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004628797178156674, + "grad_norm": 9.897317886352539, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8658461570739746, + "num_tokens": 899244088.0, + "step": 23572 + }, + { + "epoch": 2.9987278972140947, + "ewc_loss": 0.08331337571144104, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.000454715802334249, + "grad_norm": 9.833925247192383, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8616405725479126, + "num_tokens": 899280111.0, + "step": 23573 + }, + { + "epoch": 2.9988551074926852, + "ewc_loss": 0.08406873792409897, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004622694104909897, + "grad_norm": 9.952436447143555, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.865109920501709, + "num_tokens": 899310882.0, + "step": 23574 + }, + { + "epoch": 2.9989823177712758, + "ewc_loss": 0.08323375880718231, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004539195797406137, + "grad_norm": 9.754073143005371, + "learning_rate": 1e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8525093793869019, + "num_tokens": 899346820.0, + "step": 23575 + }, + { + "epoch": 2.9991095280498663, + "ewc_loss": 0.08409130573272705, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046249505248852074, + "grad_norm": 9.909123420715332, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8797506093978882, + "num_tokens": 899382044.0, + "step": 23576 + }, + { + "epoch": 2.999236738328457, + "ewc_loss": 0.08327138423919678, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045429583406075835, + "grad_norm": 9.715877532958984, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8661974668502808, + "num_tokens": 899418389.0, + "step": 23577 + }, + { + "epoch": 2.9993639486070474, + "ewc_loss": 0.08430510759353638, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004646331362891942, + "grad_norm": 9.940337181091309, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8634564876556396, + "num_tokens": 899458486.0, + "step": 23578 + }, + { + "epoch": 2.999491158885638, + "ewc_loss": 0.08308637142181396, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045244579087011516, + "grad_norm": 9.768289566040039, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.86408531665802, + "num_tokens": 899498168.0, + "step": 23579 + }, + { + "epoch": 2.9996183691642284, + "ewc_loss": 0.08409100770950317, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00046249208389781415, + "grad_norm": 9.901549339294434, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8813127875328064, + "num_tokens": 899538655.0, + "step": 23580 + }, + { + "epoch": 2.999745579442819, + "ewc_loss": 0.08333614468574524, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004549434524960816, + "grad_norm": 9.770834922790527, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8539049625396729, + "num_tokens": 899581693.0, + "step": 23581 + }, + { + "epoch": 2.9998727897214095, + "ewc_loss": 0.08380505442619324, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.0004596325452439487, + "grad_norm": 9.85854434967041, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8632458448410034, + "num_tokens": 899623817.0, + "step": 23582 + }, + { + "epoch": 3.0, + "ewc_loss": 0.08365151286125183, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045809714356437325, + "grad_norm": 9.853402137756348, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8693987131118774, + "num_tokens": 899664226.0, + "step": 23583 + }, + { + "epoch": 3.0, + "ewc_loss": 0.08365151286125183, + "ewc_loss_diag": 3.790855407714844e-05, + "ewc_loss_parallel": 0.00045809714356437325, + "step": 23583, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.4756091437999748, + "train_runtime": 47393.4997, + "train_samples_per_second": 7.961, + "train_steps_per_second": 0.498 + } + ], + "logging_steps": 1, + "max_steps": 23583, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 11792, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.62815163329864e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..a6b9161 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d9174c5b018094a932b8f141710aaa6872d476a0de37af20abe17a4774da5b2 +size 13393